[2/2] spark git commit: Preparing development version 2.0.0-SNAPSHOT
Preparing development version 2.0.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5450091 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5450091 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5450091 Branch: refs/heads/branch-2.0 Commit: b54500913d49b91949716b2c41bd5f637b1795a7 Parents: 8f5a04b Author: Patrick WendellAuthored: Tue May 17 18:15:51 2016 -0700 Committer: Patrick Wendell Committed: Tue May 17 18:15:51 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- sql/hivecontext-compatibility/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 958cb45..75ac926 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 2cb86ea..5444ae6 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 58d7879..e736436 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 3f8dfe4..1fd3af2 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 03db5b8..0bd 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e100851..14e94ec 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ 
org.apache.spark
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/2.0.0-preview [created] 8f5a04b62
[1/2] spark git commit: Preparing Spark release 2.0.0-preview
Repository: spark Updated Branches: refs/heads/branch-2.0 c8be3da66 -> b54500913 Preparing Spark release 2.0.0-preview Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f5a04b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f5a04b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f5a04b6 Branch: refs/heads/branch-2.0 Commit: 8f5a04b6299e3a47aca13cbb40e72344c0114860 Parents: c8be3da Author: Patrick WendellAuthored: Tue May 17 18:15:42 2016 -0700 Committer: Patrick Wendell Committed: Tue May 17 18:15:42 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- sql/hivecontext-compatibility/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 75ac926..958cb45 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 5444ae6..2cb86ea 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index e736436..58d7879 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 1fd3af2..3f8dfe4 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 0bd..03db5b8 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 14e94ec..e100851 100644 --- 
a/common/tags/pom.xml
spark git commit: Prepare branch for 2.0.0-preview.
Repository: spark Updated Branches: refs/heads/branch-2.0 5f5270ead -> c8be3da66 Prepare branch for 2.0.0-preview. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c8be3da6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c8be3da6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c8be3da6 Branch: refs/heads/branch-2.0 Commit: c8be3da66903899fcd743c425c25e32fc356d981 Parents: 5f5270e Author: Reynold XinAuthored: Tue May 17 18:07:59 2016 -0700 Committer: Reynold Xin Committed: Tue May 17 18:07:59 2016 -0700 -- core/src/main/scala/org/apache/spark/package.scala | 2 +- docs/_config.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c8be3da6/core/src/main/scala/org/apache/spark/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala index cc5e7ef..65a3dd2 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -43,5 +43,5 @@ package org.apache package object spark { // For package docs only - val SPARK_VERSION = "2.0.0-SNAPSHOT" + val SPARK_VERSION = "2.0.0-preview" } http://git-wip-us.apache.org/repos/asf/spark/blob/c8be3da6/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index 8bdc68a..c0a3be7 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.0.0-SNAPSHOT +SPARK_VERSION: 2.0.0-preview SPARK_VERSION_SHORT: 2.0.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.7" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
[SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable ## What changes were proposed in this pull request? Update the unit test code, examples, and documents to remove calls to deprecated method `dataset.registerTempTable`. ## How was this patch tested? This PR only changes the unit test code, examples, and comments. It should be safe. This is a follow up of PR https://github.com/apache/spark/pull/12945 which was merged. Author: Sean ZhongCloses #13098 from clockfly/spark-15171-remove-deprecation. (cherry picked from commit 25b315e6cad7c27b62dcaa2c194293c1115fdfb3) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5f5270ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5f5270ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5f5270ea Branch: refs/heads/branch-2.0 Commit: 5f5270ead86d5294af6c871e36112e2a833e9d64 Parents: 1db3741 Author: Sean Zhong Authored: Wed May 18 09:01:59 2016 +0800 Committer: Cheng Lian Committed: Wed May 18 09:05:34 2016 +0800 -- docs/sql-programming-guide.md | 48 - docs/streaming-programming-guide.md | 12 +-- .../apache/spark/examples/sql/JavaSparkSQL.java | 8 +- .../streaming/JavaSqlNetworkWordCount.java | 2 +- examples/src/main/python/sql.py | 2 +- .../python/streaming/sql_network_wordcount.py | 2 +- .../apache/spark/examples/sql/RDDRelation.scala | 6 +- .../spark/examples/sql/hive/HiveFromSpark.scala | 4 +- .../streaming/SqlNetworkWordCount.scala | 2 +- .../org/apache/spark/ml/JavaPipelineSuite.java | 2 +- .../JavaLogisticRegressionSuite.java| 10 +- .../regression/JavaLinearRegressionSuite.java | 4 +- python/pyspark/sql/context.py | 4 +- python/pyspark/sql/readwriter.py| 2 +- python/pyspark/sql/session.py | 2 +- python/pyspark/sql/tests.py | 25 ++--- .../scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../apache/spark/sql/JavaApplySchemaSuite.java | 8 +- .../spark/sql/sources/JavaSaveLoadSuite.java| 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 60 +-- .../spark/sql/ColumnExpressionSuite.scala | 2 +- .../spark/sql/DataFrameTimeWindowingSuite.scala | 2 +- .../apache/spark/sql/DataFrameWindowSuite.scala | 22 ++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 4 +- .../org/apache/spark/sql/ListTablesSuite.scala | 4 +- .../org/apache/spark/sql/SQLContextSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala| 103 ++- .../sql/ScalaReflectionRelationSuite.scala | 10 +- .../org/apache/spark/sql/SubquerySuite.scala| 8 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 12 +-- .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 10 +- .../benchmark/AggregateBenchmark.scala | 3 +- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +- .../columnar/PartitionBatchPruningSuite.scala | 2 +- .../execution/datasources/json/JsonSuite.scala | 58 +-- .../ParquetPartitionDiscoverySuite.scala| 10 +- .../datasources/parquet/ParquetQuerySuite.scala | 4 +- .../parquet/ParquetReadBenchmark.scala | 20 ++-- .../datasources/parquet/TPCDSBenchmark.scala| 2 +- .../sql/execution/metric/SQLMetricsSuite.scala | 8 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 2 +- .../sql/sources/CreateTableAsSelectSuite.scala | 2 +- .../apache/spark/sql/sources/InsertSuite.scala | 6 +- .../spark/sql/sources/SaveLoadSuite.scala | 4 +- .../spark/sql/streaming/StreamSuite.scala | 2 +- .../org/apache/spark/sql/test/SQLTestData.scala | 46 - .../spark/sql/hive/JavaDataFrameSuite.java | 2 +- 
.../sql/hive/JavaMetastoreDataSourcesSuite.java | 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 4 +- .../spark/sql/hive/HiveParquetSuite.scala | 4 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 8 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 12 +-- .../sql/hive/MetastoreDataSourcesSuite.scala| 8 +- .../hive/ParquetHiveCompatibilitySuite.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 2 +- .../apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../org/apache/spark/sql/hive/UDFSuite.scala| 2 +- .../hive/execution/AggregationQuerySuite.scala | 8 +- .../sql/hive/execution/HiveExplainSuite.scala | 2 +- .../execution/HiveOperatorQueryableSuite.scala | 4 +- .../spark/sql/hive/execution/HivePlanTest.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala
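The migration described above is mechanical: every call to the deprecated `Dataset.registerTempTable(name)` becomes `Dataset.createOrReplaceTempView(name)`, and the view remains queryable through SQL exactly as before. A minimal sketch of the change, assuming Spark 2.0 and a local `SparkSession` (the names here are illustrative, not taken from the patch):

```scala
import org.apache.spark.sql.SparkSession

object TempViewMigrationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TempViewMigrationSketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Before (deprecated in 2.0): df.registerTempTable("people")
    // After: register the DataFrame as a temporary view instead.
    df.createOrReplaceTempView("people")

    // SQL queries against the view work exactly as they did with the temp table.
    spark.sql("SELECT id, value FROM people WHERE id = 1").show()

    spark.stop()
  }
}
```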
[2/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
http://git-wip-us.apache.org/repos/asf/spark/blob/5f5270ea/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index d2e1ea1..2a5295d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -78,7 +78,7 @@ class PlannerSuite extends SharedSQLContext { val schema = StructType(fields) val row = Row.fromSeq(Seq.fill(fields.size)(null)) val rowRDD = sparkContext.parallelize(row :: Nil) -spark.createDataFrame(rowRDD, schema).registerTempTable("testLimit") +spark.createDataFrame(rowRDD, schema).createOrReplaceTempView("testLimit") val planned = sql( """ @@ -132,7 +132,7 @@ class PlannerSuite extends SharedSQLContext { test("InMemoryRelation statistics propagation") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "81920") { withTempTable("tiny") { -testData.limit(3).registerTempTable("tiny") +testData.limit(3).createOrReplaceTempView("tiny") sql("CACHE TABLE tiny") val a = testData.as("a") @@ -199,9 +199,9 @@ class PlannerSuite extends SharedSQLContext { test("PartitioningCollection") { withTempTable("normal", "small", "tiny") { - testData.registerTempTable("normal") - testData.limit(10).registerTempTable("small") - testData.limit(3).registerTempTable("tiny") + testData.createOrReplaceTempView("normal") + testData.limit(10).createOrReplaceTempView("small") + testData.limit(3).createOrReplaceTempView("tiny") // Disable broadcast join withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { http://git-wip-us.apache.org/repos/asf/spark/blob/5f5270ea/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index b31338e..bf3a39c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -134,7 +134,8 @@ class AggregateBenchmark extends BenchmarkBase { val N = 20 << 22 val benchmark = new Benchmark("Aggregate w keys", N) -sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k").registerTempTable("test") +sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k") + .createOrReplaceTempView("test") def f(): Unit = sparkSession.sql("select k, k, sum(id) from test group by k, k").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/5f5270ea/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 2099d4e..e2fb913 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -42,7 +42,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("default size avoids broadcast") { // TODO: Improve this test when we have better statistics sparkContext.parallelize(1 to 10).map(i => 
TestData(i, i.toString)) - .toDF().registerTempTable("sizeTst") + .toDF().createOrReplaceTempView("sizeTst") spark.catalog.cacheTable("sizeTst") assert( spark.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes > @@ -92,7 +92,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-2729 regression: timestamp data type") { val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time") -timestamps.registerTempTable("timestamps") +timestamps.createOrReplaceTempView("timestamps") checkAnswer( sql("SELECT time FROM timestamps"), @@ -133,7 +133,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { assert(df.schema.head.dataType === DecimalType(15, 10)) -df.cache().registerTempTable("test_fixed_decimal") +df.cache().createOrReplaceTempView("test_fixed_decimal") checkAnswer( sql("SELECT * FROM test_fixed_decimal"),
spark git commit: [SPARK-14346] Fix scala-2.10 build
Repository: spark Updated Branches: refs/heads/branch-2.0 2dddec40d -> 1db37417c [SPARK-14346] Fix scala-2.10 build ## What changes were proposed in this pull request? Scala 2.10 build was broken by #13079. I am reverting the change of that line. Author: Yin HuaiCloses #13157 from yhuai/SPARK-14346-fix-scala2.10. (cherry picked from commit 2a5db9c140b9d60a5ec91018be19bec7b80850ee) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1db37417 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1db37417 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1db37417 Branch: refs/heads/branch-2.0 Commit: 1db37417c25429c0001c19d2f10f4a314fe4585c Parents: 2dddec4 Author: Yin Huai Authored: Tue May 17 18:02:31 2016 -0700 Committer: Yin Huai Committed: Tue May 17 18:02:45 2016 -0700 -- .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1db37417/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d4f5cbb..3fdd411 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -208,6 +208,6 @@ case class SimpleCatalogRelation( } require( -metadata.identifier.database.contains(databaseName), +metadata.identifier.database == Some(databaseName), "provided database does not match the one specified in the table definition") } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14346] Fix scala-2.10 build
Repository: spark Updated Branches: refs/heads/master 25b315e6c -> 2a5db9c14 [SPARK-14346] Fix scala-2.10 build ## What changes were proposed in this pull request? Scala 2.10 build was broken by #13079. I am reverting the change of that line. Author: Yin HuaiCloses #13157 from yhuai/SPARK-14346-fix-scala2.10. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a5db9c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a5db9c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a5db9c1 Branch: refs/heads/master Commit: 2a5db9c140b9d60a5ec91018be19bec7b80850ee Parents: 25b315e Author: Yin Huai Authored: Tue May 17 18:02:31 2016 -0700 Committer: Yin Huai Committed: Tue May 17 18:02:31 2016 -0700 -- .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a5db9c1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d4f5cbb..3fdd411 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -208,6 +208,6 @@ case class SimpleCatalogRelation( } require( -metadata.identifier.database.contains(databaseName), +metadata.identifier.database == Some(databaseName), "provided database does not match the one specified in the table definition") } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
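The root cause here is a Scala standard-library difference: `Option.contains` was only introduced in Scala 2.11, so the equivalent `== Some(...)` comparison is the form that compiles on both 2.10 and 2.11. A small illustrative sketch of the equivalence (simplified types, not the actual catalog code):

```scala
object OptionContainsSketch extends App {
  // Simplified stand-in for the table identifier used in the catalog code.
  case class TableId(database: Option[String], table: String)

  def checkDatabase(id: TableId, expected: String): Unit = {
    // Compiles on Scala 2.10 and 2.11 alike, which is why the patch reverts to it.
    require(id.database == Some(expected),
      "provided database does not match the one specified in the table definition")
    // The equivalent `id.database.contains(expected)` only exists from Scala 2.11 on,
    // which is what broke the 2.10 build.
  }

  checkDatabase(TableId(Some("default"), "people"), "default")
}
```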
[2/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index d2e1ea1..2a5295d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -78,7 +78,7 @@ class PlannerSuite extends SharedSQLContext { val schema = StructType(fields) val row = Row.fromSeq(Seq.fill(fields.size)(null)) val rowRDD = sparkContext.parallelize(row :: Nil) -spark.createDataFrame(rowRDD, schema).registerTempTable("testLimit") +spark.createDataFrame(rowRDD, schema).createOrReplaceTempView("testLimit") val planned = sql( """ @@ -132,7 +132,7 @@ class PlannerSuite extends SharedSQLContext { test("InMemoryRelation statistics propagation") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "81920") { withTempTable("tiny") { -testData.limit(3).registerTempTable("tiny") +testData.limit(3).createOrReplaceTempView("tiny") sql("CACHE TABLE tiny") val a = testData.as("a") @@ -199,9 +199,9 @@ class PlannerSuite extends SharedSQLContext { test("PartitioningCollection") { withTempTable("normal", "small", "tiny") { - testData.registerTempTable("normal") - testData.limit(10).registerTempTable("small") - testData.limit(3).registerTempTable("tiny") + testData.createOrReplaceTempView("normal") + testData.limit(10).createOrReplaceTempView("small") + testData.limit(3).createOrReplaceTempView("tiny") // Disable broadcast join withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index b31338e..bf3a39c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -134,7 +134,8 @@ class AggregateBenchmark extends BenchmarkBase { val N = 20 << 22 val benchmark = new Benchmark("Aggregate w keys", N) -sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k").registerTempTable("test") +sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k") + .createOrReplaceTempView("test") def f(): Unit = sparkSession.sql("select k, k, sum(id) from test group by k, k").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 2099d4e..e2fb913 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -42,7 +42,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("default size avoids broadcast") { // TODO: Improve this test when we have better statistics sparkContext.parallelize(1 to 10).map(i => 
TestData(i, i.toString)) - .toDF().registerTempTable("sizeTst") + .toDF().createOrReplaceTempView("sizeTst") spark.catalog.cacheTable("sizeTst") assert( spark.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes > @@ -92,7 +92,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-2729 regression: timestamp data type") { val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time") -timestamps.registerTempTable("timestamps") +timestamps.createOrReplaceTempView("timestamps") checkAnswer( sql("SELECT time FROM timestamps"), @@ -133,7 +133,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { assert(df.schema.head.dataType === DecimalType(15, 10)) -df.cache().registerTempTable("test_fixed_decimal") +df.cache().createOrReplaceTempView("test_fixed_decimal") checkAnswer( sql("SELECT * FROM test_fixed_decimal"),
[3/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
[SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable ## What changes were proposed in this pull request? Update the unit test code, examples, and documents to remove calls to deprecated method `dataset.registerTempTable`. ## How was this patch tested? This PR only changes the unit test code, examples, and comments. It should be safe. This is a follow up of PR https://github.com/apache/spark/pull/12945 which was merged. Author: Sean ZhongCloses #13098 from clockfly/spark-15171-remove-deprecation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/25b315e6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/25b315e6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/25b315e6 Branch: refs/heads/master Commit: 25b315e6cad7c27b62dcaa2c194293c1115fdfb3 Parents: b674e67 Author: Sean Zhong Authored: Wed May 18 09:01:59 2016 +0800 Committer: Cheng Lian Committed: Wed May 18 09:01:59 2016 +0800 -- docs/sql-programming-guide.md | 48 - docs/streaming-programming-guide.md | 12 +-- .../apache/spark/examples/sql/JavaSparkSQL.java | 8 +- .../streaming/JavaSqlNetworkWordCount.java | 2 +- examples/src/main/python/sql.py | 2 +- .../python/streaming/sql_network_wordcount.py | 2 +- .../apache/spark/examples/sql/RDDRelation.scala | 6 +- .../spark/examples/sql/hive/HiveFromSpark.scala | 4 +- .../streaming/SqlNetworkWordCount.scala | 2 +- .../org/apache/spark/ml/JavaPipelineSuite.java | 2 +- .../JavaLogisticRegressionSuite.java| 10 +- .../regression/JavaLinearRegressionSuite.java | 4 +- python/pyspark/sql/context.py | 4 +- python/pyspark/sql/readwriter.py| 2 +- python/pyspark/sql/session.py | 2 +- python/pyspark/sql/tests.py | 25 ++--- .../scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../apache/spark/sql/JavaApplySchemaSuite.java | 8 +- .../spark/sql/sources/JavaSaveLoadSuite.java| 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 60 +-- .../spark/sql/ColumnExpressionSuite.scala | 2 +- .../spark/sql/DataFrameTimeWindowingSuite.scala | 2 +- .../apache/spark/sql/DataFrameWindowSuite.scala | 22 ++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 4 +- .../org/apache/spark/sql/ListTablesSuite.scala | 4 +- .../org/apache/spark/sql/SQLContextSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala| 103 ++- .../sql/ScalaReflectionRelationSuite.scala | 10 +- .../org/apache/spark/sql/SubquerySuite.scala| 8 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 12 +-- .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 10 +- .../benchmark/AggregateBenchmark.scala | 3 +- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +- .../columnar/PartitionBatchPruningSuite.scala | 2 +- .../execution/datasources/json/JsonSuite.scala | 58 +-- .../ParquetPartitionDiscoverySuite.scala| 10 +- .../datasources/parquet/ParquetQuerySuite.scala | 4 +- .../parquet/ParquetReadBenchmark.scala | 20 ++-- .../datasources/parquet/TPCDSBenchmark.scala| 2 +- .../sql/execution/metric/SQLMetricsSuite.scala | 8 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 2 +- .../sql/sources/CreateTableAsSelectSuite.scala | 2 +- .../apache/spark/sql/sources/InsertSuite.scala | 6 +- .../spark/sql/sources/SaveLoadSuite.scala | 4 +- .../spark/sql/streaming/StreamSuite.scala | 2 +- .../org/apache/spark/sql/test/SQLTestData.scala | 46 - .../spark/sql/hive/JavaDataFrameSuite.java | 2 +- .../sql/hive/JavaMetastoreDataSourcesSuite.java | 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 4 +- 
.../spark/sql/hive/HiveParquetSuite.scala | 4 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 8 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 12 +-- .../sql/hive/MetastoreDataSourcesSuite.scala| 8 +- .../hive/ParquetHiveCompatibilitySuite.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 2 +- .../apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../org/apache/spark/sql/hive/UDFSuite.scala| 2 +- .../hive/execution/AggregationQuerySuite.scala | 8 +- .../sql/hive/execution/HiveExplainSuite.scala | 2 +- .../execution/HiveOperatorQueryableSuite.scala | 4 +- .../spark/sql/hive/execution/HivePlanTest.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala | 18 ++-- .../hive/execution/HiveResolutionSuite.scala| 10 +- .../sql/hive/execution/HiveTableScanSuite.scala |
[1/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
Repository: spark Updated Branches: refs/heads/master b674e67c2 -> 25b315e6c http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ac9a393..81f3ea8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -102,14 +102,14 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-6835: udtf in lateral view") { val df = Seq((1, 1)).toDF("c1", "c2") -df.registerTempTable("table1") +df.createOrReplaceTempView("table1") val query = sql("SELECT c1, v FROM table1 LATERAL VIEW stack(3, 1, c1 + 1, c1 + 2) d AS v") checkAnswer(query, Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: Nil) } test("SPARK-13651: generator outputs shouldn't be resolved from its child's output") { withTempTable("src") { - Seq(("id1", "value1")).toDF("key", "value").registerTempTable("src") + Seq(("id1", "value1")).toDF("key", "value").createOrReplaceTempView("src") val query = sql("SELECT genoutput.* FROM src " + "LATERAL VIEW explode(map('key1', 100, 'key2', 200)) genoutput AS key, value") @@ -135,8 +135,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { Order(1, "Atlas", "MTB", 434, "2015-01-07", "John D", "Pacifica", "CA", 20151), Order(11, "Swift", "YFlikr", 137, "2015-01-23", "John D", "Hayward", "CA", 20151)) -orders.toDF.registerTempTable("orders1") -orderUpdates.toDF.registerTempTable("orderupdates1") +orders.toDF.createOrReplaceTempView("orders1") +orderUpdates.toDF.createOrReplaceTempView("orderupdates1") sql( """CREATE TABLE orders( @@ -305,7 +305,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-5371: union with null and sum") { val df = Seq((1, 1)).toDF("c1", "c2") -df.registerTempTable("table1") +df.createOrReplaceTempView("table1") val query = sql( """ @@ -329,7 +329,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("CTAS with WITH clause") { val df = Seq((1, 1)).toDF("c1", "c2") -df.registerTempTable("table1") +df.createOrReplaceTempView("table1") sql( """ @@ -346,7 +346,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("explode nested Field") { -Seq(NestedArray1(NestedArray2(Seq(1, 2, 3.toDF.registerTempTable("nestedArray") +Seq(NestedArray1(NestedArray2(Seq(1, 2, 3.toDF.createOrReplaceTempView("nestedArray") checkAnswer( sql("SELECT ints FROM nestedArray LATERAL VIEW explode(a.b) a AS ints"), Row(1) :: Row(2) :: Row(3) :: Nil) @@ -543,7 +543,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("specifying the column list for CTAS") { -Seq((1, "11"), (2, "22")).toDF("key", "value").registerTempTable("mytable1") +Seq((1, "11"), (2, "22")).toDF("key", "value").createOrReplaceTempView("mytable1") sql("create table gen__tmp(a int, b string) as select key, value from mytable1") checkAnswer( @@ -598,7 +598,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("double nested data") { sparkContext.parallelize(Nested1(Nested2(Nested3(1))) :: Nil) - .toDF().registerTempTable("nested") + .toDF().createOrReplaceTempView("nested") checkAnswer( sql("SELECT f1.f2.f3 
FROM nested"), Row(1)) @@ -682,7 +682,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-4963 DataFrame sample on mutable row return wrong result") { sql("SELECT * FROM src WHERE key % 2 = 0") .sample(withReplacement = false, fraction = 0.3) - .registerTempTable("sampled") + .createOrReplaceTempView("sampled") (1 to 10).foreach { i => checkAnswer( sql("SELECT * FROM sampled WHERE key % 2 = 1"), @@ -707,7 +707,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { val rowRdd = sparkContext.parallelize(row :: Nil) -hiveContext.createDataFrame(rowRdd, schema).registerTempTable("testTable") +hiveContext.createDataFrame(rowRdd, schema).createOrReplaceTempView("testTable") sql( """CREATE TABLE nullValuesInInnerComplexTypes @@ -733,14 +733,14 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-4296 Grouping field with Hive
spark git commit: [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views
Repository: spark Updated Branches: refs/heads/branch-2.0 7b62b7c11 -> 2dddec40d [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views ## What changes were proposed in this pull request? This is a follow-up of #12781. It adds native `SHOW CREATE TABLE` support for Hive tables and views. A new field `hasUnsupportedFeatures` is added to `CatalogTable` to indicate whether all table metadata retrieved from the concrete underlying external catalog (i.e. Hive metastore in this case) can be mapped to fields in `CatalogTable`. This flag is useful when the target Hive table contains structures that can't be handled by Spark SQL, e.g., skewed columns and storage handler, etc.. ## How was this patch tested? New test cases are added in `ShowCreateTableSuite` to do round-trip tests. Author: Cheng LianCloses #13079 from liancheng/spark-14346-show-create-table-for-hive-tables. (cherry picked from commit b674e67c22bf663334e537e35787c00533adbb04) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2dddec40 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2dddec40 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2dddec40 Branch: refs/heads/branch-2.0 Commit: 2dddec40d6562d1d16bb242bf7dc730431ee1e3e Parents: 7b62b7c Author: Cheng Lian Authored: Tue May 17 15:56:44 2016 -0700 Committer: Yin Huai Committed: Tue May 17 15:56:57 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 12 +- .../spark/sql/execution/command/tables.scala| 184 +- .../spark/sql/hive/client/HiveClientImpl.scala | 10 +- .../spark/sql/hive/ShowCreateTableSuite.scala | 185 ++- 4 files changed, 333 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2dddec40/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d215655..d4f5cbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -79,6 +79,12 @@ case class CatalogTablePartition( * * Note that Hive's metastore also tracks skewed columns. We should consider adding that in the * future once we have a better understanding of how we want to handle skewed columns. + * + * @param hasUnsupportedFeatures is used to indicate whether all table metadata entries retrieved + *from the concrete underlying external catalog (e.g. Hive metastore) are supported by + *Spark SQL. For example, if the underlying Hive table has skewed columns, this information + *can't be mapped to [[CatalogTable]] since Spark SQL doesn't handle skewed columns for now. + *In this case `hasUnsupportedFeatures` is set to true. By default, it is false. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -95,7 +101,8 @@ case class CatalogTable( properties: Map[String, String] = Map.empty, viewOriginalText: Option[String] = None, viewText: Option[String] = None, -comment: Option[String] = None) { +comment: Option[String] = None, +hasUnsupportedFeatures: Boolean = false) { // Verify that the provided columns are part of the schema private val colNames = schema.map(_.name).toSet @@ -200,6 +207,7 @@ case class SimpleCatalogRelation( } } - require(metadata.identifier.database == Some(databaseName), + require( +metadata.identifier.database.contains(databaseName), "provided database does not match the one specified in the table definition") } http://git-wip-us.apache.org/repos/asf/spark/blob/2dddec40/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bb4f1ff..1fc02d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -626,40 +626,149 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { showCreateDataSourceTable(tableMetadata) } else { - throw new UnsupportedOperationException( -
spark git commit: [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views
Repository: spark Updated Branches: refs/heads/master 8e8bc9f95 -> b674e67c2 [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views ## What changes were proposed in this pull request? This is a follow-up of #12781. It adds native `SHOW CREATE TABLE` support for Hive tables and views. A new field `hasUnsupportedFeatures` is added to `CatalogTable` to indicate whether all table metadata retrieved from the concrete underlying external catalog (i.e. Hive metastore in this case) can be mapped to fields in `CatalogTable`. This flag is useful when the target Hive table contains structures that can't be handled by Spark SQL, e.g., skewed columns and storage handler, etc.. ## How was this patch tested? New test cases are added in `ShowCreateTableSuite` to do round-trip tests. Author: Cheng LianCloses #13079 from liancheng/spark-14346-show-create-table-for-hive-tables. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b674e67c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b674e67c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b674e67c Branch: refs/heads/master Commit: b674e67c22bf663334e537e35787c00533adbb04 Parents: 8e8bc9f Author: Cheng Lian Authored: Tue May 17 15:56:44 2016 -0700 Committer: Yin Huai Committed: Tue May 17 15:56:44 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 12 +- .../spark/sql/execution/command/tables.scala| 184 +- .../spark/sql/hive/client/HiveClientImpl.scala | 10 +- .../spark/sql/hive/ShowCreateTableSuite.scala | 185 ++- 4 files changed, 333 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b674e67c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d215655..d4f5cbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -79,6 +79,12 @@ case class CatalogTablePartition( * * Note that Hive's metastore also tracks skewed columns. We should consider adding that in the * future once we have a better understanding of how we want to handle skewed columns. + * + * @param hasUnsupportedFeatures is used to indicate whether all table metadata entries retrieved + *from the concrete underlying external catalog (e.g. Hive metastore) are supported by + *Spark SQL. For example, if the underlying Hive table has skewed columns, this information + *can't be mapped to [[CatalogTable]] since Spark SQL doesn't handle skewed columns for now. + *In this case `hasUnsupportedFeatures` is set to true. By default, it is false. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -95,7 +101,8 @@ case class CatalogTable( properties: Map[String, String] = Map.empty, viewOriginalText: Option[String] = None, viewText: Option[String] = None, -comment: Option[String] = None) { +comment: Option[String] = None, +hasUnsupportedFeatures: Boolean = false) { // Verify that the provided columns are part of the schema private val colNames = schema.map(_.name).toSet @@ -200,6 +207,7 @@ case class SimpleCatalogRelation( } } - require(metadata.identifier.database == Some(databaseName), + require( +metadata.identifier.database.contains(databaseName), "provided database does not match the one specified in the table definition") } http://git-wip-us.apache.org/repos/asf/spark/blob/b674e67c/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bb4f1ff..1fc02d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -626,40 +626,149 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { showCreateDataSourceTable(tableMetadata) } else { - throw new UnsupportedOperationException( -"SHOW CREATE TABLE only supports Spark SQL data source tables.") + showCreateHiveTable(tableMetadata) }
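The `hasUnsupportedFeatures` flag described above lets a command fail with a clear message instead of emitting DDL that Spark SQL could not round-trip. A simplified sketch of how a consumer of such a flag might branch on it (the names are invented for illustration; this is not the actual `ShowCreateTableCommand` code):

```scala
// Simplified model: a catalog entry records whether all of its Hive metadata
// could be mapped into Spark SQL's table representation.
case class TableSummary(
    name: String,
    createStatement: String,
    hasUnsupportedFeatures: Boolean)

object ShowCreateTableSketch {
  def showCreateTable(table: TableSummary): String = {
    if (table.hasUnsupportedFeatures) {
      // e.g. skewed columns or a storage handler that the catalog model cannot express
      throw new UnsupportedOperationException(
        s"Failed to build SHOW CREATE TABLE output for ${table.name}: " +
          "the table uses features that cannot be represented")
    } else {
      table.createStatement
    }
  }

  def main(args: Array[String]): Unit = {
    val ok = TableSummary("t1", "CREATE TABLE t1 (id INT)", hasUnsupportedFeatures = false)
    println(showCreateTable(ok))
  }
}
```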
spark git commit: [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped
Repository: spark Updated Branches: refs/heads/branch-2.0 c0bb77132 -> 7b62b7c11 [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped ## What changes were proposed in this pull request? Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped. ## How was this patch tested? Jenkins unit tests. Author: Shixiong ZhuCloses #13154 from zsxwing/check-spark-context-stop. (cherry picked from commit 8e8bc9f957de6c0aefbc6ef4b18c421b486477a6) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7b62b7c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7b62b7c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7b62b7c1 Branch: refs/heads/branch-2.0 Commit: 7b62b7c1180dc3414b7d7e07561d0e6b89ff2b37 Parents: c0bb771 Author: Shixiong Zhu Authored: Tue May 17 14:57:21 2016 -0700 Committer: Yin Huai Committed: Tue May 17 14:57:31 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 ++ sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7b62b7c1/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e391599..e6cdd0d 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -94,7 +94,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) - private def assertNotStopped(): Unit = { + private[spark] def assertNotStopped(): Unit = { if (stopped.get()) { val activeContext = SparkContext.activeContext.get() val activeCreationSite = http://git-wip-us.apache.org/repos/asf/spark/blob/7b62b7c1/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index c64e284..4451188 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -67,6 +67,8 @@ class SQLContext private[sql]( self => + sparkSession.sparkContext.assertNotStopped() + // Note: Since Spark 2.0 this class has become a wrapper of SparkSession, where the // real functionality resides. This class remains mainly for backward compatibility. http://git-wip-us.apache.org/repos/asf/spark/blob/7b62b7c1/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 100b43f..aa974f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -68,6 +68,7 @@ class SparkSession private( this(sc, None) } + sparkContext.assertNotStopped() /* --- * | Session-related state | - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped
Repository: spark Updated Branches: refs/heads/master 0f576a574 -> 8e8bc9f95 [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped ## What changes were proposed in this pull request? Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped. ## How was this patch tested? Jenkins unit tests. Author: Shixiong ZhuCloses #13154 from zsxwing/check-spark-context-stop. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e8bc9f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e8bc9f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e8bc9f9 Branch: refs/heads/master Commit: 8e8bc9f957de6c0aefbc6ef4b18c421b486477a6 Parents: 0f576a5 Author: Shixiong Zhu Authored: Tue May 17 14:57:21 2016 -0700 Committer: Yin Huai Committed: Tue May 17 14:57:21 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 ++ sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e8bc9f9/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e391599..e6cdd0d 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -94,7 +94,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) - private def assertNotStopped(): Unit = { + private[spark] def assertNotStopped(): Unit = { if (stopped.get()) { val activeContext = SparkContext.activeContext.get() val activeCreationSite = http://git-wip-us.apache.org/repos/asf/spark/blob/8e8bc9f9/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index c64e284..4451188 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -67,6 +67,8 @@ class SQLContext private[sql]( self => + sparkSession.sparkContext.assertNotStopped() + // Note: Since Spark 2.0 this class has become a wrapper of SparkSession, where the // real functionality resides. This class remains mainly for backward compatibility. http://git-wip-us.apache.org/repos/asf/spark/blob/8e8bc9f9/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 3016437..da575c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -68,6 +68,7 @@ class SparkSession private( this(sc, None) } + sparkContext.assertNotStopped() /* --- * | Session-related state | - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
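The pattern added above is a plain fail-fast guard: the session constructor checks the context's stopped flag so that building a session over a stopped context fails immediately rather than on the first query. A self-contained sketch of the idea with simplified stand-in classes (not the real Spark types):

```scala
import java.util.concurrent.atomic.AtomicBoolean

// Simplified stand-ins for SparkContext and SQLContext/SparkSession.
class MiniContext {
  private val stopped = new AtomicBoolean(false)
  def stop(): Unit = stopped.set(true)
  def assertNotStopped(): Unit =
    if (stopped.get()) throw new IllegalStateException("Cannot use a stopped context")
}

class MiniSession(val context: MiniContext) {
  // Constructor-time check: creating a session over a stopped context fails here,
  // instead of failing later with a confusing error on the first query.
  context.assertNotStopped()
}

object FailFastSketch extends App {
  val ctx = new MiniContext
  ctx.stop()
  try new MiniSession(ctx)
  catch { case e: IllegalStateException => println(s"Rejected as expected: ${e.getMessage}") }
}
```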
spark git commit: [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent.
Repository: spark Updated Branches: refs/heads/master e2efe0529 -> 0f576a574 [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent. ## What changes were proposed in this pull request? **createDataFrame** returns inconsistent types for column names. ```python >>> from pyspark.sql.types import StructType, StructField, StringType >>> schema = StructType([StructField(u"col", StringType())]) >>> df1 = spark.createDataFrame([("a",)], schema) >>> df1.columns # "col" is str ['col'] >>> df2 = spark.createDataFrame([("a",)], [u"col"]) >>> df2.columns # "col" is unicode [u'col'] ``` The reason is only **StructField** has the following code. ``` if not isinstance(name, str): name = name.encode('utf-8') ``` This PR adds the same logic into **createDataFrame** for consistency. ``` if isinstance(schema, list): schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] ``` ## How was this patch tested? Pass the Jenkins test (with new python doctest) Author: Dongjoon HyunCloses #13097 from dongjoon-hyun/SPARK-15244. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f576a57 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f576a57 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f576a57 Branch: refs/heads/master Commit: 0f576a5748244f7e874b925f8d841f1ca238f087 Parents: e2efe05 Author: Dongjoon Hyun Authored: Tue May 17 13:05:07 2016 -0700 Committer: Davies Liu Committed: Tue May 17 13:05:07 2016 -0700 -- python/pyspark/sql/session.py | 2 ++ python/pyspark/sql/tests.py | 7 +++ 2 files changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f576a57/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index ae31435..0781b44 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -465,6 +465,8 @@ class SparkSession(object): return (obj, ) schema = StructType().add("value", datatype) else: +if isinstance(schema, list): +schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] prepare = lambda obj: obj if isinstance(data, RDD): http://git-wip-us.apache.org/repos/asf/spark/blob/0f576a57/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0c73f58..0977c43 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -228,6 +228,13 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) +def test_column_name_encoding(self): +"""Ensure that created columns has `str` type consistently.""" +columns = self.spark.createDataFrame([('Alice', 1)], ['name', u'age']).columns +self.assertEqual(columns, ['name', 'age']) +self.assertTrue(isinstance(columns[0], str)) +self.assertTrue(isinstance(columns[1], str)) + def test_explode(self): from pyspark.sql.functions import explode d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent.
Repository: spark Updated Branches: refs/heads/branch-2.0 ff1cfce18 -> c0bb77132 [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent. ## What changes were proposed in this pull request? **createDataFrame** returns inconsistent types for column names. ```python >>> from pyspark.sql.types import StructType, StructField, StringType >>> schema = StructType([StructField(u"col", StringType())]) >>> df1 = spark.createDataFrame([("a",)], schema) >>> df1.columns # "col" is str ['col'] >>> df2 = spark.createDataFrame([("a",)], [u"col"]) >>> df2.columns # "col" is unicode [u'col'] ``` The reason is only **StructField** has the following code. ``` if not isinstance(name, str): name = name.encode('utf-8') ``` This PR adds the same logic into **createDataFrame** for consistency. ``` if isinstance(schema, list): schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] ``` ## How was this patch tested? Pass the Jenkins test (with new python doctest) Author: Dongjoon HyunCloses #13097 from dongjoon-hyun/SPARK-15244. (cherry picked from commit 0f576a5748244f7e874b925f8d841f1ca238f087) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0bb7713 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0bb7713 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0bb7713 Branch: refs/heads/branch-2.0 Commit: c0bb77132b9acac951074fd623892abafeb02512 Parents: ff1cfce Author: Dongjoon Hyun Authored: Tue May 17 13:05:07 2016 -0700 Committer: Davies Liu Committed: Tue May 17 13:05:17 2016 -0700 -- python/pyspark/sql/session.py | 2 ++ python/pyspark/sql/tests.py | 7 +++ 2 files changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0bb7713/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index ae31435..0781b44 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -465,6 +465,8 @@ class SparkSession(object): return (obj, ) schema = StructType().add("value", datatype) else: +if isinstance(schema, list): +schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] prepare = lambda obj: obj if isinstance(data, RDD): http://git-wip-us.apache.org/repos/asf/spark/blob/c0bb7713/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0c73f58..0977c43 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -228,6 +228,13 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) +def test_column_name_encoding(self): +"""Ensure that created columns has `str` type consistently.""" +columns = self.spark.createDataFrame([('Alice', 1)], ['name', u'age']).columns +self.assertEqual(columns, ['name', 'age']) +self.assertTrue(isinstance(columns[0], str)) +self.assertTrue(isinstance(columns[1], str)) + def test_explode(self): from pyspark.sql.functions import explode d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 626e97e..9d084b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -21,11 +21,14 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import org.apache.spark.mllib.linalg.VectorImplicits._ +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} @@ -93,7 +96,9 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM @Since("2.0.0") override def fit(dataset: Dataset[_]): StandardScalerModel = { transformSchema(dataset.schema, logging = true) -val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } +val input: RDD[OldVector] = dataset.select($(inputCol)).rdd.map { + case Row(v: Vector) => OldVectors.fromML(v) +} val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this)) @@ -145,7 +150,11 @@ class StandardScalerModel private[ml] ( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val scaler = new feature.StandardScalerModel(std, mean, $(withStd), $(withMean)) -val scale = udf { scaler.transform _ } + +// TODO: Make the transformer natively in ml framework to avoid extra conversion. 
+val transformer: Vector => Vector = v => scaler.transform(OldVectors.fromML(v)).asML + +val scale = udf(transformer) dataset.withColumn($(outputCol), scale(col($(inputCol } http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 4d3e46e..1bc2420 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -23,10 +23,10 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute, UnresolvedAttribute} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 68b699d..2bc9d22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -27,10 +27,10 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StructField, StructType}
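The StandardScaler hunk above shows the recurring pattern in this migration: the ml-side estimator now receives the new `org.apache.spark.ml.linalg.Vector`, converts it to the old mllib type with `OldVectors.fromML` so it can reuse the existing mllib implementation, and converts the result back with `.asML`. Below is a minimal standalone sketch of that round trip, assuming only Spark 2.0+ on the classpath; the object name and values are illustrative and not part of the patch.

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}

object VectorBridgeSketch {
  def main(args: Array[String]): Unit = {
    val newVec: Vector = Vectors.dense(1.0, 2.0, 3.0)

    // New ml.linalg.Vector -> old mllib.linalg.Vector, e.g. before calling an mllib estimator.
    val oldVec: OldVector = OldVectors.fromML(newVec)

    // Old mllib.linalg.Vector -> new ml.linalg.Vector, e.g. when handing results back to ml code.
    val backToNew: Vector = oldVec.asML

    println(oldVec)     // [1.0,2.0,3.0]
    println(backToNew)  // [1.0,2.0,3.0]
  }
}
```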
[4/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
[SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms ## What changes were proposed in this pull request? Once SPARK-14487 and SPARK-14549 are merged, we will migrate to use the new vector and matrix type in the new ml pipeline based apis. ## How was this patch tested? Unit tests Author: DB TsaiAuthor: Liang-Chi Hsieh Author: Xiangrui Meng Closes #12627 from dbtsai/SPARK-14615-NewML. (cherry picked from commit e2efe0529acd748f26dbaa41331d1733ed256237) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff1cfce1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff1cfce1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff1cfce1 Branch: refs/heads/branch-2.0 Commit: ff1cfce18829ccb176f27d4fcc242cbb341a2277 Parents: 1ad3bbd Author: DB Tsai Authored: Tue May 17 12:51:07 2016 -0700 Committer: Xiangrui Meng Committed: Tue May 17 12:51:41 2016 -0700 -- dev/sparktestsupport/modules.py | 1 + .../examples/ml/JavaBisectingKMeansExample.java | 2 +- .../examples/ml/JavaDeveloperApiExample.java| 8 +- .../ml/JavaElementwiseProductExample.java | 6 +- .../spark/examples/ml/JavaKMeansExample.java| 2 +- .../ml/AFTSurvivalRegressionExample.scala | 2 +- .../examples/ml/ChiSqSelectorExample.scala | 2 +- .../apache/spark/examples/ml/DCTExample.scala | 2 +- .../spark/examples/ml/DataFrameExample.scala| 3 +- .../spark/examples/ml/DecisionTreeExample.scala | 2 +- .../spark/examples/ml/DeveloperApiExample.scala | 4 +- .../examples/ml/ElementwiseProductExample.scala | 2 +- .../ml/EstimatorTransformerParamExample.scala | 2 +- ...odelSelectionViaCrossValidationExample.scala | 2 +- .../apache/spark/examples/ml/PCAExample.scala | 2 +- .../spark/examples/ml/PipelineExample.scala | 2 +- .../ml/PolynomialExpansionExample.scala | 2 +- .../spark/examples/ml/SimpleParamsExample.scala | 4 +- .../ml/SimpleTextClassificationPipeline.scala | 2 +- .../examples/ml/VectorAssemblerExample.scala| 2 +- .../spark/examples/ml/VectorSlicerExample.scala | 2 +- .../scala/org/apache/spark/ml/Predictor.scala | 4 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 22 ++- .../spark/ml/attribute/AttributeGroup.scala | 2 +- .../spark/ml/classification/Classifier.scala| 4 +- .../classification/DecisionTreeClassifier.scala | 4 +- .../spark/ml/classification/GBTClassifier.scala | 4 +- .../ml/classification/LogisticRegression.scala | 5 +- .../MultilayerPerceptronClassifier.scala| 4 +- .../spark/ml/classification/NaiveBayes.scala| 9 +- .../spark/ml/classification/OneVsRest.scala | 2 +- .../ProbabilisticClassifier.scala | 2 +- .../classification/RandomForestClassifier.scala | 4 +- .../spark/ml/clustering/BisectingKMeans.scala | 16 +- .../spark/ml/clustering/GaussianMixture.scala | 9 +- .../org/apache/spark/ml/clustering/KMeans.scala | 18 +- .../org/apache/spark/ml/clustering/LDA.scala| 18 +- .../BinaryClassificationEvaluator.scala | 2 +- .../org/apache/spark/ml/feature/Binarizer.scala | 2 +- .../apache/spark/ml/feature/ChiSqSelector.scala | 21 +- .../spark/ml/feature/CountVectorizer.scala | 2 +- .../scala/org/apache/spark/ml/feature/DCT.scala | 2 +- .../spark/ml/feature/ElementwiseProduct.scala | 6 +- .../org/apache/spark/ml/feature/HashingTF.scala | 3 +- .../scala/org/apache/spark/ml/feature/IDF.scala | 15 +- .../org/apache/spark/ml/feature/Instance.scala | 2 +- .../apache/spark/ml/feature/Interaction.scala | 2 +- .../apache/spark/ml/feature/LabeledPoint.scala | 38 .../apache/spark/ml/feature/MaxAbsScaler.scala | 8 
+- .../apache/spark/ml/feature/MinMaxScaler.scala | 9 +- .../apache/spark/ml/feature/Normalizer.scala| 5 +- .../apache/spark/ml/feature/OneHotEncoder.scala | 2 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 21 +- .../spark/ml/feature/PolynomialExpansion.scala | 2 +- .../org/apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/RFormulaParser.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 15 +- .../spark/ml/feature/VectorAssembler.scala | 2 +- .../apache/spark/ml/feature/VectorIndexer.scala | 2 +- .../apache/spark/ml/feature/VectorSlicer.scala | 2 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 3 +- .../org/apache/spark/ml/linalg/VectorUDT.scala | 2 +- .../IterativelyReweightedLeastSquares.scala | 2 +- .../spark/ml/optim/WeightedLeastSquares.scala | 3 +- .../org/apache/spark/ml/param/params.scala | 7 +-
[2/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 69650eb..a1b4853 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,18 +17,19 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ import scala.language.existentials import scala.util.Random +import scala.util.control.Breaks._ import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.classification.LogisticRegressionSuite._ +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.classification.LogisticRegressionSuite._ -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.lit @@ -967,4 +968,122 @@ object LogisticRegressionSuite { "standardization" -> false, "threshold" -> 0.6 ) + + def generateLogisticInputAsList( +offset: Double, +scale: Double, +nPoints: Int, +seed: Int): java.util.List[LabeledPoint] = { +generateLogisticInput(offset, scale, nPoints, seed).asJava + } + + // Generate input of the form Y = logistic(offset + scale*X) + def generateLogisticInput( + offset: Double, + scale: Double, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) +val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) + +val y = (0 until nPoints).map { i => + val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i + if (rnd.nextDouble() < p) 1.0 else 0.0 +} + +val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i) +testData + } + + /** + * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the + * model weights and mean/variance of the features. The synthetic data will be drawn from + * the probability distribution constructed by weights using the following formula. + * + * P(y = 0 | x) = 1 / norm + * P(y = 1 | x) = exp(x * w_1) / norm + * P(y = 2 | x) = exp(x * w_2) / norm + * ... + * P(y = k-1 | x) = exp(x * w_{k-1}) / norm + * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1}) + * + * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector + *will be (k - 1) * (n + 1) if `addIntercept == true`, and + *if `addIntercept != true`, the dimension will be (k - 1) * n. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param addIntercept whether to add intercept. + * @param nPoints the number of instance of generated data. + * @param seed the seed for random generator. 
For consistent testing result, it will be fixed. + */ + def generateMultinomialLogisticInput( + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + addIntercept: Boolean, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) + +val xDim = xMean.length +val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim +val nClasses = weights.length / xWithInterceptsDim + 1 + +val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian( + +x.foreach { vector => + // This doesn't work if `vector` is a sparse vector. + val vectorArray = vector.toArray + var i = 0 + val len = vectorArray.length + while (i < len) { +vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) +i += 1 + } +} + +val y = (0 until nPoints).map { idx => + val xArray = x(idx).toArray + val margins = Array.ofDim[Double](nClasses) + val probs = Array.ofDim[Double](nClasses) + + for (i <- 0 until nClasses - 1) { +for (j <- 0 until xDim) margins(i +
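The scaladoc added above defines the synthetic class probabilities as P(y = 0 | x) = 1 / norm and P(y = i | x) = exp(x * w_i) / norm, with norm = 1 + exp(x * w_1) + ... + exp(x * w_{k-1}). A small self-contained sketch of just that formula follows; it is independent of the test suite, and the object name, weights, and feature values are made up for illustration.

```scala
object MultinomialProbSketch {
  // weights(i) holds the coefficients of class i + 1; class 0 is the reference class.
  def classProbabilities(x: Array[Double], weights: Array[Array[Double]]): Array[Double] = {
    val margins = weights.map(w => w.zip(x).map { case (wi, xi) => wi * xi }.sum)
    val norm = 1.0 + margins.map(math.exp).sum
    // P(y = 0 | x) first, then P(y = i | x) for each remaining class.
    (1.0 / norm) +: margins.map(m => math.exp(m) / norm)
  }

  def main(args: Array[String]): Unit = {
    val probs = classProbabilities(Array(0.5, -1.0), Array(Array(1.0, 2.0), Array(-0.5, 0.3)))
    println(probs.mkString(", "))  // three class probabilities
    println(probs.sum)             // ~= 1.0
  }
}
```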
[3/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 626e97e..9d084b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -21,11 +21,14 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import org.apache.spark.mllib.linalg.VectorImplicits._ +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} @@ -93,7 +96,9 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM @Since("2.0.0") override def fit(dataset: Dataset[_]): StandardScalerModel = { transformSchema(dataset.schema, logging = true) -val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } +val input: RDD[OldVector] = dataset.select($(inputCol)).rdd.map { + case Row(v: Vector) => OldVectors.fromML(v) +} val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this)) @@ -145,7 +150,11 @@ class StandardScalerModel private[ml] ( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val scaler = new feature.StandardScalerModel(std, mean, $(withStd), $(withMean)) -val scale = udf { scaler.transform _ } + +// TODO: Make the transformer natively in ml framework to avoid extra conversion. 
+val transformer: Vector => Vector = v => scaler.transform(OldVectors.fromML(v)).asML + +val scale = udf(transformer) dataset.withColumn($(outputCol), scale(col($(inputCol } http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 4d3e46e..1bc2420 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -23,10 +23,10 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute, UnresolvedAttribute} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 68b699d..2bc9d22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -27,10 +27,10 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StructField, StructType}
[1/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
Repository: spark Updated Branches: refs/heads/master 9f176dd39 -> e2efe0529 http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala index 49cb7e1..441d0f7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala @@ -73,7 +73,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { maxBins = 100, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3)) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -100,7 +100,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { maxDepth = 2, maxBins = 100, categoricalFeaturesInfo = Map(0 -> 2, 1 -> 2)) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -116,7 +116,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Gini, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -133,7 +133,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Gini, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -150,7 +150,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Entropy, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -167,7 +167,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Entropy, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -183,7 +183,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4, numClasses = 3, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3)) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val 
metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(strategy.isMulticlassClassification) assert(metadata.isUnordered(featureIndex = 0)) assert(metadata.isUnordered(featureIndex = 1)) @@ -240,7 +240,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { numClasses = 3, maxBins = maxBins, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3)) assert(strategy.isMulticlassClassification) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(metadata.isUnordered(featureIndex = 0)) assert(metadata.isUnordered(featureIndex = 1)) @@ -288,7 +288,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4, numClasses = 3, maxBins = 100, categoricalFeaturesInfo = Map(0 -> 3)) assert(strategy.isMulticlassClassification) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy)
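In the hunks above, the suite feeds the ml-side `DecisionTreeMetadata` by mapping the old `mllib.regression.LabeledPoint` RDD through `asML`. A hedged sketch of the same bridge is below; it rebuilds each point from its public pieces (the label plus `features.asML`) rather than relying on a `LabeledPoint.asML` helper, since that helper may not be visible outside Spark's own packages. The session settings and data are illustrative.

```scala
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object LabeledPointBridgeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("lp-bridge").getOrCreate()
    val sc = spark.sparkContext

    val oldPoints: RDD[LabeledPoint] = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
      LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))

    // Same idea as the suite's rdd.map(_.asML): rebuild each point with the new ml vector type.
    val newPoints: RDD[NewLabeledPoint] =
      oldPoints.map(lp => NewLabeledPoint(lp.label, lp.features.asML))

    newPoints.collect().foreach(println)
    spark.stop()
  }
}
```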
[4/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
[SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms ## What changes were proposed in this pull request? Once SPARK-14487 and SPARK-14549 are merged, we will migrate to use the new vector and matrix type in the new ml pipeline based apis. ## How was this patch tested? Unit tests Author: DB TsaiAuthor: Liang-Chi Hsieh Author: Xiangrui Meng Closes #12627 from dbtsai/SPARK-14615-NewML. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2efe052 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2efe052 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2efe052 Branch: refs/heads/master Commit: e2efe0529acd748f26dbaa41331d1733ed256237 Parents: 9f176dd Author: DB Tsai Authored: Tue May 17 12:51:07 2016 -0700 Committer: Xiangrui Meng Committed: Tue May 17 12:51:07 2016 -0700 -- dev/sparktestsupport/modules.py | 1 + .../examples/ml/JavaBisectingKMeansExample.java | 2 +- .../examples/ml/JavaDeveloperApiExample.java| 8 +- .../ml/JavaElementwiseProductExample.java | 6 +- .../spark/examples/ml/JavaKMeansExample.java| 2 +- .../ml/AFTSurvivalRegressionExample.scala | 2 +- .../examples/ml/ChiSqSelectorExample.scala | 2 +- .../apache/spark/examples/ml/DCTExample.scala | 2 +- .../spark/examples/ml/DataFrameExample.scala| 3 +- .../spark/examples/ml/DecisionTreeExample.scala | 2 +- .../spark/examples/ml/DeveloperApiExample.scala | 4 +- .../examples/ml/ElementwiseProductExample.scala | 2 +- .../ml/EstimatorTransformerParamExample.scala | 2 +- ...odelSelectionViaCrossValidationExample.scala | 2 +- .../apache/spark/examples/ml/PCAExample.scala | 2 +- .../spark/examples/ml/PipelineExample.scala | 2 +- .../ml/PolynomialExpansionExample.scala | 2 +- .../spark/examples/ml/SimpleParamsExample.scala | 4 +- .../ml/SimpleTextClassificationPipeline.scala | 2 +- .../examples/ml/VectorAssemblerExample.scala| 2 +- .../spark/examples/ml/VectorSlicerExample.scala | 2 +- .../scala/org/apache/spark/ml/Predictor.scala | 4 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 22 ++- .../spark/ml/attribute/AttributeGroup.scala | 2 +- .../spark/ml/classification/Classifier.scala| 4 +- .../classification/DecisionTreeClassifier.scala | 4 +- .../spark/ml/classification/GBTClassifier.scala | 4 +- .../ml/classification/LogisticRegression.scala | 5 +- .../MultilayerPerceptronClassifier.scala| 4 +- .../spark/ml/classification/NaiveBayes.scala| 9 +- .../spark/ml/classification/OneVsRest.scala | 2 +- .../ProbabilisticClassifier.scala | 2 +- .../classification/RandomForestClassifier.scala | 4 +- .../spark/ml/clustering/BisectingKMeans.scala | 16 +- .../spark/ml/clustering/GaussianMixture.scala | 9 +- .../org/apache/spark/ml/clustering/KMeans.scala | 18 +- .../org/apache/spark/ml/clustering/LDA.scala| 18 +- .../BinaryClassificationEvaluator.scala | 2 +- .../org/apache/spark/ml/feature/Binarizer.scala | 2 +- .../apache/spark/ml/feature/ChiSqSelector.scala | 21 +- .../spark/ml/feature/CountVectorizer.scala | 2 +- .../scala/org/apache/spark/ml/feature/DCT.scala | 2 +- .../spark/ml/feature/ElementwiseProduct.scala | 6 +- .../org/apache/spark/ml/feature/HashingTF.scala | 3 +- .../scala/org/apache/spark/ml/feature/IDF.scala | 15 +- .../org/apache/spark/ml/feature/Instance.scala | 2 +- .../apache/spark/ml/feature/Interaction.scala | 2 +- .../apache/spark/ml/feature/LabeledPoint.scala | 38 .../apache/spark/ml/feature/MaxAbsScaler.scala | 8 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 9 +- 
.../apache/spark/ml/feature/Normalizer.scala| 5 +- .../apache/spark/ml/feature/OneHotEncoder.scala | 2 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 21 +- .../spark/ml/feature/PolynomialExpansion.scala | 2 +- .../org/apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/RFormulaParser.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 15 +- .../spark/ml/feature/VectorAssembler.scala | 2 +- .../apache/spark/ml/feature/VectorIndexer.scala | 2 +- .../apache/spark/ml/feature/VectorSlicer.scala | 2 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 3 +- .../org/apache/spark/ml/linalg/VectorUDT.scala | 2 +- .../IterativelyReweightedLeastSquares.scala | 2 +- .../spark/ml/optim/WeightedLeastSquares.scala | 3 +- .../org/apache/spark/ml/param/params.scala | 7 +- .../ml/regression/AFTSurvivalRegression.scala | 3 +- .../ml/regression/DecisionTreeRegressor.scala | 4 +-
[2/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 69650eb..a1b4853 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,18 +17,19 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ import scala.language.existentials import scala.util.Random +import scala.util.control.Breaks._ import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.classification.LogisticRegressionSuite._ +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.classification.LogisticRegressionSuite._ -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.lit @@ -967,4 +968,122 @@ object LogisticRegressionSuite { "standardization" -> false, "threshold" -> 0.6 ) + + def generateLogisticInputAsList( +offset: Double, +scale: Double, +nPoints: Int, +seed: Int): java.util.List[LabeledPoint] = { +generateLogisticInput(offset, scale, nPoints, seed).asJava + } + + // Generate input of the form Y = logistic(offset + scale*X) + def generateLogisticInput( + offset: Double, + scale: Double, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) +val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) + +val y = (0 until nPoints).map { i => + val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i + if (rnd.nextDouble() < p) 1.0 else 0.0 +} + +val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i) +testData + } + + /** + * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the + * model weights and mean/variance of the features. The synthetic data will be drawn from + * the probability distribution constructed by weights using the following formula. + * + * P(y = 0 | x) = 1 / norm + * P(y = 1 | x) = exp(x * w_1) / norm + * P(y = 2 | x) = exp(x * w_2) / norm + * ... + * P(y = k-1 | x) = exp(x * w_{k-1}) / norm + * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1}) + * + * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector + *will be (k - 1) * (n + 1) if `addIntercept == true`, and + *if `addIntercept != true`, the dimension will be (k - 1) * n. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param addIntercept whether to add intercept. + * @param nPoints the number of instance of generated data. + * @param seed the seed for random generator. 
For consistent testing result, it will be fixed. + */ + def generateMultinomialLogisticInput( + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + addIntercept: Boolean, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) + +val xDim = xMean.length +val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim +val nClasses = weights.length / xWithInterceptsDim + 1 + +val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian( + +x.foreach { vector => + // This doesn't work if `vector` is a sparse vector. + val vectorArray = vector.toArray + var i = 0 + val len = vectorArray.length + while (i < len) { +vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) +i += 1 + } +} + +val y = (0 until nPoints).map { idx => + val xArray = x(idx).toArray + val margins = Array.ofDim[Double](nClasses) + val probs = Array.ofDim[Double](nClasses) + + for (i <- 0 until nClasses - 1) { +for (j <- 0 until xDim) margins(i +
spark git commit: [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc.
Repository: spark Updated Branches: refs/heads/branch-2.0 025b3e9f1 -> 1ad3bbd0a [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc. ## What changes were proposed in this pull request? According to the recent change, this PR replaces all the remaining `sqlContext` usage with `spark` in ScalaDoc/JavaDoc (.scala/.java files) except `SQLContext.scala`, `SparkPlan.scala', and `DatasetHolder.scala`. ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #13125 from dongjoon-hyun/minor_doc_sparksession. (cherry picked from commit 9f176dd3918129a72282a6b7a12e2899cbb6dac9) Signed-off-by: Nick Pentreath Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ad3bbd0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ad3bbd0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ad3bbd0 Branch: refs/heads/branch-2.0 Commit: 1ad3bbd0a4c7d4ab4aad0806f345d3904d7cd441 Parents: 025b3e9 Author: Dongjoon Hyun Authored: Tue May 17 20:50:22 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 20:50:47 2016 +0200 -- .../main/scala/org/apache/spark/ml/feature/package.scala | 2 +- .../main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 +- .../scala/org/apache/spark/sql/ExperimentalMethods.scala | 2 +- .../datasources/PartitioningAwareFileCatalog.scala| 8 .../src/main/scala/org/apache/spark/sql/functions.scala | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1ad3bbd0/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala index 4571ab2..b94187a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.DataFrame * import org.apache.spark.ml.Pipeline * * // a DataFrame with three columns: id (integer), text (string), and rating (double). - * val df = sqlContext.createDataFrame(Seq( + * val df = spark.createDataFrame(Seq( * (0, "Hi I heard about Spark", 3.0), * (1, "I wish Java could use case classes", 4.0), * (2, "Logistic regression models are neat", 4.0) http://git-wip-us.apache.org/repos/asf/spark/blob/1ad3bbd0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e1a64df..011aff4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -446,10 +446,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Each line in the text file is a new row in the resulting Dataset. 
For example: * {{{ * // Scala: - * sqlContext.read.text("/path/to/spark/README.md") + * spark.read.text("/path/to/spark/README.md") * * // Java: - * sqlContext.read().text("/path/to/spark/README.md") + * spark.read().text("/path/to/spark/README.md") * }}} * * @param paths input path http://git-wip-us.apache.org/repos/asf/spark/blob/1ad3bbd0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 3eb1f0f..1855eab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -160,8 +160,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return A DataFrame containing for the contingency table. * * {{{ - *val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") + *val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3))) + * .toDF("key", "value") *val ct = df.stat.crosstab("key", "value") *ct.show() *+-+---+---+---+ @@ -197,7 +197,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { *
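For readers following along, the convention this doc patch standardizes on is that `spark` denotes a `SparkSession`, the 2.0 entry point that replaces `sqlContext` (and is predefined in spark-shell). A brief hedged sketch of creating such a session and using the updated example call is below; the master setting, app name, and file path are placeholders.

```scala
import org.apache.spark.sql.SparkSession

object SparkSessionDocSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("doc-example")
      .getOrCreate()

    // The form the updated ScalaDoc shows: spark.read instead of sqlContext.read.
    val lines = spark.read.text("README.md")  // point this at any text file
    println(lines.count())

    spark.stop()
  }
}
```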
spark git commit: [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc.
Repository: spark Updated Branches: refs/heads/master 3308a862b -> 9f176dd39 [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc. ## What changes were proposed in this pull request? According to the recent change, this PR replaces all the remaining `sqlContext` usage with `spark` in ScalaDoc/JavaDoc (.scala/.java files) except `SQLContext.scala`, `SparkPlan.scala', and `DatasetHolder.scala`. ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #13125 from dongjoon-hyun/minor_doc_sparksession. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f176dd3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f176dd3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f176dd3 Branch: refs/heads/master Commit: 9f176dd3918129a72282a6b7a12e2899cbb6dac9 Parents: 3308a86 Author: Dongjoon Hyun Authored: Tue May 17 20:50:22 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 20:50:22 2016 +0200 -- .../main/scala/org/apache/spark/ml/feature/package.scala | 2 +- .../main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 +- .../scala/org/apache/spark/sql/ExperimentalMethods.scala | 2 +- .../datasources/PartitioningAwareFileCatalog.scala| 8 .../src/main/scala/org/apache/spark/sql/functions.scala | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f176dd3/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala index 4571ab2..b94187a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.DataFrame * import org.apache.spark.ml.Pipeline * * // a DataFrame with three columns: id (integer), text (string), and rating (double). - * val df = sqlContext.createDataFrame(Seq( + * val df = spark.createDataFrame(Seq( * (0, "Hi I heard about Spark", 3.0), * (1, "I wish Java could use case classes", 4.0), * (2, "Logistic regression models are neat", 4.0) http://git-wip-us.apache.org/repos/asf/spark/blob/9f176dd3/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e1a64df..011aff4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -446,10 +446,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Each line in the text file is a new row in the resulting Dataset. 
For example: * {{{ * // Scala: - * sqlContext.read.text("/path/to/spark/README.md") + * spark.read.text("/path/to/spark/README.md") * * // Java: - * sqlContext.read().text("/path/to/spark/README.md") + * spark.read().text("/path/to/spark/README.md") * }}} * * @param paths input path http://git-wip-us.apache.org/repos/asf/spark/blob/9f176dd3/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 3eb1f0f..1855eab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -160,8 +160,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return A DataFrame containing for the contingency table. * * {{{ - *val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") + *val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3))) + * .toDF("key", "value") *val ct = df.stat.crosstab("key", "value") *ct.show() *+-+---+---+---+ @@ -197,7 +197,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { *val rows = Seq.tabulate(100) { i => * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) *} - *val df =
spark git commit: [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query
Repository: spark Updated Branches: refs/heads/branch-2.0 adc1c2685 -> af37bdd3a [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query ## What changes were proposed in this pull request? Currently, `INSERT INTO` with `GROUP BY` query tries to make at least 200 files (default value of `spark.sql.shuffle.partition`), which results in lots of empty files. This PR makes it avoid creating empty files during overwriting into Hive table and in internal data sources with group by query. This checks whether the given partition has data in it or not and creates/writes file only when it actually has data. ## How was this patch tested? Unittests in `InsertIntoHiveTableSuite` and `HadoopFsRelationTest`. Closes #8411 Author: hyukjinkwonAuthor: Keuntae Park Closes #12855 from HyukjinKwon/pr/8411. (cherry picked from commit 8d05a7a98bdbd3ce7c81d273e05a375877ebe68f) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af37bdd3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af37bdd3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af37bdd3 Branch: refs/heads/branch-2.0 Commit: af37bdd3a7cee5206f98b3a2ba9113e71b53a2f4 Parents: adc1c26 Author: hyukjinkwon Authored: Tue May 17 11:18:51 2016 -0700 Committer: Michael Armbrust Committed: Tue May 17 11:21:06 2016 -0700 -- .../execution/datasources/WriterContainer.scala | 221 ++- .../spark/sql/hive/hiveWriterContainers.scala | 24 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 41 +++- .../sql/sources/HadoopFsRelationTest.scala | 22 +- 4 files changed, 182 insertions(+), 126 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af37bdd3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 3b064a5..7e12bbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -239,48 +239,50 @@ private[sql] class DefaultWriterContainer( extends BaseWriterContainer(relation, job, isAppend) { def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { -executorSideSetup(taskContext) -val configuration = taskAttemptContext.getConfiguration -configuration.set("spark.sql.sources.output.path", outputPath) -var writer = newOutputWriter(getWorkPath) -writer.initConverter(dataSchema) - -// If anything below fails, we should abort the task. -try { - Utils.tryWithSafeFinallyAndFailureCallbacks { -while (iterator.hasNext) { - val internalRow = iterator.next() - writer.writeInternal(internalRow) -} -commitTask() - }(catchBlock = abortTask()) -} catch { - case t: Throwable => -throw new SparkException("Task failed while writing rows", t) -} +if (iterator.hasNext) { + executorSideSetup(taskContext) + val configuration = taskAttemptContext.getConfiguration + configuration.set("spark.sql.sources.output.path", outputPath) + var writer = newOutputWriter(getWorkPath) + writer.initConverter(dataSchema) -def commitTask(): Unit = { + // If anything below fails, we should abort the task. 
try { -if (writer != null) { - writer.close() - writer = null -} -super.commitTask() +Utils.tryWithSafeFinallyAndFailureCallbacks { + while (iterator.hasNext) { +val internalRow = iterator.next() +writer.writeInternal(internalRow) + } + commitTask() +}(catchBlock = abortTask()) } catch { -case cause: Throwable => - // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and - // will cause `abortTask()` to be invoked. - throw new RuntimeException("Failed to commit task", cause) +case t: Throwable => + throw new SparkException("Task failed while writing rows", t) } -} -def abortTask(): Unit = { - try { -if (writer != null) { - writer.close() + def commitTask(): Unit = { +try { + if (writer != null) { +writer.close() +writer = null + } + super.commitTask() +
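The core of the change above is the new `if (iterator.hasNext)` guard: a task only sets up an output writer when its partition actually has rows, so shuffle partitions left empty by a GROUP BY no longer produce empty files. A simplified, self-contained sketch of that pattern follows (not the WriterContainer code; the dataset and temp-file handling are illustrative).

```scala
import java.io.{File, PrintWriter}
import org.apache.spark.sql.SparkSession

object SkipEmptyPartitionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[4]").appName("skip-empty").getOrCreate()
    val sc = spark.sparkContext

    // Two records spread over eight partitions: most partitions are empty.
    val data = sc.parallelize(Seq("a", "b"), numSlices = 8)

    data.foreachPartition { iter =>
      if (iter.hasNext) {  // the key check: do nothing at all for an empty partition
        val out = File.createTempFile("skip-empty-demo", ".txt")
        val writer = new PrintWriter(out)
        try iter.foreach(line => writer.println(line)) finally writer.close()
      }
    }

    spark.stop()
  }
}
```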
spark git commit: [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query
Repository: spark Updated Branches: refs/heads/master 20a89478e -> 8d05a7a98 [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query ## What changes were proposed in this pull request? Currently, `INSERT INTO` with `GROUP BY` query tries to make at least 200 files (default value of `spark.sql.shuffle.partition`), which results in lots of empty files. This PR makes it avoid creating empty files during overwriting into Hive table and in internal data sources with group by query. This checks whether the given partition has data in it or not and creates/writes file only when it actually has data. ## How was this patch tested? Unittests in `InsertIntoHiveTableSuite` and `HadoopFsRelationTest`. Closes #8411 Author: hyukjinkwonAuthor: Keuntae Park Closes #12855 from HyukjinKwon/pr/8411. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d05a7a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d05a7a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d05a7a9 Branch: refs/heads/master Commit: 8d05a7a98bdbd3ce7c81d273e05a375877ebe68f Parents: 20a8947 Author: hyukjinkwon Authored: Tue May 17 11:18:51 2016 -0700 Committer: Michael Armbrust Committed: Tue May 17 11:18:51 2016 -0700 -- .../execution/datasources/WriterContainer.scala | 221 ++- .../spark/sql/hive/hiveWriterContainers.scala | 24 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 41 +++- .../sql/sources/HadoopFsRelationTest.scala | 22 +- 4 files changed, 182 insertions(+), 126 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8d05a7a9/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 3b064a5..7e12bbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -239,48 +239,50 @@ private[sql] class DefaultWriterContainer( extends BaseWriterContainer(relation, job, isAppend) { def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { -executorSideSetup(taskContext) -val configuration = taskAttemptContext.getConfiguration -configuration.set("spark.sql.sources.output.path", outputPath) -var writer = newOutputWriter(getWorkPath) -writer.initConverter(dataSchema) - -// If anything below fails, we should abort the task. -try { - Utils.tryWithSafeFinallyAndFailureCallbacks { -while (iterator.hasNext) { - val internalRow = iterator.next() - writer.writeInternal(internalRow) -} -commitTask() - }(catchBlock = abortTask()) -} catch { - case t: Throwable => -throw new SparkException("Task failed while writing rows", t) -} +if (iterator.hasNext) { + executorSideSetup(taskContext) + val configuration = taskAttemptContext.getConfiguration + configuration.set("spark.sql.sources.output.path", outputPath) + var writer = newOutputWriter(getWorkPath) + writer.initConverter(dataSchema) -def commitTask(): Unit = { + // If anything below fails, we should abort the task. 
try { -if (writer != null) { - writer.close() - writer = null -} -super.commitTask() +Utils.tryWithSafeFinallyAndFailureCallbacks { + while (iterator.hasNext) { +val internalRow = iterator.next() +writer.writeInternal(internalRow) + } + commitTask() +}(catchBlock = abortTask()) } catch { -case cause: Throwable => - // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and - // will cause `abortTask()` to be invoked. - throw new RuntimeException("Failed to commit task", cause) +case t: Throwable => + throw new SparkException("Task failed while writing rows", t) } -} -def abortTask(): Unit = { - try { -if (writer != null) { - writer.close() + def commitTask(): Unit = { +try { + if (writer != null) { +writer.close() +writer = null + } + super.commitTask() +} catch { + case cause: Throwable => +// This exception will be handled in
spark git commit: [SPARK-14346][SQL][FOLLOW-UP] add tests for CREATE TABLE USING with partition and bucket
Repository: spark Updated Branches: refs/heads/branch-2.0 110876b9a -> adc1c2685 [SPARK-14346][SQL][FOLLOW-UP] add tests for CREAT TABLE USING with partition and bucket ## What changes were proposed in this pull request? https://github.com/apache/spark/pull/12781 introduced PARTITIONED BY, CLUSTERED BY, and SORTED BY keywords to CREATE TABLE USING. This PR adds tests to make sure those keywords are handled correctly. This PR also fixes a mistake that we should create non-hive-compatible table if partition or bucket info exists. ## How was this patch tested? N/A Author: Wenchen FanCloses #13144 from cloud-fan/add-test. (cherry picked from commit 20a89478e168cb6901ef89f4cb6aa79193ed244a) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/adc1c268 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/adc1c268 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/adc1c268 Branch: refs/heads/branch-2.0 Commit: adc1c2685ea0cfbf23716a4199b85c65021d15c6 Parents: 110876b Author: Wenchen Fan Authored: Tue May 17 10:12:51 2016 -0700 Committer: Yin Huai Committed: Tue May 17 10:14:00 2016 -0700 -- .../command/createDataSourceTables.scala| 11 +++- .../sql/execution/command/DDLCommandSuite.scala | 53 .../spark/sql/execution/command/DDLSuite.scala | 44 3 files changed, 106 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/adc1c268/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 7d3c5257..70e5108 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -399,8 +399,8 @@ object CreateDataSourceTableUtils extends Logging { "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." (None, message) - case (Some(serde), relation: HadoopFsRelation) -if relation.location.paths.length == 1 && relation.partitionSchema.isEmpty => + case (Some(serde), relation: HadoopFsRelation) if relation.location.paths.length == 1 && +relation.partitionSchema.isEmpty && relation.bucketSpec.isEmpty => val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) val message = s"Persisting data source relation $qualifiedTableName with a single input path " + @@ -415,6 +415,13 @@ object CreateDataSourceTableUtils extends Logging { "Input path(s): " + relation.location.paths.mkString("\n", "\n", "") (None, message) + case (Some(serde), relation: HadoopFsRelation) if relation.bucketSpec.nonEmpty => +val message = + s"Persisting bucketed data source relation $qualifiedTableName into " + +"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. 
" + +"Input path(s): " + relation.location.paths.mkString("\n", "\n", "") +(None, message) + case (Some(serde), relation: HadoopFsRelation) => val message = s"Persisting data source relation $qualifiedTableName with multiple input paths into " + http://git-wip-us.apache.org/repos/asf/spark/blob/adc1c268/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 13df449..897170e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -24,7 +24,9 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser +import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{IntegerType, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) class DDLCommandSuite extends PlanTest { @@
spark git commit: [SPARK-15165] [SQL] Codegen can break because toCommentSafeString is not actually safe
Repository: spark Updated Branches: refs/heads/branch-2.0 670f48222 -> 110876b9a [SPARK-15165] [SQL] Codegen can break because toCommentSafeString is not actually safe ## What changes were proposed in this pull request? toCommentSafeString method replaces "\u" with "u" to avoid codegen breaking. But if the even number of "\" is put before "u", like "u", in the string literal in the query, codegen can break. Following code causes compilation error. ``` val df = Seq(...).toDF df.select("'u002A/'").show ``` The reason of the compilation error is because "u002A/" is translated into "*/" (the end of comment). Due to this unsafety, arbitrary code can be injected like as follows. ``` val df = Seq(...).toDF // Inject "System.exit(1)" df.select("'u002A/{System.exit(1);}/*'").show ``` ## How was this patch tested? Added new test cases. Author: Kousuke SarutaAuthor: sarutak Closes #12939 from sarutak/SPARK-15165. (cherry picked from commit c0c3ec35476c756e569a1f34c4b258eb0490585c) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/110876b9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/110876b9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/110876b9 Branch: refs/heads/branch-2.0 Commit: 110876b9afe5e4205062fd8e8979e096e585737d Parents: 670f482 Author: Kousuke Saruta Authored: Tue May 17 10:07:01 2016 -0700 Committer: Davies Liu Committed: Tue May 17 10:08:52 2016 -0700 -- .../spark/sql/catalyst/util/package.scala | 13 +- .../expressions/CodeGenerationSuite.scala | 44 .../org/apache/spark/sql/SQLQuerySuite.scala| 264 +++ 3 files changed, 320 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/110876b9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 3d2a624..f1d6cab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -162,7 +162,18 @@ package object util { def toCommentSafeString(str: String): String = { val len = math.min(str.length, 128) val suffix = if (str.length > len) "..." else "" -str.substring(0, len).replace("*/", "\\*\\/").replace("\\u", "u") + suffix + +// Unicode literals, like \u0022, should be escaped before +// they are put in code comment to avoid codegen breaking. +// To escape them, single "\" should be prepended to a series of "\" just before "u" +// only when the number of "\" is odd. +// For example, \u0022 should become to \\u0022 +// but \\u0022 should not become to \\\u0022 because the first backslash escapes the second one, +// and \u0022 will remain, means not escaped. +// Otherwise, the runtime Java compiler will fail to compile or code injection can be allowed. +// For details, see SPARK-15165. 
+str.substring(0, len).replace("*/", "*\\/") + .replaceAll("(^|[^\\\\])(\\\\(\\\\\\\\)*u)", "$1\\\\$2") + suffix } /* FIX ME http://git-wip-us.apache.org/repos/asf/spark/blob/110876b9/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 2082cea..db34d12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -194,4 +194,48 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { true, InternalRow(UTF8String.fromString("\\u"))) } + + test("check compilation error doesn't occur caused by specific literal") { +// The end of comment (*/) should be escaped. +GenerateUnsafeProjection.generate( + Literal.create("*/Compilation error occurs/*", StringType) :: Nil) + +// `\u002A` is `*` and `\u002F` is `/` +// so if the end of comment consists of those characters in queries, we need to escape them. +GenerateUnsafeProjection.generate( + Literal.create("\\u002A/Compilation error
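The backslash-heavy replaceAll in the patch is hard to read at a glance. The following self-contained Scala sketch (a hypothetical CommentSafe helper, not the actual org.apache.spark.sql.catalyst.util code) spells out the same two-step escaping and shows its effect on a couple of inputs:

```scala
object CommentSafe {
  // Escape a string so it can be embedded in a generated-code comment:
  // 1. break up "*/" so the comment cannot be terminated early;
  // 2. prepend one "\" to any odd-length run of backslashes that immediately
  //    precedes "u", so the Java compiler never sees a \uXXXX unicode escape
  //    inside the comment.
  def toCommentSafeString(str: String): String = {
    val len = math.min(str.length, 128)
    val suffix = if (str.length > len) "..." else ""
    str.substring(0, len)
      .replace("*/", "*\\/")
      .replaceAll("(^|[^\\\\])(\\\\(\\\\\\\\)*u)", "$1\\\\$2") + suffix
  }

  def main(args: Array[String]): Unit = {
    println(toCommentSafeString("\\u002A/"))    // \\u002A/ : backslash doubled, no longer a unicode escape
    println(toCommentSafeString("*/ injected")) // *\/ injected : comment terminator broken up
    println(toCommentSafeString("\\\\u0022"))   // \\u0022 : already an even run, left untouched
  }
}
```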
spark git commit: [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell
Repository: spark Updated Branches: refs/heads/branch-2.0 273f3d052 -> 670f48222 [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) copy & paste example in ml-collaborative-filtering.html into spark-shell, we see the following errors. scala> case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) defined class Rating scala> object Rating { def parseRating(str: String): Rating = { | val fields = str.split("::") | assert(fields.size == 4) | Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) | } } :29: error: Rating.type does not take parameters Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) ^ In standard scala repl, it has the same error. Scala/spark-shell repl has some quirks (e.g. packages are also not well supported). The reason of errors is that scala/spark-shell repl discards previous definitions when we define the Object with the same class name. Solution: We can rename the Object Rating. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manually test it: 1). ./bin/run-example ALSExample 2). copy & paste example in the generated document. It works fine. Author: wm...@hotmail.comCloses #13110 from wangmiao1981/repl. (cherry picked from commit bebe5f9811f968db92c2d33e2b30c35cfb808a4a) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/670f4822 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/670f4822 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/670f4822 Branch: refs/heads/branch-2.0 Commit: 670f482225e20d512c2c1c1fccee5b9a7d3745b0 Parents: 273f3d0 Author: wm...@hotmail.com Authored: Tue May 17 16:51:01 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:51:07 2016 +0100 -- .../apache/spark/examples/ml/ALSExample.scala| 19 --- 1 file changed, 12 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/670f4822/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala index 6b151a6..da19ea9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -24,16 +24,21 @@ import org.apache.spark.ml.recommendation.ALS // $example off$ import org.apache.spark.sql.SparkSession +/** + * An example demonstrating ALS. 
+ * Run with + * {{{ + * bin/run-example ml.ALSExample + * }}} + */ object ALSExample { // $example on$ case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) - object Rating { -def parseRating(str: String): Rating = { - val fields = str.split("::") - assert(fields.size == 4) - Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) -} + def parseRating(str: String): Rating = { +val fields = str.split("::") +assert(fields.size == 4) +Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) } // $example off$ @@ -46,7 +51,7 @@ object ALSExample { // $example on$ val ratings = spark.read.text("data/mllib/als/sample_movielens_ratings.txt") - .map(Rating.parseRating) + .map(parseRating) .toDF() val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
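Since ALSExample.scala is only partially visible in the diff, here is a minimal spark-shell-style sketch of the shape the patch moves to; only the Rating case class and the "::"-separated input format come from the example above, and the sample line is illustrative:

```scala
// Paste-friendly shape: parseRating is a plain method instead of a member of a
// companion object named Rating, so re-pasting the snippet into spark-shell can
// no longer shadow the case class with a Rating object.
case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

def parseRating(str: String): Rating = {
  val fields = str.split("::")
  assert(fields.size == 4)
  Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
}

// e.g. parseRating("0::2::3.0::1424380312") == Rating(0, 2, 3.0f, 1424380312L)
```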
spark git commit: [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell
Repository: spark Updated Branches: refs/heads/master 932d80029 -> bebe5f981 [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) copy & paste example in ml-collaborative-filtering.html into spark-shell, we see the following errors. scala> case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) defined class Rating scala> object Rating { def parseRating(str: String): Rating = { | val fields = str.split("::") | assert(fields.size == 4) | Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) | } } :29: error: Rating.type does not take parameters Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) ^ In standard scala repl, it has the same error. Scala/spark-shell repl has some quirks (e.g. packages are also not well supported). The reason of errors is that scala/spark-shell repl discards previous definitions when we define the Object with the same class name. Solution: We can rename the Object Rating. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manually test it: 1). ./bin/run-example ALSExample 2). copy & paste example in the generated document. It works fine. Author: wm...@hotmail.comCloses #13110 from wangmiao1981/repl. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bebe5f98 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bebe5f98 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bebe5f98 Branch: refs/heads/master Commit: bebe5f9811f968db92c2d33e2b30c35cfb808a4a Parents: 932d800 Author: wm...@hotmail.com Authored: Tue May 17 16:51:01 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:51:01 2016 +0100 -- .../apache/spark/examples/ml/ALSExample.scala| 19 --- 1 file changed, 12 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bebe5f98/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala index 6b151a6..da19ea9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -24,16 +24,21 @@ import org.apache.spark.ml.recommendation.ALS // $example off$ import org.apache.spark.sql.SparkSession +/** + * An example demonstrating ALS. 
+ * Run with + * {{{ + * bin/run-example ml.ALSExample + * }}} + */ object ALSExample { // $example on$ case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) - object Rating { -def parseRating(str: String): Rating = { - val fields = str.split("::") - assert(fields.size == 4) - Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) -} + def parseRating(str: String): Rating = { +val fields = str.split("::") +assert(fields.size == 4) +Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) } // $example off$ @@ -46,7 +51,7 @@ object ALSExample { // $example on$ val ratings = spark.read.text("data/mllib/als/sample_movielens_ratings.txt") - .map(Rating.parseRating) + .map(parseRating) .toDF() val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki
Repository: spark Updated Branches: refs/heads/branch-2.0 b031ea7dc -> 273f3d052 [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki ## What changes were proposed in this pull request? See JIRA for the motivation. The changes are almost entirely movement of text and edits to sections. Minor changes to text include: - Copying in / merging text from the "Useful Developer Tools" wiki, in areas of - Docker - R - Running one test - standardizing on ./build/mvn not mvn, and likewise for ./build/sbt - correcting some typos - standardizing code block formatting No text has been removed from this doc; text has been imported from the https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools wiki ## How was this patch tested? Jekyll doc build and inspection of resulting HTML in browser. Author: Sean OwenCloses #13124 from srowen/SPARK-15333. (cherry picked from commit 932d8002931d352dd2ec87184e6c84ec5fa859cd) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/273f3d05 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/273f3d05 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/273f3d05 Branch: refs/heads/branch-2.0 Commit: 273f3d05294f8fcd8f3f4e116afcd96bd4b50920 Parents: b031ea7 Author: Sean Owen Authored: Tue May 17 16:40:38 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:40:48 2016 +0100 -- docs/building-spark.md | 295 +++- 1 file changed, 156 insertions(+), 139 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/273f3d05/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 63532c7..2c987cf 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -7,48 +7,18 @@ redirect_from: "building-with-maven.html" * This will become a table of contents (this text will be scraped). {:toc} -Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -The Spark build can supply a suitable Maven binary; see below. - -# Building with `build/mvn` - -Spark now comes packaged with a self-contained Maven installation to ease building and deployment of Spark from source located under the `build/` directory. This script will automatically download and setup all necessary build requirements ([Maven](https://maven.apache.org/), [Scala](http://www.scala-lang.org/), and [Zinc](https://github.com/typesafehub/zinc)) locally within the `build/` directory itself. It honors any `mvn` binary if present already, however, will pull down its own copy of Scala and Zinc regardless to ensure proper version requirements are met. `build/mvn` execution acts as a pass through to the `mvn` call allowing easy transition from previous build methods. As an example, one can build a version of Spark as follows: - -{% highlight bash %} -build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package -{% endhighlight %} - -Other build examples can be found below. - -**Note:** When building on an encrypted filesystem (if your home directory is encrypted, for example), then the Spark build might fail with a "Filename too long" error. As a workaround, add the following in the configuration args of the `scala-maven-plugin` in the project `pom.xml`: - --Xmax-classfile-name -128 - -and in `project/SparkBuild.scala` add: - -scalacOptions in Compile ++= Seq("-Xmax-classfile-name", "128"), - -to the `sharedSettings` val. 
See also [this PR](https://github.com/apache/spark/pull/2883/files) if you are unsure of where to add these lines. - -# Building a Runnable Distribution +# Building Apache Spark -To create a Spark distribution like those distributed by the -[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as -to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured -with Maven profile settings and so on like the direct Maven build. Example: +## Apache Maven -./dev/make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn - -For more information on usage, run `./dev/make-distribution.sh --help` +The Maven-based build is the build of reference for Apache Spark. +Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -# Setting up Maven's Memory Usage +### Setting up Maven's Memory Usage You'll need to configure Maven to use more memory than usual by setting `MAVEN_OPTS`. We recommend the following settings: -{% highlight bash %} -export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M
spark git commit: [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki
Repository: spark Updated Branches: refs/heads/master 4134ff0c6 -> 932d80029 [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki ## What changes were proposed in this pull request? See JIRA for the motivation. The changes are almost entirely movement of text and edits to sections. Minor changes to text include: - Copying in / merging text from the "Useful Developer Tools" wiki, in areas of - Docker - R - Running one test - standardizing on ./build/mvn not mvn, and likewise for ./build/sbt - correcting some typos - standardizing code block formatting No text has been removed from this doc; text has been imported from the https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools wiki ## How was this patch tested? Jekyll doc build and inspection of resulting HTML in browser. Author: Sean OwenCloses #13124 from srowen/SPARK-15333. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/932d8002 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/932d8002 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/932d8002 Branch: refs/heads/master Commit: 932d8002931d352dd2ec87184e6c84ec5fa859cd Parents: 4134ff0 Author: Sean Owen Authored: Tue May 17 16:40:38 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:40:38 2016 +0100 -- docs/building-spark.md | 295 +++- 1 file changed, 156 insertions(+), 139 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/932d8002/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 63532c7..2c987cf 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -7,48 +7,18 @@ redirect_from: "building-with-maven.html" * This will become a table of contents (this text will be scraped). {:toc} -Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -The Spark build can supply a suitable Maven binary; see below. - -# Building with `build/mvn` - -Spark now comes packaged with a self-contained Maven installation to ease building and deployment of Spark from source located under the `build/` directory. This script will automatically download and setup all necessary build requirements ([Maven](https://maven.apache.org/), [Scala](http://www.scala-lang.org/), and [Zinc](https://github.com/typesafehub/zinc)) locally within the `build/` directory itself. It honors any `mvn` binary if present already, however, will pull down its own copy of Scala and Zinc regardless to ensure proper version requirements are met. `build/mvn` execution acts as a pass through to the `mvn` call allowing easy transition from previous build methods. As an example, one can build a version of Spark as follows: - -{% highlight bash %} -build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package -{% endhighlight %} - -Other build examples can be found below. - -**Note:** When building on an encrypted filesystem (if your home directory is encrypted, for example), then the Spark build might fail with a "Filename too long" error. As a workaround, add the following in the configuration args of the `scala-maven-plugin` in the project `pom.xml`: - --Xmax-classfile-name -128 - -and in `project/SparkBuild.scala` add: - -scalacOptions in Compile ++= Seq("-Xmax-classfile-name", "128"), - -to the `sharedSettings` val. See also [this PR](https://github.com/apache/spark/pull/2883/files) if you are unsure of where to add these lines. 
- -# Building a Runnable Distribution +# Building Apache Spark -To create a Spark distribution like those distributed by the -[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as -to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured -with Maven profile settings and so on like the direct Maven build. Example: +## Apache Maven -./dev/make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn - -For more information on usage, run `./dev/make-distribution.sh --help` +The Maven-based build is the build of reference for Apache Spark. +Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -# Setting up Maven's Memory Usage +### Setting up Maven's Memory Usage You'll need to configure Maven to use more memory than usual by setting `MAVEN_OPTS`. We recommend the following settings: -{% highlight bash %} -export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" -{% endhighlight %} +export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" If you
spark git commit: [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml
Repository: spark Updated Branches: refs/heads/branch-2.0 c0bcecf91 -> b031ea7dc [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Add guide doc and examples for GaussianMixture in Spark.ml in Java, Scala and Python. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual compile and test all examples Author: wm...@hotmail.comCloses #12788 from wangmiao1981/example. (cherry picked from commit 4134ff0c657efcbf0f61eff0423215afd6132837) Signed-off-by: Nick Pentreath Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b031ea7d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b031ea7d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b031ea7d Branch: refs/heads/branch-2.0 Commit: b031ea7dc29b3e55dfaf8e8466b6d8f33cb81a3e Parents: c0bcecf Author: wm...@hotmail.com Authored: Tue May 17 15:20:47 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 15:21:04 2016 +0200 -- docs/ml-clustering.md | 82 .../examples/ml/JavaGaussianMixtureExample.java | 64 +++ .../main/python/ml/gaussian_mixture_example.py | 48 .../examples/ml/GaussianMixtureExample.scala| 58 ++ 4 files changed, 252 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b031ea7d/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index a0955a3..33e4b7b 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -148,3 +148,85 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/bisecting_k_means_example.py %} + +## Gaussian Mixture Model (GMM) + +A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +each with its own probability. The `spark.ml` implementation uses the +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +algorithm to induce the maximum-likelihood model given a set of samples. + +`GaussianMixture` is implemented as an `Estimator` and generates a `GaussianMixtureModel` as the base +model. + +### Input Columns + + + + + Param name + Type(s) + Default + Description + + + + + featuresCol + Vector + "features" + Feature vector + + + + +### Output Columns + + + + + Param name + Type(s) + Default + Description + + + + + predictionCol + Int + "prediction" + Predicted cluster center + + + probabilityCol + Vector + "probability" + Probability of each cluster + + + + +### Example + + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.GaussianMixture) for more details. + +{% include_example scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala %} + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/GaussianMixture.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java %} + + + +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.GaussianMixture) for more details. 
+ +{% include_example python/ml/gaussian_mixture_example.py %} + + http://git-wip-us.apache.org/repos/asf/spark/blob/b031ea7d/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java new file mode 100644 index 000..79b9909 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *
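The Scala file added by this commit (GaussianMixtureExample.scala) is referenced by the include_example directive but not reproduced above. A rough sketch of what a spark.ml GaussianMixture example looks like follows; the input path and k = 2 are illustrative choices, not the exact committed code:

```scala
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.sql.SparkSession

object GaussianMixtureSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("GaussianMixtureSketch").getOrCreate()

    // Any libsvm-formatted feature file works; this path is illustrative.
    val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    // Fit a mixture of two Gaussians via expectation-maximization.
    val model = new GaussianMixture().setK(2).fit(dataset)

    // Each component has a weight and a multivariate Gaussian (mean vector, covariance matrix).
    for (i <- 0 until model.getK) {
      println(s"Gaussian $i: weight=${model.weights(i)}\n" +
        s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
    }

    spark.stop()
  }
}
```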
spark git commit: [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml
Repository: spark Updated Branches: refs/heads/master c36ca651f -> 4134ff0c6 [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Add guide doc and examples for GaussianMixture in Spark.ml in Java, Scala and Python. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual compile and test all examples Author: wm...@hotmail.comCloses #12788 from wangmiao1981/example. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4134ff0c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4134ff0c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4134ff0c Branch: refs/heads/master Commit: 4134ff0c657efcbf0f61eff0423215afd6132837 Parents: c36ca65 Author: wm...@hotmail.com Authored: Tue May 17 15:20:47 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 15:20:47 2016 +0200 -- docs/ml-clustering.md | 82 .../examples/ml/JavaGaussianMixtureExample.java | 64 +++ .../main/python/ml/gaussian_mixture_example.py | 48 .../examples/ml/GaussianMixtureExample.scala| 58 ++ 4 files changed, 252 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4134ff0c/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index a0955a3..33e4b7b 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -148,3 +148,85 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/bisecting_k_means_example.py %} + +## Gaussian Mixture Model (GMM) + +A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +each with its own probability. The `spark.ml` implementation uses the +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +algorithm to induce the maximum-likelihood model given a set of samples. + +`GaussianMixture` is implemented as an `Estimator` and generates a `GaussianMixtureModel` as the base +model. + +### Input Columns + + + + + Param name + Type(s) + Default + Description + + + + + featuresCol + Vector + "features" + Feature vector + + + + +### Output Columns + + + + + Param name + Type(s) + Default + Description + + + + + predictionCol + Int + "prediction" + Predicted cluster center + + + probabilityCol + Vector + "probability" + Probability of each cluster + + + + +### Example + + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.GaussianMixture) for more details. + +{% include_example scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala %} + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/GaussianMixture.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java %} + + + +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.GaussianMixture) for more details. 
+ +{% include_example python/ml/gaussian_mixture_example.py %} + + http://git-wip-us.apache.org/repos/asf/spark/blob/4134ff0c/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java new file mode 100644 index 000..79b9909 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is
spark git commit: [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType
Repository: spark Updated Branches: refs/heads/branch-2.0 1426235bf -> c0bcecf91 [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType ## What changes were proposed in this pull request? This PR improves `RowEncoder` and `MapObjects`, to support array as the external type for `ArrayType`. The idea is straightforward, we use `Object` as the external input type for `ArrayType`, and determine its type at runtime in `MapObjects`. ## How was this patch tested? new test in `RowEncoderSuite` Author: Wenchen FanCloses #13138 from cloud-fan/map-object. (cherry picked from commit c36ca651f9177f8e7a3f6a0098cba5a810ee9deb) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0bcecf9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0bcecf9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0bcecf9 Branch: refs/heads/branch-2.0 Commit: c0bcecf914a0e0f6669a62a50e6198af38d4aac6 Parents: 1426235 Author: Wenchen Fan Authored: Tue May 17 17:02:52 2016 +0800 Committer: Wenchen Fan Committed: Tue May 17 17:03:15 2016 +0800 -- .../main/scala/org/apache/spark/sql/Row.scala | 4 +- .../sql/catalyst/encoders/RowEncoder.scala | 22 + .../catalyst/expressions/objects/objects.scala | 99 +--- .../sql/catalyst/util/GenericArrayData.scala| 5 + .../sql/catalyst/encoders/RowEncoderSuite.scala | 17 5 files changed, 92 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0bcecf9/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 726291b..a257b83 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -151,7 +151,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def apply(i: Int): Any = get(i) @@ -176,7 +176,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def get(i: Int): Any http://git-wip-us.apache.org/repos/asf/spark/blob/c0bcecf9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index ae842a9..a5f39aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -32,6 +32,26 @@ import org.apache.spark.unsafe.types.UTF8String /** * A factory for constructing encoders that convert external row to/from the Spark SQL * internal binary representation. 
+ * + * The following is a mapping between Spark SQL types and its allowed external types: + * {{{ + * BooleanType -> java.lang.Boolean + * ByteType -> java.lang.Byte + * ShortType -> java.lang.Short + * IntegerType -> java.lang.Integer + * FloatType -> java.lang.Float + * DoubleType -> java.lang.Double + * StringType -> String + * DecimalType -> java.math.BigDecimal or scala.math.BigDecimal or Decimal + * + * DateType -> java.sql.Date + * TimestampType -> java.sql.Timestamp + * + * BinaryType -> byte array + * ArrayType -> scala.collection.Seq or Array + * MapType -> scala.collection.Map + * StructType -> org.apache.spark.sql.Row or Product + * }}} */ object RowEncoder { def apply(schema: StructType): ExpressionEncoder[Row] = { @@ -166,6 +186,8 @@ object RowEncoder { // In order to support both Decimal and java/scala BigDecimal in external row, we make this // as java.lang.Object. case _: DecimalType => ObjectType(classOf[java.lang.Object]) +// In order to support both Array and Seq in external row, we make this as java.lang.Object. +case _: ArrayType =>
spark git commit: [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType
Repository: spark Updated Branches: refs/heads/master 122302cbf -> c36ca651f [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType ## What changes were proposed in this pull request? This PR improves `RowEncoder` and `MapObjects`, to support array as the external type for `ArrayType`. The idea is straightforward, we use `Object` as the external input type for `ArrayType`, and determine its type at runtime in `MapObjects`. ## How was this patch tested? new test in `RowEncoderSuite` Author: Wenchen FanCloses #13138 from cloud-fan/map-object. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c36ca651 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c36ca651 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c36ca651 Branch: refs/heads/master Commit: c36ca651f9177f8e7a3f6a0098cba5a810ee9deb Parents: 122302c Author: Wenchen Fan Authored: Tue May 17 17:02:52 2016 +0800 Committer: Wenchen Fan Committed: Tue May 17 17:02:52 2016 +0800 -- .../main/scala/org/apache/spark/sql/Row.scala | 4 +- .../sql/catalyst/encoders/RowEncoder.scala | 22 + .../catalyst/expressions/objects/objects.scala | 99 +--- .../sql/catalyst/util/GenericArrayData.scala| 5 + .../sql/catalyst/encoders/RowEncoderSuite.scala | 17 5 files changed, 92 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c36ca651/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 726291b..a257b83 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -151,7 +151,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def apply(i: Int): Any = get(i) @@ -176,7 +176,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def get(i: Int): Any http://git-wip-us.apache.org/repos/asf/spark/blob/c36ca651/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index ae842a9..a5f39aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -32,6 +32,26 @@ import org.apache.spark.unsafe.types.UTF8String /** * A factory for constructing encoders that convert external row to/from the Spark SQL * internal binary representation. 
+ * + * The following is a mapping between Spark SQL types and its allowed external types: + * {{{ + * BooleanType -> java.lang.Boolean + * ByteType -> java.lang.Byte + * ShortType -> java.lang.Short + * IntegerType -> java.lang.Integer + * FloatType -> java.lang.Float + * DoubleType -> java.lang.Double + * StringType -> String + * DecimalType -> java.math.BigDecimal or scala.math.BigDecimal or Decimal + * + * DateType -> java.sql.Date + * TimestampType -> java.sql.Timestamp + * + * BinaryType -> byte array + * ArrayType -> scala.collection.Seq or Array + * MapType -> scala.collection.Map + * StructType -> org.apache.spark.sql.Row or Product + * }}} */ object RowEncoder { def apply(schema: StructType): ExpressionEncoder[Row] = { @@ -166,6 +186,8 @@ object RowEncoder { // In order to support both Decimal and java/scala BigDecimal in external row, we make this // as java.lang.Object. case _: DecimalType => ObjectType(classOf[java.lang.Object]) +// In order to support both Array and Seq in external row, we make this as java.lang.Object. +case _: ArrayType => ObjectType(classOf[java.lang.Object]) case _ => externalDataTypeFor(dt) }
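To make the new ArrayType mapping concrete, here is a hedged sketch against the internal catalyst RowEncoder API as it looked in this era; RowEncoder and toRow are not stable public API, and the schema and values are illustrative:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}

object ArrayAsExternalTypeSketch {
  def main(args: Array[String]): Unit = {
    val schema = new StructType()
      .add("name", StringType)
      .add("scores", ArrayType(IntegerType))

    val encoder = RowEncoder(schema)

    // Before this change only scala.collection.Seq was accepted for an ArrayType column;
    // with it, a plain Array on the external Row is also encoded, the element type being
    // determined at runtime in MapObjects.
    val fromSeq = encoder.toRow(Row("a", Seq(1, 2, 3)))
    val fromArray = encoder.toRow(Row("b", Array(4, 5, 6)))

    println(fromSeq.numFields)   // 2
    println(fromArray.numFields) // 2
  }
}
```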
spark git commit: [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags
Repository: spark Updated Branches: refs/heads/branch-2.0 6d10b2826 -> 1426235bf [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags ## What changes were proposed in this pull request? (See https://github.com/apache/spark/pull/12416 where most of this was already reviewed and committed; this is just the module structure and move part. This change does not move the annotations into test scope, which was the apparently problem last time.) Rename `spark-test-tags` -> `spark-tags`; move common annotations like `Since` to `spark-tags` ## How was this patch tested? Jenkins tests. Author: Sean OwenCloses #13074 from srowen/SPARK-15290. (cherry picked from commit 122302cbf5cbf1133067a5acdffd6ab96765dafe) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1426235b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1426235b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1426235b Branch: refs/heads/branch-2.0 Commit: 1426235bfb1ecfa55859930913ae45d085912bf7 Parents: 6d10b28 Author: Sean Owen Authored: Tue May 17 09:55:53 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:56:09 2016 +0100 -- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 6 +-- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 + .../apache/spark/annotation/Experimental.java | 36 + .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 ++ .../apache/spark/annotation/package-info.java | 23 +++ .../org/apache/spark/annotation/package.scala | 25 common/unsafe/pom.xml | 8 ++-- core/pom.xml| 2 +- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 - .../apache/spark/annotation/Experimental.java | 36 - .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 -- .../apache/spark/annotation/package-info.java | 23 --- .../org/apache/spark/annotation/package.scala | 25 dev/sparktestsupport/modules.py | 19 ++--- external/docker-integration-tests/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/java8-tests/pom.xml| 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl/pom.xml| 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib-local/pom.xml | 4 ++ mllib/pom.xml | 2 +- pom.xml | 3 +- project/MimaExcludes.scala | 8 project/SparkBuild.scala| 10 ++--- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- yarn/pom.xml| 2 +- 42 files changed, 282 insertions(+), 264 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1426235b/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index bd507c2..5444ae6 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -66,7 +66,7 @@ org.apache.spark - spark-test-tags_${scala.binary.version} + spark-tags_${scala.binary.version} org.mockito http://git-wip-us.apache.org/repos/asf/spark/blob/1426235b/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 810ec10..e736436 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@
spark git commit: [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags
Repository: spark Updated Branches: refs/heads/master 8ad9f08c9 -> 122302cbf [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags ## What changes were proposed in this pull request? (See https://github.com/apache/spark/pull/12416 where most of this was already reviewed and committed; this is just the module structure and move part. This change does not move the annotations into test scope, which was the apparently problem last time.) Rename `spark-test-tags` -> `spark-tags`; move common annotations like `Since` to `spark-tags` ## How was this patch tested? Jenkins tests. Author: Sean OwenCloses #13074 from srowen/SPARK-15290. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/122302cb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/122302cb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/122302cb Branch: refs/heads/master Commit: 122302cbf5cbf1133067a5acdffd6ab96765dafe Parents: 8ad9f08 Author: Sean Owen Authored: Tue May 17 09:55:53 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:55:53 2016 +0100 -- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 6 +-- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 + .../apache/spark/annotation/Experimental.java | 36 + .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 ++ .../apache/spark/annotation/package-info.java | 23 +++ .../org/apache/spark/annotation/package.scala | 25 common/unsafe/pom.xml | 8 ++-- core/pom.xml| 2 +- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 - .../apache/spark/annotation/Experimental.java | 36 - .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 -- .../apache/spark/annotation/package-info.java | 23 --- .../org/apache/spark/annotation/package.scala | 25 dev/sparktestsupport/modules.py | 19 ++--- external/docker-integration-tests/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/java8-tests/pom.xml| 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl/pom.xml| 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib-local/pom.xml | 4 ++ mllib/pom.xml | 2 +- pom.xml | 3 +- project/MimaExcludes.scala | 8 project/SparkBuild.scala| 10 ++--- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- yarn/pom.xml| 2 +- 42 files changed, 282 insertions(+), 264 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/122302cb/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index bd507c2..5444ae6 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -66,7 +66,7 @@ org.apache.spark - spark-test-tags_${scala.binary.version} + spark-tags_${scala.binary.version} org.mockito http://git-wip-us.apache.org/repos/asf/spark/blob/122302cb/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 810ec10..e736436 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -80,7 +80,7 @@ org.apache.spark - spark-test-tags_${scala.binary.version} +
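As a reminder of what actually moved, the annotations themselves are applied like this; the Frobnicator class below is made up for illustration, and only the org.apache.spark.annotation import path is real (it is unchanged by this commit, the classes merely relocate to the spark-tags module):

```scala
// Placed under an org.apache.spark package because @Since is intended only for
// Spark's own source; compiling this sketch needs spark-tags on the classpath.
package org.apache.spark.sketch

import org.apache.spark.annotation.{DeveloperApi, Since}

// Illustrative class showing how Spark source marks developer APIs and the
// version in which a definition was added.
@DeveloperApi
class Frobnicator {

  @Since("2.0.0")
  def frobnicate(n: Int): Int = n * 2
}
```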
spark git commit: [SPARK-12972][CORE][TEST-MAVEN][TEST-HADOOP2.2] Update org.apache.httpcomponents.httpclient, commons-io
Repository: spark Updated Branches: refs/heads/branch-2.0 0d5e29655 -> 6d10b2826 [SPARK-12972][CORE][TEST-MAVEN][TEST-HADOOP2.2] Update org.apache.httpcomponents.httpclient, commons-io ## What changes were proposed in this pull request? This is sort of a hot-fix for https://github.com/apache/spark/pull/13117, but, the problem is limited to Hadoop 2.2. The change is to manage `commons-io` to 2.4 for all Hadoop builds, which is only a net change for Hadoop 2.2, which was using 2.1. ## How was this patch tested? Jenkins tests -- normal PR builder, then the `[test-hadoop2.2] [test-maven]` if successful. Author: Sean OwenCloses #13132 from srowen/SPARK-12972.3. (cherry picked from commit fabc8e5b128849a08d820d8c0b3425e39258e02e) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d10b282 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d10b282 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d10b282 Branch: refs/heads/branch-2.0 Commit: 6d10b28261e8f1c989d4cab9f59f5f082fd267de Parents: 0d5e296 Author: Sean Owen Authored: Mon May 16 16:27:04 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:53:38 2016 +0100 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- pom.xml| 6 ++ 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6d10b282/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index c3be6b2..91b333f 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -30,7 +30,7 @@ commons-configuration-1.6.jar commons-dbcp-1.4.jar commons-digester-1.8.jar commons-httpclient-3.1.jar -commons-io-2.1.jar +commons-io-2.4.jar commons-lang-2.6.jar commons-lang3-3.3.2.jar commons-logging-1.1.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/6d10b282/pom.xml -- diff --git a/pom.xml b/pom.xml index 40d9bf5..864824d 100644 --- a/pom.xml +++ b/pom.xml @@ -166,6 +166,7 @@ 1.1.2 1.2.0-incubating 1.10 +2.4 2.6 @@ -377,6 +378,11 @@ ${commons-lang2.version} +commons-io +commons-io +${commons-io.version} + + commons-codec commons-codec ${commons-codec.version} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12972][CORE] Update org.apache.httpcomponents.httpclient
Repository: spark Updated Branches: refs/heads/branch-2.0 8e3ee683b -> 0d5e29655 [SPARK-12972][CORE] Update org.apache.httpcomponents.httpclient ## What changes were proposed in this pull request? (Retry of https://github.com/apache/spark/pull/13049) - update to httpclient 4.5 / httpcore 4.4 - remove some defunct exclusions - manage httpmime version to match - update selenium / httpunit to support 4.5 (possible now that Jetty 9 is used) ## How was this patch tested? Jenkins tests. Also, locally running the same test command of one Jenkins profile that failed: `mvn -Phadoop-2.6 -Pyarn -Phive -Phive-thriftserver -Pkinesis-asl ...` Author: Sean OwenCloses #13117 from srowen/SPARK-12972.2. (cherry picked from commit f5576a052da0bb59343bc2a6b6ce06c6abaac75b) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d5e2965 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d5e2965 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d5e2965 Branch: refs/heads/branch-2.0 Commit: 0d5e29655f9c3758393794367c0b5d3fd395d1f6 Parents: 8e3ee68 Author: Sean Owen Authored: Sun May 15 15:56:46 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:53:13 2016 +0100 -- core/pom.xml | 11 +++--- dev/deps/spark-deps-hadoop-2.2| 4 +- dev/deps/spark-deps-hadoop-2.3| 4 +- dev/deps/spark-deps-hadoop-2.4| 4 +- dev/deps/spark-deps-hadoop-2.6| 4 +- dev/deps/spark-deps-hadoop-2.7| 4 +- external/docker-integration-tests/pom.xml | 2 - pom.xml | 54 +++--- sql/hive-thriftserver/pom.xml | 11 +++--- streaming/pom.xml | 5 +++ 10 files changed, 40 insertions(+), 63 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index c985352..4b8fb4e 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -275,12 +275,11 @@ org.seleniumhq.selenium selenium-java - - - com.google.guava - guava - - + test + + + org.seleniumhq.selenium + selenium-htmlunit-driver test http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 2477312..c3be6b2 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -69,8 +69,8 @@ hadoop-yarn-server-web-proxy-2.2.0.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -httpclient-4.3.2.jar -httpcore-4.3.2.jar +httpclient-4.5.2.jar +httpcore-4.4.4.jar ivy-2.4.0.jar jackson-annotations-2.5.3.jar jackson-core-2.5.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 0181a47..61ed4c0 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -71,8 +71,8 @@ hadoop-yarn-server-web-proxy-2.3.0.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -httpclient-4.3.2.jar -httpcore-4.3.2.jar +httpclient-4.5.2.jar +httpcore-4.4.4.jar ivy-2.4.0.jar jackson-annotations-2.5.3.jar jackson-core-2.5.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index f7ff234..fb01492 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -71,8 +71,8 @@ hadoop-yarn-server-web-proxy-2.4.0.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -httpclient-4.3.2.jar 
-httpcore-4.3.2.jar +httpclient-4.5.2.jar +httpcore-4.4.4.jar ivy-2.4.0.jar jackson-annotations-2.5.3.jar jackson-core-2.5.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 92db55d..0baf4e8 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -77,8 +77,8 @@ hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar htrace-core-3.0.4.jar -httpclient-4.3.2.jar
spark git commit: [SPARK-14906][ML] Copy linalg in PySpark to new ML package
Repository: spark Updated Branches: refs/heads/branch-2.0 0dd1f8720 -> 8e3ee683b [SPARK-14906][ML] Copy linalg in PySpark to new ML package ## What changes were proposed in this pull request? Copy the linalg (Vector/Matrix and VectorUDT/MatrixUDT) in PySpark to new ML package. ## How was this patch tested? Existing tests. Author: Xiangrui MengAuthor: Liang-Chi Hsieh Author: Liang-Chi Hsieh Closes #13099 from viirya/move-pyspark-vector-matrix-udt4. (cherry picked from commit 8ad9f08c94e98317a9095dd53d737c1b8df6e29c) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e3ee683 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e3ee683 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e3ee683 Branch: refs/heads/branch-2.0 Commit: 8e3ee683bb7ecc857480bc347e7a814e5a63ff28 Parents: 0dd1f87 Author: Xiangrui Meng Authored: Tue May 17 00:08:02 2016 -0700 Committer: Xiangrui Meng Committed: Tue May 17 00:08:15 2016 -0700 -- python/docs/pyspark.ml.rst |8 + python/pyspark/ml/linalg/__init__.py | 1145 + python/pyspark/ml/tests.py | 456 ++-- 3 files changed, 1564 insertions(+), 45 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e3ee683/python/docs/pyspark.ml.rst -- diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst index 86d4186..26f7415 100644 --- a/python/docs/pyspark.ml.rst +++ b/python/docs/pyspark.ml.rst @@ -41,6 +41,14 @@ pyspark.ml.clustering module :undoc-members: :inherited-members: +pyspark.ml.linalg module + + +.. automodule:: pyspark.ml.linalg +:members: +:undoc-members: +:inherited-members: + pyspark.ml.recommendation module http://git-wip-us.apache.org/repos/asf/spark/blob/8e3ee683/python/pyspark/ml/linalg/__init__.py -- diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py new file mode 100644 index 000..f42c589 --- /dev/null +++ b/python/pyspark/ml/linalg/__init__.py @@ -0,0 +1,1145 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +MLlib utilities for linear algebra. For dense vectors, MLlib +uses the NumPy C{array} type, so you can simply pass NumPy arrays +around. For sparse vectors, users can construct a L{SparseVector} +object from MLlib or pass SciPy C{scipy.sparse} column vectors if +SciPy is available in their environment. 
+""" + +import sys +import array +import struct + +if sys.version >= '3': +basestring = str +xrange = range +import copyreg as copy_reg +long = int +else: +from itertools import izip as zip +import copy_reg + +import numpy as np + +from pyspark import since +from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ +IntegerType, ByteType, BooleanType + + +__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', + 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] + + +if sys.version_info[:2] == (2, 7): +# speed up pickling array in Python 2.7 +def fast_pickle_array(ar): +return array.array, (ar.typecode, ar.tostring()) +copy_reg.pickle(array.array, fast_pickle_array) + + +# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, +# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. + +try: +import scipy.sparse +_have_scipy = True +except: +# No SciPy in environment, but that's okay +_have_scipy = False + + +def _convert_to_vector(l): +if isinstance(l, Vector): +return l +elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): +return DenseVector(l) +elif
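The new Python module mirrors the JVM-side org.apache.spark.ml.linalg API that lives in mllib-local; for comparison, a brief Scala sketch of the corresponding Vector and Matrix factory methods (values are illustrative):

```scala
import org.apache.spark.ml.linalg.{Matrices, Vectors}

object LinalgSketch {
  def main(args: Array[String]): Unit = {
    // Dense and sparse vectors describing the same values: [1.0, 0.0, 3.0].
    val dense = Vectors.dense(1.0, 0.0, 3.0)
    val sparse = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    println(dense.toArray.sameElements(sparse.toArray)) // true

    // 3x2 dense matrix, stored column-major.
    val m = Matrices.dense(3, 2, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))
    println(m(2, 1)) // 6.0
  }
}
```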