[2/2] spark git commit: Preparing development version 2.0.0-SNAPSHOT
Preparing development version 2.0.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5450091 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5450091 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5450091 Branch: refs/heads/branch-2.0 Commit: b54500913d49b91949716b2c41bd5f637b1795a7 Parents: 8f5a04b Author: Patrick WendellAuthored: Tue May 17 18:15:51 2016 -0700 Committer: Patrick Wendell Committed: Tue May 17 18:15:51 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- sql/hivecontext-compatibility/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 958cb45..75ac926 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 2cb86ea..5444ae6 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 58d7879..e736436 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 3f8dfe4..1fd3af2 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 03db5b8..0bd 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-preview +2.0.0-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b5450091/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e100851..14e94ec 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ 
org.apache.spark
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/2.0.0-preview [created] 8f5a04b62
[1/2] spark git commit: Preparing Spark release 2.0.0-preview
Repository: spark Updated Branches: refs/heads/branch-2.0 c8be3da66 -> b54500913 Preparing Spark release 2.0.0-preview Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f5a04b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f5a04b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f5a04b6 Branch: refs/heads/branch-2.0 Commit: 8f5a04b6299e3a47aca13cbb40e72344c0114860 Parents: c8be3da Author: Patrick WendellAuthored: Tue May 17 18:15:42 2016 -0700 Committer: Patrick Wendell Committed: Tue May 17 18:15:42 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- sql/hivecontext-compatibility/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 75ac926..958cb45 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 5444ae6..2cb86ea 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index e736436..58d7879 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 1fd3af2..3f8dfe4 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 0bd..03db5b8 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0-preview ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/8f5a04b6/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 14e94ec..e100851 100644 --- 
a/common/tags/pom.xml
spark git commit: Prepare branch for 2.0.0-preview.
Repository: spark Updated Branches: refs/heads/branch-2.0 5f5270ead -> c8be3da66 Prepare branch for 2.0.0-preview. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c8be3da6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c8be3da6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c8be3da6 Branch: refs/heads/branch-2.0 Commit: c8be3da66903899fcd743c425c25e32fc356d981 Parents: 5f5270e Author: Reynold XinAuthored: Tue May 17 18:07:59 2016 -0700 Committer: Reynold Xin Committed: Tue May 17 18:07:59 2016 -0700 -- core/src/main/scala/org/apache/spark/package.scala | 2 +- docs/_config.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c8be3da6/core/src/main/scala/org/apache/spark/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala index cc5e7ef..65a3dd2 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -43,5 +43,5 @@ package org.apache package object spark { // For package docs only - val SPARK_VERSION = "2.0.0-SNAPSHOT" + val SPARK_VERSION = "2.0.0-preview" } http://git-wip-us.apache.org/repos/asf/spark/blob/c8be3da6/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index 8bdc68a..c0a3be7 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.0.0-SNAPSHOT +SPARK_VERSION: 2.0.0-preview SPARK_VERSION_SHORT: 2.0.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.7" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
[SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable ## What changes were proposed in this pull request? Update the unit test code, examples, and documents to remove calls to deprecated method `dataset.registerTempTable`. ## How was this patch tested? This PR only changes the unit test code, examples, and comments. It should be safe. This is a follow up of PR https://github.com/apache/spark/pull/12945 which was merged. Author: Sean ZhongCloses #13098 from clockfly/spark-15171-remove-deprecation. (cherry picked from commit 25b315e6cad7c27b62dcaa2c194293c1115fdfb3) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5f5270ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5f5270ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5f5270ea Branch: refs/heads/branch-2.0 Commit: 5f5270ead86d5294af6c871e36112e2a833e9d64 Parents: 1db3741 Author: Sean Zhong Authored: Wed May 18 09:01:59 2016 +0800 Committer: Cheng Lian Committed: Wed May 18 09:05:34 2016 +0800 -- docs/sql-programming-guide.md | 48 - docs/streaming-programming-guide.md | 12 +-- .../apache/spark/examples/sql/JavaSparkSQL.java | 8 +- .../streaming/JavaSqlNetworkWordCount.java | 2 +- examples/src/main/python/sql.py | 2 +- .../python/streaming/sql_network_wordcount.py | 2 +- .../apache/spark/examples/sql/RDDRelation.scala | 6 +- .../spark/examples/sql/hive/HiveFromSpark.scala | 4 +- .../streaming/SqlNetworkWordCount.scala | 2 +- .../org/apache/spark/ml/JavaPipelineSuite.java | 2 +- .../JavaLogisticRegressionSuite.java| 10 +- .../regression/JavaLinearRegressionSuite.java | 4 +- python/pyspark/sql/context.py | 4 +- python/pyspark/sql/readwriter.py| 2 +- python/pyspark/sql/session.py | 2 +- python/pyspark/sql/tests.py | 25 ++--- .../scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../apache/spark/sql/JavaApplySchemaSuite.java | 8 +- .../spark/sql/sources/JavaSaveLoadSuite.java| 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 60 +-- .../spark/sql/ColumnExpressionSuite.scala | 2 +- .../spark/sql/DataFrameTimeWindowingSuite.scala | 2 +- .../apache/spark/sql/DataFrameWindowSuite.scala | 22 ++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 4 +- .../org/apache/spark/sql/ListTablesSuite.scala | 4 +- .../org/apache/spark/sql/SQLContextSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala| 103 ++- .../sql/ScalaReflectionRelationSuite.scala | 10 +- .../org/apache/spark/sql/SubquerySuite.scala| 8 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 12 +-- .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 10 +- .../benchmark/AggregateBenchmark.scala | 3 +- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +- .../columnar/PartitionBatchPruningSuite.scala | 2 +- .../execution/datasources/json/JsonSuite.scala | 58 +-- .../ParquetPartitionDiscoverySuite.scala| 10 +- .../datasources/parquet/ParquetQuerySuite.scala | 4 +- .../parquet/ParquetReadBenchmark.scala | 20 ++-- .../datasources/parquet/TPCDSBenchmark.scala| 2 +- .../sql/execution/metric/SQLMetricsSuite.scala | 8 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 2 +- .../sql/sources/CreateTableAsSelectSuite.scala | 2 +- .../apache/spark/sql/sources/InsertSuite.scala | 6 +- .../spark/sql/sources/SaveLoadSuite.scala | 4 +- .../spark/sql/streaming/StreamSuite.scala | 2 +- .../org/apache/spark/sql/test/SQLTestData.scala | 46 - .../spark/sql/hive/JavaDataFrameSuite.java | 2 +- 
.../sql/hive/JavaMetastoreDataSourcesSuite.java | 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 4 +- .../spark/sql/hive/HiveParquetSuite.scala | 4 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 8 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 12 +-- .../sql/hive/MetastoreDataSourcesSuite.scala| 8 +- .../hive/ParquetHiveCompatibilitySuite.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 2 +- .../apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../org/apache/spark/sql/hive/UDFSuite.scala| 2 +- .../hive/execution/AggregationQuerySuite.scala | 8 +- .../sql/hive/execution/HiveExplainSuite.scala | 2 +- .../execution/HiveOperatorQueryableSuite.scala | 4 +- .../spark/sql/hive/execution/HivePlanTest.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala
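The migration described above is mechanical: every call to the deprecated `Dataset.registerTempTable(name)` becomes `Dataset.createOrReplaceTempView(name)`, and the view remains queryable through SQL exactly as before. A minimal sketch of the change, assuming Spark 2.0 and a local `SparkSession` (the names here are illustrative, not taken from the patch):

```scala
import org.apache.spark.sql.SparkSession

object TempViewMigrationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TempViewMigrationSketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Before (deprecated in 2.0): df.registerTempTable("people")
    // After: register the DataFrame as a temporary view instead.
    df.createOrReplaceTempView("people")

    // SQL queries against the view work exactly as they did with the temp table.
    spark.sql("SELECT id, value FROM people WHERE id = 1").show()

    spark.stop()
  }
}
```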
[2/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
http://git-wip-us.apache.org/repos/asf/spark/blob/5f5270ea/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index d2e1ea1..2a5295d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -78,7 +78,7 @@ class PlannerSuite extends SharedSQLContext { val schema = StructType(fields) val row = Row.fromSeq(Seq.fill(fields.size)(null)) val rowRDD = sparkContext.parallelize(row :: Nil) -spark.createDataFrame(rowRDD, schema).registerTempTable("testLimit") +spark.createDataFrame(rowRDD, schema).createOrReplaceTempView("testLimit") val planned = sql( """ @@ -132,7 +132,7 @@ class PlannerSuite extends SharedSQLContext { test("InMemoryRelation statistics propagation") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "81920") { withTempTable("tiny") { -testData.limit(3).registerTempTable("tiny") +testData.limit(3).createOrReplaceTempView("tiny") sql("CACHE TABLE tiny") val a = testData.as("a") @@ -199,9 +199,9 @@ class PlannerSuite extends SharedSQLContext { test("PartitioningCollection") { withTempTable("normal", "small", "tiny") { - testData.registerTempTable("normal") - testData.limit(10).registerTempTable("small") - testData.limit(3).registerTempTable("tiny") + testData.createOrReplaceTempView("normal") + testData.limit(10).createOrReplaceTempView("small") + testData.limit(3).createOrReplaceTempView("tiny") // Disable broadcast join withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { http://git-wip-us.apache.org/repos/asf/spark/blob/5f5270ea/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index b31338e..bf3a39c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -134,7 +134,8 @@ class AggregateBenchmark extends BenchmarkBase { val N = 20 << 22 val benchmark = new Benchmark("Aggregate w keys", N) -sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k").registerTempTable("test") +sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k") + .createOrReplaceTempView("test") def f(): Unit = sparkSession.sql("select k, k, sum(id) from test group by k, k").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/5f5270ea/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 2099d4e..e2fb913 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -42,7 +42,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("default size avoids broadcast") { // TODO: Improve this test when we have better statistics sparkContext.parallelize(1 to 10).map(i => 
TestData(i, i.toString)) - .toDF().registerTempTable("sizeTst") + .toDF().createOrReplaceTempView("sizeTst") spark.catalog.cacheTable("sizeTst") assert( spark.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes > @@ -92,7 +92,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-2729 regression: timestamp data type") { val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time") -timestamps.registerTempTable("timestamps") +timestamps.createOrReplaceTempView("timestamps") checkAnswer( sql("SELECT time FROM timestamps"), @@ -133,7 +133,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { assert(df.schema.head.dataType === DecimalType(15, 10)) -df.cache().registerTempTable("test_fixed_decimal") +df.cache().createOrReplaceTempView("test_fixed_decimal") checkAnswer( sql("SELECT * FROM test_fixed_decimal"),
spark git commit: [SPARK-14346] Fix scala-2.10 build
Repository: spark Updated Branches: refs/heads/branch-2.0 2dddec40d -> 1db37417c [SPARK-14346] Fix scala-2.10 build ## What changes were proposed in this pull request? Scala 2.10 build was broken by #13079. I am reverting the change of that line. Author: Yin HuaiCloses #13157 from yhuai/SPARK-14346-fix-scala2.10. (cherry picked from commit 2a5db9c140b9d60a5ec91018be19bec7b80850ee) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1db37417 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1db37417 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1db37417 Branch: refs/heads/branch-2.0 Commit: 1db37417c25429c0001c19d2f10f4a314fe4585c Parents: 2dddec4 Author: Yin Huai Authored: Tue May 17 18:02:31 2016 -0700 Committer: Yin Huai Committed: Tue May 17 18:02:45 2016 -0700 -- .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1db37417/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d4f5cbb..3fdd411 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -208,6 +208,6 @@ case class SimpleCatalogRelation( } require( -metadata.identifier.database.contains(databaseName), +metadata.identifier.database == Some(databaseName), "provided database does not match the one specified in the table definition") } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14346] Fix scala-2.10 build
Repository: spark Updated Branches: refs/heads/master 25b315e6c -> 2a5db9c14 [SPARK-14346] Fix scala-2.10 build ## What changes were proposed in this pull request? Scala 2.10 build was broken by #13079. I am reverting the change of that line. Author: Yin HuaiCloses #13157 from yhuai/SPARK-14346-fix-scala2.10. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a5db9c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a5db9c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a5db9c1 Branch: refs/heads/master Commit: 2a5db9c140b9d60a5ec91018be19bec7b80850ee Parents: 25b315e Author: Yin Huai Authored: Tue May 17 18:02:31 2016 -0700 Committer: Yin Huai Committed: Tue May 17 18:02:31 2016 -0700 -- .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a5db9c1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d4f5cbb..3fdd411 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -208,6 +208,6 @@ case class SimpleCatalogRelation( } require( -metadata.identifier.database.contains(databaseName), +metadata.identifier.database == Some(databaseName), "provided database does not match the one specified in the table definition") } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
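The root cause here is a Scala standard-library difference: `Option.contains` was only introduced in Scala 2.11, so the equivalent `== Some(...)` comparison is the form that compiles on both 2.10 and 2.11. A small illustrative sketch of the equivalence (simplified types, not the actual catalog code):

```scala
object OptionContainsSketch extends App {
  // Simplified stand-in for the table identifier used in the catalog code.
  case class TableId(database: Option[String], table: String)

  def checkDatabase(id: TableId, expected: String): Unit = {
    // Compiles on Scala 2.10 and 2.11 alike, which is why the patch reverts to it.
    require(id.database == Some(expected),
      "provided database does not match the one specified in the table definition")
    // The equivalent `id.database.contains(expected)` only exists from Scala 2.11 on,
    // which is what broke the 2.10 build.
  }

  checkDatabase(TableId(Some("default"), "people"), "default")
}
```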
[2/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index d2e1ea1..2a5295d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -78,7 +78,7 @@ class PlannerSuite extends SharedSQLContext { val schema = StructType(fields) val row = Row.fromSeq(Seq.fill(fields.size)(null)) val rowRDD = sparkContext.parallelize(row :: Nil) -spark.createDataFrame(rowRDD, schema).registerTempTable("testLimit") +spark.createDataFrame(rowRDD, schema).createOrReplaceTempView("testLimit") val planned = sql( """ @@ -132,7 +132,7 @@ class PlannerSuite extends SharedSQLContext { test("InMemoryRelation statistics propagation") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "81920") { withTempTable("tiny") { -testData.limit(3).registerTempTable("tiny") +testData.limit(3).createOrReplaceTempView("tiny") sql("CACHE TABLE tiny") val a = testData.as("a") @@ -199,9 +199,9 @@ class PlannerSuite extends SharedSQLContext { test("PartitioningCollection") { withTempTable("normal", "small", "tiny") { - testData.registerTempTable("normal") - testData.limit(10).registerTempTable("small") - testData.limit(3).registerTempTable("tiny") + testData.createOrReplaceTempView("normal") + testData.limit(10).createOrReplaceTempView("small") + testData.limit(3).createOrReplaceTempView("tiny") // Disable broadcast join withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index b31338e..bf3a39c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -134,7 +134,8 @@ class AggregateBenchmark extends BenchmarkBase { val N = 20 << 22 val benchmark = new Benchmark("Aggregate w keys", N) -sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k").registerTempTable("test") +sparkSession.range(N).selectExpr("id", "floor(rand() * 1) as k") + .createOrReplaceTempView("test") def f(): Unit = sparkSession.sql("select k, k, sum(id) from test group by k, k").collect() http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 2099d4e..e2fb913 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -42,7 +42,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("default size avoids broadcast") { // TODO: Improve this test when we have better statistics sparkContext.parallelize(1 to 10).map(i => 
TestData(i, i.toString)) - .toDF().registerTempTable("sizeTst") + .toDF().createOrReplaceTempView("sizeTst") spark.catalog.cacheTable("sizeTst") assert( spark.table("sizeTst").queryExecution.analyzed.statistics.sizeInBytes > @@ -92,7 +92,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-2729 regression: timestamp data type") { val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time") -timestamps.registerTempTable("timestamps") +timestamps.createOrReplaceTempView("timestamps") checkAnswer( sql("SELECT time FROM timestamps"), @@ -133,7 +133,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { assert(df.schema.head.dataType === DecimalType(15, 10)) -df.cache().registerTempTable("test_fixed_decimal") +df.cache().createOrReplaceTempView("test_fixed_decimal") checkAnswer( sql("SELECT * FROM test_fixed_decimal"),
[3/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
[SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable ## What changes were proposed in this pull request? Update the unit test code, examples, and documents to remove calls to deprecated method `dataset.registerTempTable`. ## How was this patch tested? This PR only changes the unit test code, examples, and comments. It should be safe. This is a follow up of PR https://github.com/apache/spark/pull/12945 which was merged. Author: Sean ZhongCloses #13098 from clockfly/spark-15171-remove-deprecation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/25b315e6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/25b315e6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/25b315e6 Branch: refs/heads/master Commit: 25b315e6cad7c27b62dcaa2c194293c1115fdfb3 Parents: b674e67 Author: Sean Zhong Authored: Wed May 18 09:01:59 2016 +0800 Committer: Cheng Lian Committed: Wed May 18 09:01:59 2016 +0800 -- docs/sql-programming-guide.md | 48 - docs/streaming-programming-guide.md | 12 +-- .../apache/spark/examples/sql/JavaSparkSQL.java | 8 +- .../streaming/JavaSqlNetworkWordCount.java | 2 +- examples/src/main/python/sql.py | 2 +- .../python/streaming/sql_network_wordcount.py | 2 +- .../apache/spark/examples/sql/RDDRelation.scala | 6 +- .../spark/examples/sql/hive/HiveFromSpark.scala | 4 +- .../streaming/SqlNetworkWordCount.scala | 2 +- .../org/apache/spark/ml/JavaPipelineSuite.java | 2 +- .../JavaLogisticRegressionSuite.java| 10 +- .../regression/JavaLinearRegressionSuite.java | 4 +- python/pyspark/sql/context.py | 4 +- python/pyspark/sql/readwriter.py| 2 +- python/pyspark/sql/session.py | 2 +- python/pyspark/sql/tests.py | 25 ++--- .../scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../apache/spark/sql/JavaApplySchemaSuite.java | 8 +- .../spark/sql/sources/JavaSaveLoadSuite.java| 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 60 +-- .../spark/sql/ColumnExpressionSuite.scala | 2 +- .../spark/sql/DataFrameTimeWindowingSuite.scala | 2 +- .../apache/spark/sql/DataFrameWindowSuite.scala | 22 ++-- .../scala/org/apache/spark/sql/JoinSuite.scala | 4 +- .../org/apache/spark/sql/ListTablesSuite.scala | 4 +- .../org/apache/spark/sql/SQLContextSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala| 103 ++- .../sql/ScalaReflectionRelationSuite.scala | 10 +- .../org/apache/spark/sql/SubquerySuite.scala| 8 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 12 +-- .../apache/spark/sql/UserDefinedTypeSuite.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 10 +- .../benchmark/AggregateBenchmark.scala | 3 +- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +- .../columnar/PartitionBatchPruningSuite.scala | 2 +- .../execution/datasources/json/JsonSuite.scala | 58 +-- .../ParquetPartitionDiscoverySuite.scala| 10 +- .../datasources/parquet/ParquetQuerySuite.scala | 4 +- .../parquet/ParquetReadBenchmark.scala | 20 ++-- .../datasources/parquet/TPCDSBenchmark.scala| 2 +- .../sql/execution/metric/SQLMetricsSuite.scala | 8 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 2 +- .../sql/sources/CreateTableAsSelectSuite.scala | 2 +- .../apache/spark/sql/sources/InsertSuite.scala | 6 +- .../spark/sql/sources/SaveLoadSuite.scala | 4 +- .../spark/sql/streaming/StreamSuite.scala | 2 +- .../org/apache/spark/sql/test/SQLTestData.scala | 46 - .../spark/sql/hive/JavaDataFrameSuite.java | 2 +- .../sql/hive/JavaMetastoreDataSourcesSuite.java | 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 4 +- 
.../spark/sql/hive/HiveParquetSuite.scala | 4 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 8 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 12 +-- .../sql/hive/MetastoreDataSourcesSuite.scala| 8 +- .../hive/ParquetHiveCompatibilitySuite.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 2 +- .../apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../org/apache/spark/sql/hive/UDFSuite.scala| 2 +- .../hive/execution/AggregationQuerySuite.scala | 8 +- .../sql/hive/execution/HiveExplainSuite.scala | 2 +- .../execution/HiveOperatorQueryableSuite.scala | 4 +- .../spark/sql/hive/execution/HivePlanTest.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala | 18 ++-- .../hive/execution/HiveResolutionSuite.scala| 10 +- .../sql/hive/execution/HiveTableScanSuite.scala |
[1/3] spark git commit: [SPARK-15171][SQL] Remove the references to deprecated method dataset.registerTempTable
Repository: spark Updated Branches: refs/heads/master b674e67c2 -> 25b315e6c http://git-wip-us.apache.org/repos/asf/spark/blob/25b315e6/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ac9a393..81f3ea8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -102,14 +102,14 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-6835: udtf in lateral view") { val df = Seq((1, 1)).toDF("c1", "c2") -df.registerTempTable("table1") +df.createOrReplaceTempView("table1") val query = sql("SELECT c1, v FROM table1 LATERAL VIEW stack(3, 1, c1 + 1, c1 + 2) d AS v") checkAnswer(query, Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: Nil) } test("SPARK-13651: generator outputs shouldn't be resolved from its child's output") { withTempTable("src") { - Seq(("id1", "value1")).toDF("key", "value").registerTempTable("src") + Seq(("id1", "value1")).toDF("key", "value").createOrReplaceTempView("src") val query = sql("SELECT genoutput.* FROM src " + "LATERAL VIEW explode(map('key1', 100, 'key2', 200)) genoutput AS key, value") @@ -135,8 +135,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { Order(1, "Atlas", "MTB", 434, "2015-01-07", "John D", "Pacifica", "CA", 20151), Order(11, "Swift", "YFlikr", 137, "2015-01-23", "John D", "Hayward", "CA", 20151)) -orders.toDF.registerTempTable("orders1") -orderUpdates.toDF.registerTempTable("orderupdates1") +orders.toDF.createOrReplaceTempView("orders1") +orderUpdates.toDF.createOrReplaceTempView("orderupdates1") sql( """CREATE TABLE orders( @@ -305,7 +305,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-5371: union with null and sum") { val df = Seq((1, 1)).toDF("c1", "c2") -df.registerTempTable("table1") +df.createOrReplaceTempView("table1") val query = sql( """ @@ -329,7 +329,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("CTAS with WITH clause") { val df = Seq((1, 1)).toDF("c1", "c2") -df.registerTempTable("table1") +df.createOrReplaceTempView("table1") sql( """ @@ -346,7 +346,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("explode nested Field") { -Seq(NestedArray1(NestedArray2(Seq(1, 2, 3.toDF.registerTempTable("nestedArray") +Seq(NestedArray1(NestedArray2(Seq(1, 2, 3.toDF.createOrReplaceTempView("nestedArray") checkAnswer( sql("SELECT ints FROM nestedArray LATERAL VIEW explode(a.b) a AS ints"), Row(1) :: Row(2) :: Row(3) :: Nil) @@ -543,7 +543,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("specifying the column list for CTAS") { -Seq((1, "11"), (2, "22")).toDF("key", "value").registerTempTable("mytable1") +Seq((1, "11"), (2, "22")).toDF("key", "value").createOrReplaceTempView("mytable1") sql("create table gen__tmp(a int, b string) as select key, value from mytable1") checkAnswer( @@ -598,7 +598,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("double nested data") { sparkContext.parallelize(Nested1(Nested2(Nested3(1))) :: Nil) - .toDF().registerTempTable("nested") + .toDF().createOrReplaceTempView("nested") checkAnswer( sql("SELECT f1.f2.f3 
FROM nested"), Row(1)) @@ -682,7 +682,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-4963 DataFrame sample on mutable row return wrong result") { sql("SELECT * FROM src WHERE key % 2 = 0") .sample(withReplacement = false, fraction = 0.3) - .registerTempTable("sampled") + .createOrReplaceTempView("sampled") (1 to 10).foreach { i => checkAnswer( sql("SELECT * FROM sampled WHERE key % 2 = 1"), @@ -707,7 +707,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { val rowRdd = sparkContext.parallelize(row :: Nil) -hiveContext.createDataFrame(rowRdd, schema).registerTempTable("testTable") +hiveContext.createDataFrame(rowRdd, schema).createOrReplaceTempView("testTable") sql( """CREATE TABLE nullValuesInInnerComplexTypes @@ -733,14 +733,14 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-4296 Grouping field with Hive
spark git commit: [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views
Repository: spark Updated Branches: refs/heads/branch-2.0 7b62b7c11 -> 2dddec40d [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views ## What changes were proposed in this pull request? This is a follow-up of #12781. It adds native `SHOW CREATE TABLE` support for Hive tables and views. A new field `hasUnsupportedFeatures` is added to `CatalogTable` to indicate whether all table metadata retrieved from the concrete underlying external catalog (i.e. Hive metastore in this case) can be mapped to fields in `CatalogTable`. This flag is useful when the target Hive table contains structures that can't be handled by Spark SQL, e.g., skewed columns and storage handler, etc.. ## How was this patch tested? New test cases are added in `ShowCreateTableSuite` to do round-trip tests. Author: Cheng LianCloses #13079 from liancheng/spark-14346-show-create-table-for-hive-tables. (cherry picked from commit b674e67c22bf663334e537e35787c00533adbb04) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2dddec40 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2dddec40 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2dddec40 Branch: refs/heads/branch-2.0 Commit: 2dddec40d6562d1d16bb242bf7dc730431ee1e3e Parents: 7b62b7c Author: Cheng Lian Authored: Tue May 17 15:56:44 2016 -0700 Committer: Yin Huai Committed: Tue May 17 15:56:57 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 12 +- .../spark/sql/execution/command/tables.scala| 184 +- .../spark/sql/hive/client/HiveClientImpl.scala | 10 +- .../spark/sql/hive/ShowCreateTableSuite.scala | 185 ++- 4 files changed, 333 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2dddec40/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d215655..d4f5cbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -79,6 +79,12 @@ case class CatalogTablePartition( * * Note that Hive's metastore also tracks skewed columns. We should consider adding that in the * future once we have a better understanding of how we want to handle skewed columns. + * + * @param hasUnsupportedFeatures is used to indicate whether all table metadata entries retrieved + *from the concrete underlying external catalog (e.g. Hive metastore) are supported by + *Spark SQL. For example, if the underlying Hive table has skewed columns, this information + *can't be mapped to [[CatalogTable]] since Spark SQL doesn't handle skewed columns for now. + *In this case `hasUnsupportedFeatures` is set to true. By default, it is false. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -95,7 +101,8 @@ case class CatalogTable( properties: Map[String, String] = Map.empty, viewOriginalText: Option[String] = None, viewText: Option[String] = None, -comment: Option[String] = None) { +comment: Option[String] = None, +hasUnsupportedFeatures: Boolean = false) { // Verify that the provided columns are part of the schema private val colNames = schema.map(_.name).toSet @@ -200,6 +207,7 @@ case class SimpleCatalogRelation( } } - require(metadata.identifier.database == Some(databaseName), + require( +metadata.identifier.database.contains(databaseName), "provided database does not match the one specified in the table definition") } http://git-wip-us.apache.org/repos/asf/spark/blob/2dddec40/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bb4f1ff..1fc02d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -626,40 +626,149 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { showCreateDataSourceTable(tableMetadata) } else { - throw new UnsupportedOperationException( -
spark git commit: [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views
Repository: spark Updated Branches: refs/heads/master 8e8bc9f95 -> b674e67c2 [SPARK-14346][SQL] Native SHOW CREATE TABLE for Hive tables/views ## What changes were proposed in this pull request? This is a follow-up of #12781. It adds native `SHOW CREATE TABLE` support for Hive tables and views. A new field `hasUnsupportedFeatures` is added to `CatalogTable` to indicate whether all table metadata retrieved from the concrete underlying external catalog (i.e. Hive metastore in this case) can be mapped to fields in `CatalogTable`. This flag is useful when the target Hive table contains structures that can't be handled by Spark SQL, e.g., skewed columns and storage handler, etc.. ## How was this patch tested? New test cases are added in `ShowCreateTableSuite` to do round-trip tests. Author: Cheng LianCloses #13079 from liancheng/spark-14346-show-create-table-for-hive-tables. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b674e67c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b674e67c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b674e67c Branch: refs/heads/master Commit: b674e67c22bf663334e537e35787c00533adbb04 Parents: 8e8bc9f Author: Cheng Lian Authored: Tue May 17 15:56:44 2016 -0700 Committer: Yin Huai Committed: Tue May 17 15:56:44 2016 -0700 -- .../spark/sql/catalyst/catalog/interface.scala | 12 +- .../spark/sql/execution/command/tables.scala| 184 +- .../spark/sql/hive/client/HiveClientImpl.scala | 10 +- .../spark/sql/hive/ShowCreateTableSuite.scala | 185 ++- 4 files changed, 333 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b674e67c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index d215655..d4f5cbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -79,6 +79,12 @@ case class CatalogTablePartition( * * Note that Hive's metastore also tracks skewed columns. We should consider adding that in the * future once we have a better understanding of how we want to handle skewed columns. + * + * @param hasUnsupportedFeatures is used to indicate whether all table metadata entries retrieved + *from the concrete underlying external catalog (e.g. Hive metastore) are supported by + *Spark SQL. For example, if the underlying Hive table has skewed columns, this information + *can't be mapped to [[CatalogTable]] since Spark SQL doesn't handle skewed columns for now. + *In this case `hasUnsupportedFeatures` is set to true. By default, it is false. 
*/ case class CatalogTable( identifier: TableIdentifier, @@ -95,7 +101,8 @@ case class CatalogTable( properties: Map[String, String] = Map.empty, viewOriginalText: Option[String] = None, viewText: Option[String] = None, -comment: Option[String] = None) { +comment: Option[String] = None, +hasUnsupportedFeatures: Boolean = false) { // Verify that the provided columns are part of the schema private val colNames = schema.map(_.name).toSet @@ -200,6 +207,7 @@ case class SimpleCatalogRelation( } } - require(metadata.identifier.database == Some(databaseName), + require( +metadata.identifier.database.contains(databaseName), "provided database does not match the one specified in the table definition") } http://git-wip-us.apache.org/repos/asf/spark/blob/b674e67c/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bb4f1ff..1fc02d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -626,40 +626,149 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { showCreateDataSourceTable(tableMetadata) } else { - throw new UnsupportedOperationException( -"SHOW CREATE TABLE only supports Spark SQL data source tables.") + showCreateHiveTable(tableMetadata) }
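The `hasUnsupportedFeatures` flag described above lets a command fail with a clear message instead of emitting DDL that Spark SQL could not round-trip. A simplified sketch of how a consumer of such a flag might branch on it (the names are invented for illustration; this is not the actual `ShowCreateTableCommand` code):

```scala
// Simplified model: a catalog entry records whether all of its Hive metadata
// could be mapped into Spark SQL's table representation.
case class TableSummary(
    name: String,
    createStatement: String,
    hasUnsupportedFeatures: Boolean)

object ShowCreateTableSketch {
  def showCreateTable(table: TableSummary): String = {
    if (table.hasUnsupportedFeatures) {
      // e.g. skewed columns or a storage handler that the catalog model cannot express
      throw new UnsupportedOperationException(
        s"Failed to build SHOW CREATE TABLE output for ${table.name}: " +
          "the table uses features that cannot be represented")
    } else {
      table.createStatement
    }
  }

  def main(args: Array[String]): Unit = {
    val ok = TableSummary("t1", "CREATE TABLE t1 (id INT)", hasUnsupportedFeatures = false)
    println(showCreateTable(ok))
  }
}
```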
spark git commit: [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped
Repository: spark Updated Branches: refs/heads/branch-2.0 c0bb77132 -> 7b62b7c11 [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped ## What changes were proposed in this pull request? Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped. ## How was this patch tested? Jenkins unit tests. Author: Shixiong ZhuCloses #13154 from zsxwing/check-spark-context-stop. (cherry picked from commit 8e8bc9f957de6c0aefbc6ef4b18c421b486477a6) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7b62b7c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7b62b7c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7b62b7c1 Branch: refs/heads/branch-2.0 Commit: 7b62b7c1180dc3414b7d7e07561d0e6b89ff2b37 Parents: c0bb771 Author: Shixiong Zhu Authored: Tue May 17 14:57:21 2016 -0700 Committer: Yin Huai Committed: Tue May 17 14:57:31 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 ++ sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7b62b7c1/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e391599..e6cdd0d 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -94,7 +94,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) - private def assertNotStopped(): Unit = { + private[spark] def assertNotStopped(): Unit = { if (stopped.get()) { val activeContext = SparkContext.activeContext.get() val activeCreationSite = http://git-wip-us.apache.org/repos/asf/spark/blob/7b62b7c1/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index c64e284..4451188 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -67,6 +67,8 @@ class SQLContext private[sql]( self => + sparkSession.sparkContext.assertNotStopped() + // Note: Since Spark 2.0 this class has become a wrapper of SparkSession, where the // real functionality resides. This class remains mainly for backward compatibility. http://git-wip-us.apache.org/repos/asf/spark/blob/7b62b7c1/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 100b43f..aa974f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -68,6 +68,7 @@ class SparkSession private( this(sc, None) } + sparkContext.assertNotStopped() /* --- * | Session-related state | - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped
Repository: spark Updated Branches: refs/heads/master 0f576a574 -> 8e8bc9f95 [SPARK-11735][CORE][SQL] Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped ## What changes were proposed in this pull request? Add a check in the constructor of SQLContext/SparkSession to make sure its SparkContext is not stopped. ## How was this patch tested? Jenkins unit tests. Author: Shixiong ZhuCloses #13154 from zsxwing/check-spark-context-stop. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e8bc9f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e8bc9f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e8bc9f9 Branch: refs/heads/master Commit: 8e8bc9f957de6c0aefbc6ef4b18c421b486477a6 Parents: 0f576a5 Author: Shixiong Zhu Authored: Tue May 17 14:57:21 2016 -0700 Committer: Yin Huai Committed: Tue May 17 14:57:21 2016 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 ++ sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e8bc9f9/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e391599..e6cdd0d 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -94,7 +94,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) - private def assertNotStopped(): Unit = { + private[spark] def assertNotStopped(): Unit = { if (stopped.get()) { val activeContext = SparkContext.activeContext.get() val activeCreationSite = http://git-wip-us.apache.org/repos/asf/spark/blob/8e8bc9f9/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index c64e284..4451188 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -67,6 +67,8 @@ class SQLContext private[sql]( self => + sparkSession.sparkContext.assertNotStopped() + // Note: Since Spark 2.0 this class has become a wrapper of SparkSession, where the // real functionality resides. This class remains mainly for backward compatibility. http://git-wip-us.apache.org/repos/asf/spark/blob/8e8bc9f9/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 3016437..da575c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -68,6 +68,7 @@ class SparkSession private( this(sc, None) } + sparkContext.assertNotStopped() /* --- * | Session-related state | - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
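The pattern added above is a plain fail-fast guard: the session constructor checks the context's stopped flag so that building a session over a stopped context fails immediately rather than on the first query. A self-contained sketch of the idea with simplified stand-in classes (not the real Spark types):

```scala
import java.util.concurrent.atomic.AtomicBoolean

// Simplified stand-ins for SparkContext and SQLContext/SparkSession.
class MiniContext {
  private val stopped = new AtomicBoolean(false)
  def stop(): Unit = stopped.set(true)
  def assertNotStopped(): Unit =
    if (stopped.get()) throw new IllegalStateException("Cannot use a stopped context")
}

class MiniSession(val context: MiniContext) {
  // Constructor-time check: creating a session over a stopped context fails here,
  // instead of failing later with a confusing error on the first query.
  context.assertNotStopped()
}

object FailFastSketch extends App {
  val ctx = new MiniContext
  ctx.stop()
  try new MiniSession(ctx)
  catch { case e: IllegalStateException => println(s"Rejected as expected: ${e.getMessage}") }
}
```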
spark git commit: [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent.
Repository: spark Updated Branches: refs/heads/master e2efe0529 -> 0f576a574 [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent. ## What changes were proposed in this pull request? **createDataFrame** returns inconsistent types for column names. ```python >>> from pyspark.sql.types import StructType, StructField, StringType >>> schema = StructType([StructField(u"col", StringType())]) >>> df1 = spark.createDataFrame([("a",)], schema) >>> df1.columns # "col" is str ['col'] >>> df2 = spark.createDataFrame([("a",)], [u"col"]) >>> df2.columns # "col" is unicode [u'col'] ``` The reason is only **StructField** has the following code. ``` if not isinstance(name, str): name = name.encode('utf-8') ``` This PR adds the same logic into **createDataFrame** for consistency. ``` if isinstance(schema, list): schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] ``` ## How was this patch tested? Pass the Jenkins test (with new python doctest) Author: Dongjoon HyunCloses #13097 from dongjoon-hyun/SPARK-15244. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f576a57 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f576a57 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f576a57 Branch: refs/heads/master Commit: 0f576a5748244f7e874b925f8d841f1ca238f087 Parents: e2efe05 Author: Dongjoon Hyun Authored: Tue May 17 13:05:07 2016 -0700 Committer: Davies Liu Committed: Tue May 17 13:05:07 2016 -0700 -- python/pyspark/sql/session.py | 2 ++ python/pyspark/sql/tests.py | 7 +++ 2 files changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f576a57/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index ae31435..0781b44 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -465,6 +465,8 @@ class SparkSession(object): return (obj, ) schema = StructType().add("value", datatype) else: +if isinstance(schema, list): +schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] prepare = lambda obj: obj if isinstance(data, RDD): http://git-wip-us.apache.org/repos/asf/spark/blob/0f576a57/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0c73f58..0977c43 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -228,6 +228,13 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) +def test_column_name_encoding(self): +"""Ensure that created columns has `str` type consistently.""" +columns = self.spark.createDataFrame([('Alice', 1)], ['name', u'age']).columns +self.assertEqual(columns, ['name', 'age']) +self.assertTrue(isinstance(columns[0], str)) +self.assertTrue(isinstance(columns[1], str)) + def test_explode(self): from pyspark.sql.functions import explode d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent.
Repository: spark Updated Branches: refs/heads/branch-2.0 ff1cfce18 -> c0bb77132 [SPARK-15244] [PYTHON] Type of column name created with createDataFrame is not consistent. ## What changes were proposed in this pull request? **createDataFrame** returns inconsistent types for column names. ```python >>> from pyspark.sql.types import StructType, StructField, StringType >>> schema = StructType([StructField(u"col", StringType())]) >>> df1 = spark.createDataFrame([("a",)], schema) >>> df1.columns # "col" is str ['col'] >>> df2 = spark.createDataFrame([("a",)], [u"col"]) >>> df2.columns # "col" is unicode [u'col'] ``` The reason is only **StructField** has the following code. ``` if not isinstance(name, str): name = name.encode('utf-8') ``` This PR adds the same logic into **createDataFrame** for consistency. ``` if isinstance(schema, list): schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] ``` ## How was this patch tested? Pass the Jenkins test (with new python doctest) Author: Dongjoon HyunCloses #13097 from dongjoon-hyun/SPARK-15244. (cherry picked from commit 0f576a5748244f7e874b925f8d841f1ca238f087) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0bb7713 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0bb7713 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0bb7713 Branch: refs/heads/branch-2.0 Commit: c0bb77132b9acac951074fd623892abafeb02512 Parents: ff1cfce Author: Dongjoon Hyun Authored: Tue May 17 13:05:07 2016 -0700 Committer: Davies Liu Committed: Tue May 17 13:05:17 2016 -0700 -- python/pyspark/sql/session.py | 2 ++ python/pyspark/sql/tests.py | 7 +++ 2 files changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0bb7713/python/pyspark/sql/session.py -- diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index ae31435..0781b44 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -465,6 +465,8 @@ class SparkSession(object): return (obj, ) schema = StructType().add("value", datatype) else: +if isinstance(schema, list): +schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] prepare = lambda obj: obj if isinstance(data, RDD): http://git-wip-us.apache.org/repos/asf/spark/blob/c0bb7713/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0c73f58..0977c43 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -228,6 +228,13 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) +def test_column_name_encoding(self): +"""Ensure that created columns has `str` type consistently.""" +columns = self.spark.createDataFrame([('Alice', 1)], ['name', u'age']).columns +self.assertEqual(columns, ['name', 'age']) +self.assertTrue(isinstance(columns[0], str)) +self.assertTrue(isinstance(columns[1], str)) + def test_explode(self): from pyspark.sql.functions import explode d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 626e97e..9d084b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -21,11 +21,14 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import org.apache.spark.mllib.linalg.VectorImplicits._ +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} @@ -93,7 +96,9 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM @Since("2.0.0") override def fit(dataset: Dataset[_]): StandardScalerModel = { transformSchema(dataset.schema, logging = true) -val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } +val input: RDD[OldVector] = dataset.select($(inputCol)).rdd.map { + case Row(v: Vector) => OldVectors.fromML(v) +} val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this)) @@ -145,7 +150,11 @@ class StandardScalerModel private[ml] ( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val scaler = new feature.StandardScalerModel(std, mean, $(withStd), $(withMean)) -val scale = udf { scaler.transform _ } + +// TODO: Make the transformer natively in ml framework to avoid extra conversion. 
+val transformer: Vector => Vector = v => scaler.transform(OldVectors.fromML(v)).asML + +val scale = udf(transformer) dataset.withColumn($(outputCol), scale(col($(inputCol } http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 4d3e46e..1bc2420 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -23,10 +23,10 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute, UnresolvedAttribute} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 68b699d..2bc9d22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -27,10 +27,10 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StructField, StructType}
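The StandardScaler hunk above shows the recurring pattern in this migration: the ml-side estimator now receives the new `org.apache.spark.ml.linalg.Vector`, converts it to the old mllib type with `OldVectors.fromML` so it can reuse the existing mllib implementation, and converts the result back with `.asML`. Below is a minimal standalone sketch of that round trip, assuming only Spark 2.0+ on the classpath; the object name and values are illustrative and not part of the patch.

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}

object VectorBridgeSketch {
  def main(args: Array[String]): Unit = {
    val newVec: Vector = Vectors.dense(1.0, 2.0, 3.0)

    // New ml.linalg.Vector -> old mllib.linalg.Vector, e.g. before calling an mllib estimator.
    val oldVec: OldVector = OldVectors.fromML(newVec)

    // Old mllib.linalg.Vector -> new ml.linalg.Vector, e.g. when handing results back to ml code.
    val backToNew: Vector = oldVec.asML

    println(oldVec)     // [1.0,2.0,3.0]
    println(backToNew)  // [1.0,2.0,3.0]
  }
}
```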
[4/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
[SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms ## What changes were proposed in this pull request? Once SPARK-14487 and SPARK-14549 are merged, we will migrate to use the new vector and matrix type in the new ml pipeline based apis. ## How was this patch tested? Unit tests Author: DB TsaiAuthor: Liang-Chi Hsieh Author: Xiangrui Meng Closes #12627 from dbtsai/SPARK-14615-NewML. (cherry picked from commit e2efe0529acd748f26dbaa41331d1733ed256237) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff1cfce1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff1cfce1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff1cfce1 Branch: refs/heads/branch-2.0 Commit: ff1cfce18829ccb176f27d4fcc242cbb341a2277 Parents: 1ad3bbd Author: DB Tsai Authored: Tue May 17 12:51:07 2016 -0700 Committer: Xiangrui Meng Committed: Tue May 17 12:51:41 2016 -0700 -- dev/sparktestsupport/modules.py | 1 + .../examples/ml/JavaBisectingKMeansExample.java | 2 +- .../examples/ml/JavaDeveloperApiExample.java| 8 +- .../ml/JavaElementwiseProductExample.java | 6 +- .../spark/examples/ml/JavaKMeansExample.java| 2 +- .../ml/AFTSurvivalRegressionExample.scala | 2 +- .../examples/ml/ChiSqSelectorExample.scala | 2 +- .../apache/spark/examples/ml/DCTExample.scala | 2 +- .../spark/examples/ml/DataFrameExample.scala| 3 +- .../spark/examples/ml/DecisionTreeExample.scala | 2 +- .../spark/examples/ml/DeveloperApiExample.scala | 4 +- .../examples/ml/ElementwiseProductExample.scala | 2 +- .../ml/EstimatorTransformerParamExample.scala | 2 +- ...odelSelectionViaCrossValidationExample.scala | 2 +- .../apache/spark/examples/ml/PCAExample.scala | 2 +- .../spark/examples/ml/PipelineExample.scala | 2 +- .../ml/PolynomialExpansionExample.scala | 2 +- .../spark/examples/ml/SimpleParamsExample.scala | 4 +- .../ml/SimpleTextClassificationPipeline.scala | 2 +- .../examples/ml/VectorAssemblerExample.scala| 2 +- .../spark/examples/ml/VectorSlicerExample.scala | 2 +- .../scala/org/apache/spark/ml/Predictor.scala | 4 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 22 ++- .../spark/ml/attribute/AttributeGroup.scala | 2 +- .../spark/ml/classification/Classifier.scala| 4 +- .../classification/DecisionTreeClassifier.scala | 4 +- .../spark/ml/classification/GBTClassifier.scala | 4 +- .../ml/classification/LogisticRegression.scala | 5 +- .../MultilayerPerceptronClassifier.scala| 4 +- .../spark/ml/classification/NaiveBayes.scala| 9 +- .../spark/ml/classification/OneVsRest.scala | 2 +- .../ProbabilisticClassifier.scala | 2 +- .../classification/RandomForestClassifier.scala | 4 +- .../spark/ml/clustering/BisectingKMeans.scala | 16 +- .../spark/ml/clustering/GaussianMixture.scala | 9 +- .../org/apache/spark/ml/clustering/KMeans.scala | 18 +- .../org/apache/spark/ml/clustering/LDA.scala| 18 +- .../BinaryClassificationEvaluator.scala | 2 +- .../org/apache/spark/ml/feature/Binarizer.scala | 2 +- .../apache/spark/ml/feature/ChiSqSelector.scala | 21 +- .../spark/ml/feature/CountVectorizer.scala | 2 +- .../scala/org/apache/spark/ml/feature/DCT.scala | 2 +- .../spark/ml/feature/ElementwiseProduct.scala | 6 +- .../org/apache/spark/ml/feature/HashingTF.scala | 3 +- .../scala/org/apache/spark/ml/feature/IDF.scala | 15 +- .../org/apache/spark/ml/feature/Instance.scala | 2 +- .../apache/spark/ml/feature/Interaction.scala | 2 +- .../apache/spark/ml/feature/LabeledPoint.scala | 38 .../apache/spark/ml/feature/MaxAbsScaler.scala | 8 
+- .../apache/spark/ml/feature/MinMaxScaler.scala | 9 +- .../apache/spark/ml/feature/Normalizer.scala| 5 +- .../apache/spark/ml/feature/OneHotEncoder.scala | 2 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 21 +- .../spark/ml/feature/PolynomialExpansion.scala | 2 +- .../org/apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/RFormulaParser.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 15 +- .../spark/ml/feature/VectorAssembler.scala | 2 +- .../apache/spark/ml/feature/VectorIndexer.scala | 2 +- .../apache/spark/ml/feature/VectorSlicer.scala | 2 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 3 +- .../org/apache/spark/ml/linalg/VectorUDT.scala | 2 +- .../IterativelyReweightedLeastSquares.scala | 2 +- .../spark/ml/optim/WeightedLeastSquares.scala | 3 +- .../org/apache/spark/ml/param/params.scala | 7 +-
[2/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/ff1cfce1/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 69650eb..a1b4853 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,18 +17,19 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ import scala.language.existentials import scala.util.Random +import scala.util.control.Breaks._ import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.classification.LogisticRegressionSuite._ +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.classification.LogisticRegressionSuite._ -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.lit @@ -967,4 +968,122 @@ object LogisticRegressionSuite { "standardization" -> false, "threshold" -> 0.6 ) + + def generateLogisticInputAsList( +offset: Double, +scale: Double, +nPoints: Int, +seed: Int): java.util.List[LabeledPoint] = { +generateLogisticInput(offset, scale, nPoints, seed).asJava + } + + // Generate input of the form Y = logistic(offset + scale*X) + def generateLogisticInput( + offset: Double, + scale: Double, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) +val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) + +val y = (0 until nPoints).map { i => + val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i + if (rnd.nextDouble() < p) 1.0 else 0.0 +} + +val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i) +testData + } + + /** + * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the + * model weights and mean/variance of the features. The synthetic data will be drawn from + * the probability distribution constructed by weights using the following formula. + * + * P(y = 0 | x) = 1 / norm + * P(y = 1 | x) = exp(x * w_1) / norm + * P(y = 2 | x) = exp(x * w_2) / norm + * ... + * P(y = k-1 | x) = exp(x * w_{k-1}) / norm + * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1}) + * + * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector + *will be (k - 1) * (n + 1) if `addIntercept == true`, and + *if `addIntercept != true`, the dimension will be (k - 1) * n. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param addIntercept whether to add intercept. + * @param nPoints the number of instance of generated data. + * @param seed the seed for random generator. 
For consistent testing result, it will be fixed. + */ + def generateMultinomialLogisticInput( + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + addIntercept: Boolean, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) + +val xDim = xMean.length +val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim +val nClasses = weights.length / xWithInterceptsDim + 1 + +val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian( + +x.foreach { vector => + // This doesn't work if `vector` is a sparse vector. + val vectorArray = vector.toArray + var i = 0 + val len = vectorArray.length + while (i < len) { +vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) +i += 1 + } +} + +val y = (0 until nPoints).map { idx => + val xArray = x(idx).toArray + val margins = Array.ofDim[Double](nClasses) + val probs = Array.ofDim[Double](nClasses) + + for (i <- 0 until nClasses - 1) { +for (j <- 0 until xDim) margins(i +
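The scaladoc added above defines the synthetic class probabilities as P(y = 0 | x) = 1 / norm and P(y = i | x) = exp(x * w_i) / norm, with norm = 1 + exp(x * w_1) + ... + exp(x * w_{k-1}). A small self-contained sketch of just that formula follows; it is independent of the test suite, and the object name, weights, and feature values are made up for illustration.

```scala
object MultinomialProbSketch {
  // weights(i) holds the coefficients of class i + 1; class 0 is the reference class.
  def classProbabilities(x: Array[Double], weights: Array[Array[Double]]): Array[Double] = {
    val margins = weights.map(w => w.zip(x).map { case (wi, xi) => wi * xi }.sum)
    val norm = 1.0 + margins.map(math.exp).sum
    // P(y = 0 | x) first, then P(y = i | x) for each remaining class.
    (1.0 / norm) +: margins.map(m => math.exp(m) / norm)
  }

  def main(args: Array[String]): Unit = {
    val probs = classProbabilities(Array(0.5, -1.0), Array(Array(1.0, 2.0), Array(-0.5, 0.3)))
    println(probs.mkString(", "))  // three class probabilities
    println(probs.sum)             // ~= 1.0
  }
}
```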
[3/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 626e97e..9d084b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -21,11 +21,14 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml._ +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} +import org.apache.spark.mllib.linalg.VectorImplicits._ +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} @@ -93,7 +96,9 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM @Since("2.0.0") override def fit(dataset: Dataset[_]): StandardScalerModel = { transformSchema(dataset.schema, logging = true) -val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } +val input: RDD[OldVector] = dataset.select($(inputCol)).rdd.map { + case Row(v: Vector) => OldVectors.fromML(v) +} val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) copyValues(new StandardScalerModel(uid, scalerModel.std, scalerModel.mean).setParent(this)) @@ -145,7 +150,11 @@ class StandardScalerModel private[ml] ( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val scaler = new feature.StandardScalerModel(std, mean, $(withStd), $(withMean)) -val scale = udf { scaler.transform _ } + +// TODO: Make the transformer natively in ml framework to avoid extra conversion. 
+val transformer: Vector => Vector = v => scaler.transform(OldVectors.fromML(v)).asML + +val scale = udf(transformer) dataset.withColumn($(outputCol), scale(col($(inputCol } http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 4d3e46e..1bc2420 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -23,10 +23,10 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute, UnresolvedAttribute} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 68b699d..2bc9d22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -27,10 +27,10 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StructField, StructType}
[1/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
Repository: spark Updated Branches: refs/heads/master 9f176dd39 -> e2efe0529 http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala index 49cb7e1..441d0f7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala @@ -73,7 +73,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { maxBins = 100, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3)) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -100,7 +100,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { maxDepth = 2, maxBins = 100, categoricalFeaturesInfo = Map(0 -> 2, 1 -> 2)) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -116,7 +116,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Gini, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -133,7 +133,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Gini, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -150,7 +150,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Entropy, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -167,7 +167,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(Classification, Entropy, maxDepth = 3, numClasses = 2, maxBins = 100) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(!metadata.isUnordered(featureIndex = 0)) assert(!metadata.isUnordered(featureIndex = 1)) @@ -183,7 +183,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val rdd = sc.parallelize(arr) val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4, numClasses = 3, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3)) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val 
metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(strategy.isMulticlassClassification) assert(metadata.isUnordered(featureIndex = 0)) assert(metadata.isUnordered(featureIndex = 1)) @@ -240,7 +240,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { numClasses = 3, maxBins = maxBins, categoricalFeaturesInfo = Map(0 -> 3, 1 -> 3)) assert(strategy.isMulticlassClassification) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy) +val metadata = DecisionTreeMetadata.buildMetadata(rdd.map(_.asML), strategy) assert(metadata.isUnordered(featureIndex = 0)) assert(metadata.isUnordered(featureIndex = 1)) @@ -288,7 +288,7 @@ class DecisionTreeSuite extends SparkFunSuite with MLlibTestSparkContext { val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4, numClasses = 3, maxBins = 100, categoricalFeaturesInfo = Map(0 -> 3)) assert(strategy.isMulticlassClassification) -val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy)
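In the hunks above, the suite feeds the ml-side `DecisionTreeMetadata` by mapping the old `mllib.regression.LabeledPoint` RDD through `asML`. A hedged sketch of the same bridge is below; it rebuilds each point from its public pieces (the label plus `features.asML`) rather than relying on a `LabeledPoint.asML` helper, since that helper may not be visible outside Spark's own packages. The session settings and data are illustrative.

```scala
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object LabeledPointBridgeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("lp-bridge").getOrCreate()
    val sc = spark.sparkContext

    val oldPoints: RDD[LabeledPoint] = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
      LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))

    // Same idea as the suite's rdd.map(_.asML): rebuild each point with the new ml vector type.
    val newPoints: RDD[NewLabeledPoint] =
      oldPoints.map(lp => NewLabeledPoint(lp.label, lp.features.asML))

    newPoints.collect().foreach(println)
    spark.stop()
  }
}
```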
[4/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
[SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms ## What changes were proposed in this pull request? Once SPARK-14487 and SPARK-14549 are merged, we will migrate to use the new vector and matrix type in the new ml pipeline based apis. ## How was this patch tested? Unit tests Author: DB TsaiAuthor: Liang-Chi Hsieh Author: Xiangrui Meng Closes #12627 from dbtsai/SPARK-14615-NewML. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2efe052 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2efe052 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2efe052 Branch: refs/heads/master Commit: e2efe0529acd748f26dbaa41331d1733ed256237 Parents: 9f176dd Author: DB Tsai Authored: Tue May 17 12:51:07 2016 -0700 Committer: Xiangrui Meng Committed: Tue May 17 12:51:07 2016 -0700 -- dev/sparktestsupport/modules.py | 1 + .../examples/ml/JavaBisectingKMeansExample.java | 2 +- .../examples/ml/JavaDeveloperApiExample.java| 8 +- .../ml/JavaElementwiseProductExample.java | 6 +- .../spark/examples/ml/JavaKMeansExample.java| 2 +- .../ml/AFTSurvivalRegressionExample.scala | 2 +- .../examples/ml/ChiSqSelectorExample.scala | 2 +- .../apache/spark/examples/ml/DCTExample.scala | 2 +- .../spark/examples/ml/DataFrameExample.scala| 3 +- .../spark/examples/ml/DecisionTreeExample.scala | 2 +- .../spark/examples/ml/DeveloperApiExample.scala | 4 +- .../examples/ml/ElementwiseProductExample.scala | 2 +- .../ml/EstimatorTransformerParamExample.scala | 2 +- ...odelSelectionViaCrossValidationExample.scala | 2 +- .../apache/spark/examples/ml/PCAExample.scala | 2 +- .../spark/examples/ml/PipelineExample.scala | 2 +- .../ml/PolynomialExpansionExample.scala | 2 +- .../spark/examples/ml/SimpleParamsExample.scala | 4 +- .../ml/SimpleTextClassificationPipeline.scala | 2 +- .../examples/ml/VectorAssemblerExample.scala| 2 +- .../spark/examples/ml/VectorSlicerExample.scala | 2 +- .../scala/org/apache/spark/ml/Predictor.scala | 4 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 22 ++- .../spark/ml/attribute/AttributeGroup.scala | 2 +- .../spark/ml/classification/Classifier.scala| 4 +- .../classification/DecisionTreeClassifier.scala | 4 +- .../spark/ml/classification/GBTClassifier.scala | 4 +- .../ml/classification/LogisticRegression.scala | 5 +- .../MultilayerPerceptronClassifier.scala| 4 +- .../spark/ml/classification/NaiveBayes.scala| 9 +- .../spark/ml/classification/OneVsRest.scala | 2 +- .../ProbabilisticClassifier.scala | 2 +- .../classification/RandomForestClassifier.scala | 4 +- .../spark/ml/clustering/BisectingKMeans.scala | 16 +- .../spark/ml/clustering/GaussianMixture.scala | 9 +- .../org/apache/spark/ml/clustering/KMeans.scala | 18 +- .../org/apache/spark/ml/clustering/LDA.scala| 18 +- .../BinaryClassificationEvaluator.scala | 2 +- .../org/apache/spark/ml/feature/Binarizer.scala | 2 +- .../apache/spark/ml/feature/ChiSqSelector.scala | 21 +- .../spark/ml/feature/CountVectorizer.scala | 2 +- .../scala/org/apache/spark/ml/feature/DCT.scala | 2 +- .../spark/ml/feature/ElementwiseProduct.scala | 6 +- .../org/apache/spark/ml/feature/HashingTF.scala | 3 +- .../scala/org/apache/spark/ml/feature/IDF.scala | 15 +- .../org/apache/spark/ml/feature/Instance.scala | 2 +- .../apache/spark/ml/feature/Interaction.scala | 2 +- .../apache/spark/ml/feature/LabeledPoint.scala | 38 .../apache/spark/ml/feature/MaxAbsScaler.scala | 8 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 9 +- 
.../apache/spark/ml/feature/Normalizer.scala| 5 +- .../apache/spark/ml/feature/OneHotEncoder.scala | 2 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 21 +- .../spark/ml/feature/PolynomialExpansion.scala | 2 +- .../org/apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/RFormulaParser.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 15 +- .../spark/ml/feature/VectorAssembler.scala | 2 +- .../apache/spark/ml/feature/VectorIndexer.scala | 2 +- .../apache/spark/ml/feature/VectorSlicer.scala | 2 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 3 +- .../org/apache/spark/ml/linalg/VectorUDT.scala | 2 +- .../IterativelyReweightedLeastSquares.scala | 2 +- .../spark/ml/optim/WeightedLeastSquares.scala | 3 +- .../org/apache/spark/ml/param/params.scala | 7 +- .../ml/regression/AFTSurvivalRegression.scala | 3 +- .../ml/regression/DecisionTreeRegressor.scala | 4 +-
[2/4] spark git commit: [SPARK-14615][ML] Use the new ML Vector and Matrix in the ML pipeline based algorithms
http://git-wip-us.apache.org/repos/asf/spark/blob/e2efe052/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 69650eb..a1b4853 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,18 +17,19 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ import scala.language.existentials import scala.util.Random +import scala.util.control.Breaks._ import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.classification.LogisticRegressionSuite._ +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.classification.LogisticRegressionSuite._ -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.lit @@ -967,4 +968,122 @@ object LogisticRegressionSuite { "standardization" -> false, "threshold" -> 0.6 ) + + def generateLogisticInputAsList( +offset: Double, +scale: Double, +nPoints: Int, +seed: Int): java.util.List[LabeledPoint] = { +generateLogisticInput(offset, scale, nPoints, seed).asJava + } + + // Generate input of the form Y = logistic(offset + scale*X) + def generateLogisticInput( + offset: Double, + scale: Double, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) +val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) + +val y = (0 until nPoints).map { i => + val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i + if (rnd.nextDouble() < p) 1.0 else 0.0 +} + +val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i) +testData + } + + /** + * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the + * model weights and mean/variance of the features. The synthetic data will be drawn from + * the probability distribution constructed by weights using the following formula. + * + * P(y = 0 | x) = 1 / norm + * P(y = 1 | x) = exp(x * w_1) / norm + * P(y = 2 | x) = exp(x * w_2) / norm + * ... + * P(y = k-1 | x) = exp(x * w_{k-1}) / norm + * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1}) + * + * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector + *will be (k - 1) * (n + 1) if `addIntercept == true`, and + *if `addIntercept != true`, the dimension will be (k - 1) * n. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param addIntercept whether to add intercept. + * @param nPoints the number of instance of generated data. + * @param seed the seed for random generator. 
For consistent testing result, it will be fixed. + */ + def generateMultinomialLogisticInput( + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + addIntercept: Boolean, + nPoints: Int, + seed: Int): Seq[LabeledPoint] = { +val rnd = new Random(seed) + +val xDim = xMean.length +val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim +val nClasses = weights.length / xWithInterceptsDim + 1 + +val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian( + +x.foreach { vector => + // This doesn't work if `vector` is a sparse vector. + val vectorArray = vector.toArray + var i = 0 + val len = vectorArray.length + while (i < len) { +vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) +i += 1 + } +} + +val y = (0 until nPoints).map { idx => + val xArray = x(idx).toArray + val margins = Array.ofDim[Double](nClasses) + val probs = Array.ofDim[Double](nClasses) + + for (i <- 0 until nClasses - 1) { +for (j <- 0 until xDim) margins(i +
spark git commit: [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc.
Repository: spark Updated Branches: refs/heads/branch-2.0 025b3e9f1 -> 1ad3bbd0a [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc. ## What changes were proposed in this pull request? According to the recent change, this PR replaces all the remaining `sqlContext` usage with `spark` in ScalaDoc/JavaDoc (.scala/.java files) except `SQLContext.scala`, `SparkPlan.scala', and `DatasetHolder.scala`. ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #13125 from dongjoon-hyun/minor_doc_sparksession. (cherry picked from commit 9f176dd3918129a72282a6b7a12e2899cbb6dac9) Signed-off-by: Nick Pentreath Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ad3bbd0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ad3bbd0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ad3bbd0 Branch: refs/heads/branch-2.0 Commit: 1ad3bbd0a4c7d4ab4aad0806f345d3904d7cd441 Parents: 025b3e9 Author: Dongjoon Hyun Authored: Tue May 17 20:50:22 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 20:50:47 2016 +0200 -- .../main/scala/org/apache/spark/ml/feature/package.scala | 2 +- .../main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 +- .../scala/org/apache/spark/sql/ExperimentalMethods.scala | 2 +- .../datasources/PartitioningAwareFileCatalog.scala| 8 .../src/main/scala/org/apache/spark/sql/functions.scala | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1ad3bbd0/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala index 4571ab2..b94187a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.DataFrame * import org.apache.spark.ml.Pipeline * * // a DataFrame with three columns: id (integer), text (string), and rating (double). - * val df = sqlContext.createDataFrame(Seq( + * val df = spark.createDataFrame(Seq( * (0, "Hi I heard about Spark", 3.0), * (1, "I wish Java could use case classes", 4.0), * (2, "Logistic regression models are neat", 4.0) http://git-wip-us.apache.org/repos/asf/spark/blob/1ad3bbd0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e1a64df..011aff4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -446,10 +446,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Each line in the text file is a new row in the resulting Dataset. 
For example: * {{{ * // Scala: - * sqlContext.read.text("/path/to/spark/README.md") + * spark.read.text("/path/to/spark/README.md") * * // Java: - * sqlContext.read().text("/path/to/spark/README.md") + * spark.read().text("/path/to/spark/README.md") * }}} * * @param paths input path http://git-wip-us.apache.org/repos/asf/spark/blob/1ad3bbd0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 3eb1f0f..1855eab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -160,8 +160,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return A DataFrame containing for the contingency table. * * {{{ - *val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") + *val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3))) + * .toDF("key", "value") *val ct = df.stat.crosstab("key", "value") *ct.show() *+-+---+---+---+ @@ -197,7 +197,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { *
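For readers following along, the convention this doc patch standardizes on is that `spark` denotes a `SparkSession`, the 2.0 entry point that replaces `sqlContext` (and is predefined in spark-shell). A brief hedged sketch of creating such a session and using the updated example call is below; the master setting, app name, and file path are placeholders.

```scala
import org.apache.spark.sql.SparkSession

object SparkSessionDocSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("doc-example")
      .getOrCreate()

    // The form the updated ScalaDoc shows: spark.read instead of sqlContext.read.
    val lines = spark.read.text("README.md")  // point this at any text file
    println(lines.count())

    spark.stop()
  }
}
```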
spark git commit: [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc.
Repository: spark Updated Branches: refs/heads/master 3308a862b -> 9f176dd39 [MINOR][DOCS] Replace remaining 'sqlContext' in ScalaDoc/JavaDoc. ## What changes were proposed in this pull request? According to the recent change, this PR replaces all the remaining `sqlContext` usage with `spark` in ScalaDoc/JavaDoc (.scala/.java files) except `SQLContext.scala`, `SparkPlan.scala', and `DatasetHolder.scala`. ## How was this patch tested? Manual. Author: Dongjoon HyunCloses #13125 from dongjoon-hyun/minor_doc_sparksession. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f176dd3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f176dd3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f176dd3 Branch: refs/heads/master Commit: 9f176dd3918129a72282a6b7a12e2899cbb6dac9 Parents: 3308a86 Author: Dongjoon Hyun Authored: Tue May 17 20:50:22 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 20:50:22 2016 +0200 -- .../main/scala/org/apache/spark/ml/feature/package.scala | 2 +- .../main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 +- .../scala/org/apache/spark/sql/ExperimentalMethods.scala | 2 +- .../datasources/PartitioningAwareFileCatalog.scala| 8 .../src/main/scala/org/apache/spark/sql/functions.scala | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f176dd3/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala index 4571ab2..b94187a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.DataFrame * import org.apache.spark.ml.Pipeline * * // a DataFrame with three columns: id (integer), text (string), and rating (double). - * val df = sqlContext.createDataFrame(Seq( + * val df = spark.createDataFrame(Seq( * (0, "Hi I heard about Spark", 3.0), * (1, "I wish Java could use case classes", 4.0), * (2, "Logistic regression models are neat", 4.0) http://git-wip-us.apache.org/repos/asf/spark/blob/9f176dd3/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e1a64df..011aff4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -446,10 +446,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Each line in the text file is a new row in the resulting Dataset. 
For example: * {{{ * // Scala: - * sqlContext.read.text("/path/to/spark/README.md") + * spark.read.text("/path/to/spark/README.md") * * // Java: - * sqlContext.read().text("/path/to/spark/README.md") + * spark.read().text("/path/to/spark/README.md") * }}} * * @param paths input path http://git-wip-us.apache.org/repos/asf/spark/blob/9f176dd3/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 3eb1f0f..1855eab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -160,8 +160,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return A DataFrame containing for the contingency table. * * {{{ - *val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") + *val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3))) + * .toDF("key", "value") *val ct = df.stat.crosstab("key", "value") *ct.show() *+-+---+---+---+ @@ -197,7 +197,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { *val rows = Seq.tabulate(100) { i => * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) *} - *val df =
spark git commit: [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query
Repository: spark Updated Branches: refs/heads/branch-2.0 adc1c2685 -> af37bdd3a [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query ## What changes were proposed in this pull request? Currently, `INSERT INTO` with `GROUP BY` query tries to make at least 200 files (default value of `spark.sql.shuffle.partition`), which results in lots of empty files. This PR makes it avoid creating empty files during overwriting into Hive table and in internal data sources with group by query. This checks whether the given partition has data in it or not and creates/writes file only when it actually has data. ## How was this patch tested? Unittests in `InsertIntoHiveTableSuite` and `HadoopFsRelationTest`. Closes #8411 Author: hyukjinkwonAuthor: Keuntae Park Closes #12855 from HyukjinKwon/pr/8411. (cherry picked from commit 8d05a7a98bdbd3ce7c81d273e05a375877ebe68f) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af37bdd3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af37bdd3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af37bdd3 Branch: refs/heads/branch-2.0 Commit: af37bdd3a7cee5206f98b3a2ba9113e71b53a2f4 Parents: adc1c26 Author: hyukjinkwon Authored: Tue May 17 11:18:51 2016 -0700 Committer: Michael Armbrust Committed: Tue May 17 11:21:06 2016 -0700 -- .../execution/datasources/WriterContainer.scala | 221 ++- .../spark/sql/hive/hiveWriterContainers.scala | 24 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 41 +++- .../sql/sources/HadoopFsRelationTest.scala | 22 +- 4 files changed, 182 insertions(+), 126 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af37bdd3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 3b064a5..7e12bbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -239,48 +239,50 @@ private[sql] class DefaultWriterContainer( extends BaseWriterContainer(relation, job, isAppend) { def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { -executorSideSetup(taskContext) -val configuration = taskAttemptContext.getConfiguration -configuration.set("spark.sql.sources.output.path", outputPath) -var writer = newOutputWriter(getWorkPath) -writer.initConverter(dataSchema) - -// If anything below fails, we should abort the task. -try { - Utils.tryWithSafeFinallyAndFailureCallbacks { -while (iterator.hasNext) { - val internalRow = iterator.next() - writer.writeInternal(internalRow) -} -commitTask() - }(catchBlock = abortTask()) -} catch { - case t: Throwable => -throw new SparkException("Task failed while writing rows", t) -} +if (iterator.hasNext) { + executorSideSetup(taskContext) + val configuration = taskAttemptContext.getConfiguration + configuration.set("spark.sql.sources.output.path", outputPath) + var writer = newOutputWriter(getWorkPath) + writer.initConverter(dataSchema) -def commitTask(): Unit = { + // If anything below fails, we should abort the task. 
try { -if (writer != null) { - writer.close() - writer = null -} -super.commitTask() +Utils.tryWithSafeFinallyAndFailureCallbacks { + while (iterator.hasNext) { +val internalRow = iterator.next() +writer.writeInternal(internalRow) + } + commitTask() +}(catchBlock = abortTask()) } catch { -case cause: Throwable => - // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and - // will cause `abortTask()` to be invoked. - throw new RuntimeException("Failed to commit task", cause) +case t: Throwable => + throw new SparkException("Task failed while writing rows", t) } -} -def abortTask(): Unit = { - try { -if (writer != null) { - writer.close() + def commitTask(): Unit = { +try { + if (writer != null) { +writer.close() +writer = null + } + super.commitTask() +
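The core of the change above is the new `if (iterator.hasNext)` guard: a task only sets up an output writer when its partition actually has rows, so shuffle partitions left empty by a GROUP BY no longer produce empty files. A simplified, self-contained sketch of that pattern follows (not the WriterContainer code; the dataset and temp-file handling are illustrative).

```scala
import java.io.{File, PrintWriter}
import org.apache.spark.sql.SparkSession

object SkipEmptyPartitionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[4]").appName("skip-empty").getOrCreate()
    val sc = spark.sparkContext

    // Two records spread over eight partitions: most partitions are empty.
    val data = sc.parallelize(Seq("a", "b"), numSlices = 8)

    data.foreachPartition { iter =>
      if (iter.hasNext) {  // the key check: do nothing at all for an empty partition
        val out = File.createTempFile("skip-empty-demo", ".txt")
        val writer = new PrintWriter(out)
        try iter.foreach(line => writer.println(line)) finally writer.close()
      }
    }

    spark.stop()
  }
}
```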
spark git commit: [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query
Repository: spark Updated Branches: refs/heads/master 20a89478e -> 8d05a7a98 [SPARK-10216][SQL] Avoid creating empty files during overwriting with group by query ## What changes were proposed in this pull request? Currently, `INSERT INTO` with `GROUP BY` query tries to make at least 200 files (default value of `spark.sql.shuffle.partition`), which results in lots of empty files. This PR makes it avoid creating empty files during overwriting into Hive table and in internal data sources with group by query. This checks whether the given partition has data in it or not and creates/writes file only when it actually has data. ## How was this patch tested? Unittests in `InsertIntoHiveTableSuite` and `HadoopFsRelationTest`. Closes #8411 Author: hyukjinkwonAuthor: Keuntae Park Closes #12855 from HyukjinKwon/pr/8411. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d05a7a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d05a7a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d05a7a9 Branch: refs/heads/master Commit: 8d05a7a98bdbd3ce7c81d273e05a375877ebe68f Parents: 20a8947 Author: hyukjinkwon Authored: Tue May 17 11:18:51 2016 -0700 Committer: Michael Armbrust Committed: Tue May 17 11:18:51 2016 -0700 -- .../execution/datasources/WriterContainer.scala | 221 ++- .../spark/sql/hive/hiveWriterContainers.scala | 24 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 41 +++- .../sql/sources/HadoopFsRelationTest.scala | 22 +- 4 files changed, 182 insertions(+), 126 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8d05a7a9/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 3b064a5..7e12bbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -239,48 +239,50 @@ private[sql] class DefaultWriterContainer( extends BaseWriterContainer(relation, job, isAppend) { def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { -executorSideSetup(taskContext) -val configuration = taskAttemptContext.getConfiguration -configuration.set("spark.sql.sources.output.path", outputPath) -var writer = newOutputWriter(getWorkPath) -writer.initConverter(dataSchema) - -// If anything below fails, we should abort the task. -try { - Utils.tryWithSafeFinallyAndFailureCallbacks { -while (iterator.hasNext) { - val internalRow = iterator.next() - writer.writeInternal(internalRow) -} -commitTask() - }(catchBlock = abortTask()) -} catch { - case t: Throwable => -throw new SparkException("Task failed while writing rows", t) -} +if (iterator.hasNext) { + executorSideSetup(taskContext) + val configuration = taskAttemptContext.getConfiguration + configuration.set("spark.sql.sources.output.path", outputPath) + var writer = newOutputWriter(getWorkPath) + writer.initConverter(dataSchema) -def commitTask(): Unit = { + // If anything below fails, we should abort the task. 
try { -if (writer != null) { - writer.close() - writer = null -} -super.commitTask() +Utils.tryWithSafeFinallyAndFailureCallbacks { + while (iterator.hasNext) { +val internalRow = iterator.next() +writer.writeInternal(internalRow) + } + commitTask() +}(catchBlock = abortTask()) } catch { -case cause: Throwable => - // This exception will be handled in `InsertIntoHadoopFsRelation.insert$writeRows`, and - // will cause `abortTask()` to be invoked. - throw new RuntimeException("Failed to commit task", cause) +case t: Throwable => + throw new SparkException("Task failed while writing rows", t) } -} -def abortTask(): Unit = { - try { -if (writer != null) { - writer.close() + def commitTask(): Unit = { +try { + if (writer != null) { +writer.close() +writer = null + } + super.commitTask() +} catch { + case cause: Throwable => +// This exception will be handled in
spark git commit: [SPARK-14346][SQL][FOLLOW-UP] add tests for CREATE TABLE USING with partition and bucket
Repository: spark Updated Branches: refs/heads/branch-2.0 110876b9a -> adc1c2685 [SPARK-14346][SQL][FOLLOW-UP] add tests for CREAT TABLE USING with partition and bucket ## What changes were proposed in this pull request? https://github.com/apache/spark/pull/12781 introduced PARTITIONED BY, CLUSTERED BY, and SORTED BY keywords to CREATE TABLE USING. This PR adds tests to make sure those keywords are handled correctly. This PR also fixes a mistake that we should create non-hive-compatible table if partition or bucket info exists. ## How was this patch tested? N/A Author: Wenchen FanCloses #13144 from cloud-fan/add-test. (cherry picked from commit 20a89478e168cb6901ef89f4cb6aa79193ed244a) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/adc1c268 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/adc1c268 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/adc1c268 Branch: refs/heads/branch-2.0 Commit: adc1c2685ea0cfbf23716a4199b85c65021d15c6 Parents: 110876b Author: Wenchen Fan Authored: Tue May 17 10:12:51 2016 -0700 Committer: Yin Huai Committed: Tue May 17 10:14:00 2016 -0700 -- .../command/createDataSourceTables.scala| 11 +++- .../sql/execution/command/DDLCommandSuite.scala | 53 .../spark/sql/execution/command/DDLSuite.scala | 44 3 files changed, 106 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/adc1c268/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 7d3c5257..70e5108 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -399,8 +399,8 @@ object CreateDataSourceTableUtils extends Logging { "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." (None, message) - case (Some(serde), relation: HadoopFsRelation) -if relation.location.paths.length == 1 && relation.partitionSchema.isEmpty => + case (Some(serde), relation: HadoopFsRelation) if relation.location.paths.length == 1 && +relation.partitionSchema.isEmpty && relation.bucketSpec.isEmpty => val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) val message = s"Persisting data source relation $qualifiedTableName with a single input path " + @@ -415,6 +415,13 @@ object CreateDataSourceTableUtils extends Logging { "Input path(s): " + relation.location.paths.mkString("\n", "\n", "") (None, message) + case (Some(serde), relation: HadoopFsRelation) if relation.bucketSpec.nonEmpty => +val message = + s"Persisting bucketed data source relation $qualifiedTableName into " + +"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. 
" + +"Input path(s): " + relation.location.paths.mkString("\n", "\n", "") +(None, message) + case (Some(serde), relation: HadoopFsRelation) => val message = s"Persisting data source relation $qualifiedTableName with multiple input paths into " + http://git-wip-us.apache.org/repos/asf/spark/blob/adc1c268/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 13df449..897170e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -24,7 +24,9 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser +import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{IntegerType, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) class DDLCommandSuite extends PlanTest { @@
spark git commit: [SPARK-15165] [SQL] Codegen can break because toCommentSafeString is not actually safe
Repository: spark Updated Branches: refs/heads/branch-2.0 670f48222 -> 110876b9a [SPARK-15165] [SQL] Codegen can break because toCommentSafeString is not actually safe ## What changes were proposed in this pull request? toCommentSafeString method replaces "\u" with "u" to avoid codegen breaking. But if the even number of "\" is put before "u", like "u", in the string literal in the query, codegen can break. Following code causes compilation error. ``` val df = Seq(...).toDF df.select("'u002A/'").show ``` The reason of the compilation error is because "u002A/" is translated into "*/" (the end of comment). Due to this unsafety, arbitrary code can be injected like as follows. ``` val df = Seq(...).toDF // Inject "System.exit(1)" df.select("'u002A/{System.exit(1);}/*'").show ``` ## How was this patch tested? Added new test cases. Author: Kousuke SarutaAuthor: sarutak Closes #12939 from sarutak/SPARK-15165. (cherry picked from commit c0c3ec35476c756e569a1f34c4b258eb0490585c) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/110876b9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/110876b9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/110876b9 Branch: refs/heads/branch-2.0 Commit: 110876b9afe5e4205062fd8e8979e096e585737d Parents: 670f482 Author: Kousuke Saruta Authored: Tue May 17 10:07:01 2016 -0700 Committer: Davies Liu Committed: Tue May 17 10:08:52 2016 -0700 -- .../spark/sql/catalyst/util/package.scala | 13 +- .../expressions/CodeGenerationSuite.scala | 44 .../org/apache/spark/sql/SQLQuerySuite.scala| 264 +++ 3 files changed, 320 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/110876b9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 3d2a624..f1d6cab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -162,7 +162,18 @@ package object util { def toCommentSafeString(str: String): String = { val len = math.min(str.length, 128) val suffix = if (str.length > len) "..." else "" -str.substring(0, len).replace("*/", "\\*\\/").replace("\\u", "u") + suffix + +// Unicode literals, like \u0022, should be escaped before +// they are put in code comment to avoid codegen breaking. +// To escape them, single "\" should be prepended to a series of "\" just before "u" +// only when the number of "\" is odd. +// For example, \u0022 should become to \\u0022 +// but \\u0022 should not become to \\\u0022 because the first backslash escapes the second one, +// and \u0022 will remain, means not escaped. +// Otherwise, the runtime Java compiler will fail to compile or code injection can be allowed. +// For details, see SPARK-15165. 
+str.substring(0, len).replace("*/", "*\\/") + .replaceAll("(^|[^\\\\])(\\\\(\\\\\\\\)*u)", "$1\\\\$2") + suffix } /* FIX ME http://git-wip-us.apache.org/repos/asf/spark/blob/110876b9/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 2082cea..db34d12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -194,4 +194,48 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { true, InternalRow(UTF8String.fromString("\\u"))) } + + test("check compilation error doesn't occur caused by specific literal") { +// The end of comment (*/) should be escaped. +GenerateUnsafeProjection.generate( + Literal.create("*/Compilation error occurs/*", StringType) :: Nil) + +// `\u002A` is `*` and `\u002F` is `/` +// so if the end of comment consists of those characters in queries, we need to escape them. +GenerateUnsafeProjection.generate( + Literal.create("\\u002A/Compilation error
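The backslash-heavy replaceAll in the patch is hard to read at a glance. The following self-contained Scala sketch (a hypothetical CommentSafe helper, not the actual org.apache.spark.sql.catalyst.util code) spells out the same two-step escaping and shows its effect on a couple of inputs:

```scala
object CommentSafe {
  // Escape a string so it can be embedded in a generated-code comment:
  // 1. break up "*/" so the comment cannot be terminated early;
  // 2. prepend one "\" to any odd-length run of backslashes that immediately
  //    precedes "u", so the Java compiler never sees a \uXXXX unicode escape
  //    inside the comment.
  def toCommentSafeString(str: String): String = {
    val len = math.min(str.length, 128)
    val suffix = if (str.length > len) "..." else ""
    str.substring(0, len)
      .replace("*/", "*\\/")
      .replaceAll("(^|[^\\\\])(\\\\(\\\\\\\\)*u)", "$1\\\\$2") + suffix
  }

  def main(args: Array[String]): Unit = {
    println(toCommentSafeString("\\u002A/"))    // \\u002A/ : backslash doubled, no longer a unicode escape
    println(toCommentSafeString("*/ injected")) // *\/ injected : comment terminator broken up
    println(toCommentSafeString("\\\\u0022"))   // \\u0022 : already an even run, left untouched
  }
}
```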
spark git commit: [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell
Repository: spark Updated Branches: refs/heads/branch-2.0 273f3d052 -> 670f48222 [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) copy & paste example in ml-collaborative-filtering.html into spark-shell, we see the following errors. scala> case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) defined class Rating scala> object Rating { def parseRating(str: String): Rating = { | val fields = str.split("::") | assert(fields.size == 4) | Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) | } } :29: error: Rating.type does not take parameters Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) ^ In standard scala repl, it has the same error. Scala/spark-shell repl has some quirks (e.g. packages are also not well supported). The reason of errors is that scala/spark-shell repl discards previous definitions when we define the Object with the same class name. Solution: We can rename the Object Rating. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manually test it: 1). ./bin/run-example ALSExample 2). copy & paste example in the generated document. It works fine. Author: wm...@hotmail.comCloses #13110 from wangmiao1981/repl. (cherry picked from commit bebe5f9811f968db92c2d33e2b30c35cfb808a4a) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/670f4822 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/670f4822 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/670f4822 Branch: refs/heads/branch-2.0 Commit: 670f482225e20d512c2c1c1fccee5b9a7d3745b0 Parents: 273f3d0 Author: wm...@hotmail.com Authored: Tue May 17 16:51:01 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:51:07 2016 +0100 -- .../apache/spark/examples/ml/ALSExample.scala| 19 --- 1 file changed, 12 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/670f4822/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala index 6b151a6..da19ea9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -24,16 +24,21 @@ import org.apache.spark.ml.recommendation.ALS // $example off$ import org.apache.spark.sql.SparkSession +/** + * An example demonstrating ALS. 
+ * Run with + * {{{ + * bin/run-example ml.ALSExample + * }}} + */ object ALSExample { // $example on$ case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) - object Rating { -def parseRating(str: String): Rating = { - val fields = str.split("::") - assert(fields.size == 4) - Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) -} + def parseRating(str: String): Rating = { +val fields = str.split("::") +assert(fields.size == 4) +Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) } // $example off$ @@ -46,7 +51,7 @@ object ALSExample { // $example on$ val ratings = spark.read.text("data/mllib/als/sample_movielens_ratings.txt") - .map(Rating.parseRating) + .map(parseRating) .toDF() val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
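Since ALSExample.scala is only partially visible in the diff, here is a minimal spark-shell-style sketch of the shape the patch moves to; only the Rating case class and the "::"-separated input format come from the example above, and the sample line is illustrative:

```scala
// Paste-friendly shape: parseRating is a plain method instead of a member of a
// companion object named Rating, so re-pasting the snippet into spark-shell can
// no longer shadow the case class with a Rating object.
case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)

def parseRating(str: String): Rating = {
  val fields = str.split("::")
  assert(fields.size == 4)
  Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
}

// e.g. parseRating("0::2::3.0::1424380312") == Rating(0, 2, 3.0f, 1424380312L)
```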
spark git commit: [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell
Repository: spark Updated Branches: refs/heads/master 932d80029 -> bebe5f981 [SPARK-15318][ML][EXAMPLE] spark.ml Collaborative Filtering example does not work in spark-shell ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) copy & paste example in ml-collaborative-filtering.html into spark-shell, we see the following errors. scala> case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) defined class Rating scala> object Rating { def parseRating(str: String): Rating = { | val fields = str.split("::") | assert(fields.size == 4) | Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) | } } :29: error: Rating.type does not take parameters Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) ^ In standard scala repl, it has the same error. Scala/spark-shell repl has some quirks (e.g. packages are also not well supported). The reason of errors is that scala/spark-shell repl discards previous definitions when we define the Object with the same class name. Solution: We can rename the Object Rating. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manually test it: 1). ./bin/run-example ALSExample 2). copy & paste example in the generated document. It works fine. Author: wm...@hotmail.comCloses #13110 from wangmiao1981/repl. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bebe5f98 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bebe5f98 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bebe5f98 Branch: refs/heads/master Commit: bebe5f9811f968db92c2d33e2b30c35cfb808a4a Parents: 932d800 Author: wm...@hotmail.com Authored: Tue May 17 16:51:01 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:51:01 2016 +0100 -- .../apache/spark/examples/ml/ALSExample.scala| 19 --- 1 file changed, 12 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bebe5f98/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala index 6b151a6..da19ea9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -24,16 +24,21 @@ import org.apache.spark.ml.recommendation.ALS // $example off$ import org.apache.spark.sql.SparkSession +/** + * An example demonstrating ALS. 
+ * Run with + * {{{ + * bin/run-example ml.ALSExample + * }}} + */ object ALSExample { // $example on$ case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long) - object Rating { -def parseRating(str: String): Rating = { - val fields = str.split("::") - assert(fields.size == 4) - Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) -} + def parseRating(str: String): Rating = { +val fields = str.split("::") +assert(fields.size == 4) +Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong) } // $example off$ @@ -46,7 +51,7 @@ object ALSExample { // $example on$ val ratings = spark.read.text("data/mllib/als/sample_movielens_ratings.txt") - .map(Rating.parseRating) + .map(parseRating) .toDF() val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki
Repository: spark Updated Branches: refs/heads/branch-2.0 b031ea7dc -> 273f3d052 [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki ## What changes were proposed in this pull request? See JIRA for the motivation. The changes are almost entirely movement of text and edits to sections. Minor changes to text include: - Copying in / merging text from the "Useful Developer Tools" wiki, in areas of - Docker - R - Running one test - standardizing on ./build/mvn not mvn, and likewise for ./build/sbt - correcting some typos - standardizing code block formatting No text has been removed from this doc; text has been imported from the https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools wiki ## How was this patch tested? Jekyll doc build and inspection of resulting HTML in browser. Author: Sean OwenCloses #13124 from srowen/SPARK-15333. (cherry picked from commit 932d8002931d352dd2ec87184e6c84ec5fa859cd) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/273f3d05 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/273f3d05 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/273f3d05 Branch: refs/heads/branch-2.0 Commit: 273f3d05294f8fcd8f3f4e116afcd96bd4b50920 Parents: b031ea7 Author: Sean Owen Authored: Tue May 17 16:40:38 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:40:48 2016 +0100 -- docs/building-spark.md | 295 +++- 1 file changed, 156 insertions(+), 139 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/273f3d05/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 63532c7..2c987cf 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -7,48 +7,18 @@ redirect_from: "building-with-maven.html" * This will become a table of contents (this text will be scraped). {:toc} -Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -The Spark build can supply a suitable Maven binary; see below. - -# Building with `build/mvn` - -Spark now comes packaged with a self-contained Maven installation to ease building and deployment of Spark from source located under the `build/` directory. This script will automatically download and setup all necessary build requirements ([Maven](https://maven.apache.org/), [Scala](http://www.scala-lang.org/), and [Zinc](https://github.com/typesafehub/zinc)) locally within the `build/` directory itself. It honors any `mvn` binary if present already, however, will pull down its own copy of Scala and Zinc regardless to ensure proper version requirements are met. `build/mvn` execution acts as a pass through to the `mvn` call allowing easy transition from previous build methods. As an example, one can build a version of Spark as follows: - -{% highlight bash %} -build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package -{% endhighlight %} - -Other build examples can be found below. - -**Note:** When building on an encrypted filesystem (if your home directory is encrypted, for example), then the Spark build might fail with a "Filename too long" error. As a workaround, add the following in the configuration args of the `scala-maven-plugin` in the project `pom.xml`: - --Xmax-classfile-name -128 - -and in `project/SparkBuild.scala` add: - -scalacOptions in Compile ++= Seq("-Xmax-classfile-name", "128"), - -to the `sharedSettings` val. 
See also [this PR](https://github.com/apache/spark/pull/2883/files) if you are unsure of where to add these lines. - -# Building a Runnable Distribution +# Building Apache Spark -To create a Spark distribution like those distributed by the -[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as -to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured -with Maven profile settings and so on like the direct Maven build. Example: +## Apache Maven -./dev/make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn - -For more information on usage, run `./dev/make-distribution.sh --help` +The Maven-based build is the build of reference for Apache Spark. +Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -# Setting up Maven's Memory Usage +### Setting up Maven's Memory Usage You'll need to configure Maven to use more memory than usual by setting `MAVEN_OPTS`. We recommend the following settings: -{% highlight bash %} -export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M
spark git commit: [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki
Repository: spark Updated Branches: refs/heads/master 4134ff0c6 -> 932d80029 [SPARK-15333][DOCS] Reorganize building-spark.md; rationalize vs wiki ## What changes were proposed in this pull request? See JIRA for the motivation. The changes are almost entirely movement of text and edits to sections. Minor changes to text include: - Copying in / merging text from the "Useful Developer Tools" wiki, in areas of - Docker - R - Running one test - standardizing on ./build/mvn not mvn, and likewise for ./build/sbt - correcting some typos - standardizing code block formatting No text has been removed from this doc; text has been imported from the https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools wiki ## How was this patch tested? Jekyll doc build and inspection of resulting HTML in browser. Author: Sean OwenCloses #13124 from srowen/SPARK-15333. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/932d8002 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/932d8002 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/932d8002 Branch: refs/heads/master Commit: 932d8002931d352dd2ec87184e6c84ec5fa859cd Parents: 4134ff0 Author: Sean Owen Authored: Tue May 17 16:40:38 2016 +0100 Committer: Sean Owen Committed: Tue May 17 16:40:38 2016 +0100 -- docs/building-spark.md | 295 +++- 1 file changed, 156 insertions(+), 139 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/932d8002/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 63532c7..2c987cf 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -7,48 +7,18 @@ redirect_from: "building-with-maven.html" * This will become a table of contents (this text will be scraped). {:toc} -Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -The Spark build can supply a suitable Maven binary; see below. - -# Building with `build/mvn` - -Spark now comes packaged with a self-contained Maven installation to ease building and deployment of Spark from source located under the `build/` directory. This script will automatically download and setup all necessary build requirements ([Maven](https://maven.apache.org/), [Scala](http://www.scala-lang.org/), and [Zinc](https://github.com/typesafehub/zinc)) locally within the `build/` directory itself. It honors any `mvn` binary if present already, however, will pull down its own copy of Scala and Zinc regardless to ensure proper version requirements are met. `build/mvn` execution acts as a pass through to the `mvn` call allowing easy transition from previous build methods. As an example, one can build a version of Spark as follows: - -{% highlight bash %} -build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package -{% endhighlight %} - -Other build examples can be found below. - -**Note:** When building on an encrypted filesystem (if your home directory is encrypted, for example), then the Spark build might fail with a "Filename too long" error. As a workaround, add the following in the configuration args of the `scala-maven-plugin` in the project `pom.xml`: - --Xmax-classfile-name -128 - -and in `project/SparkBuild.scala` add: - -scalacOptions in Compile ++= Seq("-Xmax-classfile-name", "128"), - -to the `sharedSettings` val. See also [this PR](https://github.com/apache/spark/pull/2883/files) if you are unsure of where to add these lines. 
- -# Building a Runnable Distribution +# Building Apache Spark -To create a Spark distribution like those distributed by the -[Spark Downloads](http://spark.apache.org/downloads.html) page, and that is laid out so as -to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured -with Maven profile settings and so on like the direct Maven build. Example: +## Apache Maven -./dev/make-distribution.sh --name custom-spark --tgz -Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn - -For more information on usage, run `./dev/make-distribution.sh --help` +The Maven-based build is the build of reference for Apache Spark. +Building Spark using Maven requires Maven 3.3.9 or newer and Java 7+. -# Setting up Maven's Memory Usage +### Setting up Maven's Memory Usage You'll need to configure Maven to use more memory than usual by setting `MAVEN_OPTS`. We recommend the following settings: -{% highlight bash %} -export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" -{% endhighlight %} +export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" If you
spark git commit: [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml
Repository: spark Updated Branches: refs/heads/branch-2.0 c0bcecf91 -> b031ea7dc [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Add guide doc and examples for GaussianMixture in Spark.ml in Java, Scala and Python. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual compile and test all examples Author: wm...@hotmail.comCloses #12788 from wangmiao1981/example. (cherry picked from commit 4134ff0c657efcbf0f61eff0423215afd6132837) Signed-off-by: Nick Pentreath Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b031ea7d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b031ea7d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b031ea7d Branch: refs/heads/branch-2.0 Commit: b031ea7dc29b3e55dfaf8e8466b6d8f33cb81a3e Parents: c0bcecf Author: wm...@hotmail.com Authored: Tue May 17 15:20:47 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 15:21:04 2016 +0200 -- docs/ml-clustering.md | 82 .../examples/ml/JavaGaussianMixtureExample.java | 64 +++ .../main/python/ml/gaussian_mixture_example.py | 48 .../examples/ml/GaussianMixtureExample.scala| 58 ++ 4 files changed, 252 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b031ea7d/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index a0955a3..33e4b7b 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -148,3 +148,85 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/bisecting_k_means_example.py %} + +## Gaussian Mixture Model (GMM) + +A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +each with its own probability. The `spark.ml` implementation uses the +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +algorithm to induce the maximum-likelihood model given a set of samples. + +`GaussianMixture` is implemented as an `Estimator` and generates a `GaussianMixtureModel` as the base +model. + +### Input Columns + + + + + Param name + Type(s) + Default + Description + + + + + featuresCol + Vector + "features" + Feature vector + + + + +### Output Columns + + + + + Param name + Type(s) + Default + Description + + + + + predictionCol + Int + "prediction" + Predicted cluster center + + + probabilityCol + Vector + "probability" + Probability of each cluster + + + + +### Example + + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.GaussianMixture) for more details. + +{% include_example scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala %} + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/GaussianMixture.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java %} + + + +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.GaussianMixture) for more details. 
+ +{% include_example python/ml/gaussian_mixture_example.py %} + + http://git-wip-us.apache.org/repos/asf/spark/blob/b031ea7d/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java new file mode 100644 index 000..79b9909 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *
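The Scala file added by this commit (GaussianMixtureExample.scala) is referenced by the include_example directive but not reproduced above. A rough sketch of what a spark.ml GaussianMixture example looks like follows; the input path and k = 2 are illustrative choices, not the exact committed code:

```scala
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.sql.SparkSession

object GaussianMixtureSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("GaussianMixtureSketch").getOrCreate()

    // Any libsvm-formatted feature file works; this path is illustrative.
    val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    // Fit a mixture of two Gaussians via expectation-maximization.
    val model = new GaussianMixture().setK(2).fit(dataset)

    // Each component has a weight and a multivariate Gaussian (mean vector, covariance matrix).
    for (i <- 0 until model.getK) {
      println(s"Gaussian $i: weight=${model.weights(i)}\n" +
        s"mu=${model.gaussians(i).mean}\nsigma=\n${model.gaussians(i).cov}\n")
    }

    spark.stop()
  }
}
```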
spark git commit: [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml
Repository: spark Updated Branches: refs/heads/master c36ca651f -> 4134ff0c6 [SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Add guide doc and examples for GaussianMixture in Spark.ml in Java, Scala and Python. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual compile and test all examples Author: wm...@hotmail.comCloses #12788 from wangmiao1981/example. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4134ff0c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4134ff0c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4134ff0c Branch: refs/heads/master Commit: 4134ff0c657efcbf0f61eff0423215afd6132837 Parents: c36ca65 Author: wm...@hotmail.com Authored: Tue May 17 15:20:47 2016 +0200 Committer: Nick Pentreath Committed: Tue May 17 15:20:47 2016 +0200 -- docs/ml-clustering.md | 82 .../examples/ml/JavaGaussianMixtureExample.java | 64 +++ .../main/python/ml/gaussian_mixture_example.py | 48 .../examples/ml/GaussianMixtureExample.scala| 58 ++ 4 files changed, 252 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4134ff0c/docs/ml-clustering.md -- diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index a0955a3..33e4b7b 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -148,3 +148,85 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/bisecting_k_means_example.py %} + +## Gaussian Mixture Model (GMM) + +A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +each with its own probability. The `spark.ml` implementation uses the +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +algorithm to induce the maximum-likelihood model given a set of samples. + +`GaussianMixture` is implemented as an `Estimator` and generates a `GaussianMixtureModel` as the base +model. + +### Input Columns + + + + + Param name + Type(s) + Default + Description + + + + + featuresCol + Vector + "features" + Feature vector + + + + +### Output Columns + + + + + Param name + Type(s) + Default + Description + + + + + predictionCol + Int + "prediction" + Predicted cluster center + + + probabilityCol + Vector + "probability" + Probability of each cluster + + + + +### Example + + + + +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.GaussianMixture) for more details. + +{% include_example scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala %} + + + +Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/GaussianMixture.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java %} + + + +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.GaussianMixture) for more details. 
+ +{% include_example python/ml/gaussian_mixture_example.py %} + + http://git-wip-us.apache.org/repos/asf/spark/blob/4134ff0c/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java new file mode 100644 index 000..79b9909 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is
spark git commit: [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType
Repository: spark Updated Branches: refs/heads/branch-2.0 1426235bf -> c0bcecf91 [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType ## What changes were proposed in this pull request? This PR improves `RowEncoder` and `MapObjects`, to support array as the external type for `ArrayType`. The idea is straightforward, we use `Object` as the external input type for `ArrayType`, and determine its type at runtime in `MapObjects`. ## How was this patch tested? new test in `RowEncoderSuite` Author: Wenchen FanCloses #13138 from cloud-fan/map-object. (cherry picked from commit c36ca651f9177f8e7a3f6a0098cba5a810ee9deb) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0bcecf9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0bcecf9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0bcecf9 Branch: refs/heads/branch-2.0 Commit: c0bcecf914a0e0f6669a62a50e6198af38d4aac6 Parents: 1426235 Author: Wenchen Fan Authored: Tue May 17 17:02:52 2016 +0800 Committer: Wenchen Fan Committed: Tue May 17 17:03:15 2016 +0800 -- .../main/scala/org/apache/spark/sql/Row.scala | 4 +- .../sql/catalyst/encoders/RowEncoder.scala | 22 + .../catalyst/expressions/objects/objects.scala | 99 +--- .../sql/catalyst/util/GenericArrayData.scala| 5 + .../sql/catalyst/encoders/RowEncoderSuite.scala | 17 5 files changed, 92 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0bcecf9/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 726291b..a257b83 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -151,7 +151,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def apply(i: Int): Any = get(i) @@ -176,7 +176,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def get(i: Int): Any http://git-wip-us.apache.org/repos/asf/spark/blob/c0bcecf9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index ae842a9..a5f39aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -32,6 +32,26 @@ import org.apache.spark.unsafe.types.UTF8String /** * A factory for constructing encoders that convert external row to/from the Spark SQL * internal binary representation. 
+ * + * The following is a mapping between Spark SQL types and its allowed external types: + * {{{ + * BooleanType -> java.lang.Boolean + * ByteType -> java.lang.Byte + * ShortType -> java.lang.Short + * IntegerType -> java.lang.Integer + * FloatType -> java.lang.Float + * DoubleType -> java.lang.Double + * StringType -> String + * DecimalType -> java.math.BigDecimal or scala.math.BigDecimal or Decimal + * + * DateType -> java.sql.Date + * TimestampType -> java.sql.Timestamp + * + * BinaryType -> byte array + * ArrayType -> scala.collection.Seq or Array + * MapType -> scala.collection.Map + * StructType -> org.apache.spark.sql.Row or Product + * }}} */ object RowEncoder { def apply(schema: StructType): ExpressionEncoder[Row] = { @@ -166,6 +186,8 @@ object RowEncoder { // In order to support both Decimal and java/scala BigDecimal in external row, we make this // as java.lang.Object. case _: DecimalType => ObjectType(classOf[java.lang.Object]) +// In order to support both Array and Seq in external row, we make this as java.lang.Object. +case _: ArrayType =>
spark git commit: [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType
Repository: spark Updated Branches: refs/heads/master 122302cbf -> c36ca651f [SPARK-15351][SQL] RowEncoder should support array as the external type for ArrayType ## What changes were proposed in this pull request? This PR improves `RowEncoder` and `MapObjects`, to support array as the external type for `ArrayType`. The idea is straightforward, we use `Object` as the external input type for `ArrayType`, and determine its type at runtime in `MapObjects`. ## How was this patch tested? new test in `RowEncoderSuite` Author: Wenchen FanCloses #13138 from cloud-fan/map-object. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c36ca651 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c36ca651 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c36ca651 Branch: refs/heads/master Commit: c36ca651f9177f8e7a3f6a0098cba5a810ee9deb Parents: 122302c Author: Wenchen Fan Authored: Tue May 17 17:02:52 2016 +0800 Committer: Wenchen Fan Committed: Tue May 17 17:02:52 2016 +0800 -- .../main/scala/org/apache/spark/sql/Row.scala | 4 +- .../sql/catalyst/encoders/RowEncoder.scala | 22 + .../catalyst/expressions/objects/objects.scala | 99 +--- .../sql/catalyst/util/GenericArrayData.scala| 5 + .../sql/catalyst/encoders/RowEncoderSuite.scala | 17 5 files changed, 92 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c36ca651/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 726291b..a257b83 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -151,7 +151,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def apply(i: Int): Any = get(i) @@ -176,7 +176,7 @@ trait Row extends Serializable { * BinaryType -> byte array * ArrayType -> scala.collection.Seq (use getList for java.util.List) * MapType -> scala.collection.Map (use getJavaMap for java.util.Map) - * StructType -> org.apache.spark.sql.Row (or Product) + * StructType -> org.apache.spark.sql.Row * }}} */ def get(i: Int): Any http://git-wip-us.apache.org/repos/asf/spark/blob/c36ca651/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index ae842a9..a5f39aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -32,6 +32,26 @@ import org.apache.spark.unsafe.types.UTF8String /** * A factory for constructing encoders that convert external row to/from the Spark SQL * internal binary representation. 
+ * + * The following is a mapping between Spark SQL types and its allowed external types: + * {{{ + * BooleanType -> java.lang.Boolean + * ByteType -> java.lang.Byte + * ShortType -> java.lang.Short + * IntegerType -> java.lang.Integer + * FloatType -> java.lang.Float + * DoubleType -> java.lang.Double + * StringType -> String + * DecimalType -> java.math.BigDecimal or scala.math.BigDecimal or Decimal + * + * DateType -> java.sql.Date + * TimestampType -> java.sql.Timestamp + * + * BinaryType -> byte array + * ArrayType -> scala.collection.Seq or Array + * MapType -> scala.collection.Map + * StructType -> org.apache.spark.sql.Row or Product + * }}} */ object RowEncoder { def apply(schema: StructType): ExpressionEncoder[Row] = { @@ -166,6 +186,8 @@ object RowEncoder { // In order to support both Decimal and java/scala BigDecimal in external row, we make this // as java.lang.Object. case _: DecimalType => ObjectType(classOf[java.lang.Object]) +// In order to support both Array and Seq in external row, we make this as java.lang.Object. +case _: ArrayType => ObjectType(classOf[java.lang.Object]) case _ => externalDataTypeFor(dt) }
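To make the new ArrayType mapping concrete, here is a hedged sketch against the internal catalyst RowEncoder API as it looked in this era; RowEncoder and toRow are not stable public API, and the schema and values are illustrative:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}

object ArrayAsExternalTypeSketch {
  def main(args: Array[String]): Unit = {
    val schema = new StructType()
      .add("name", StringType)
      .add("scores", ArrayType(IntegerType))

    val encoder = RowEncoder(schema)

    // Before this change only scala.collection.Seq was accepted for an ArrayType column;
    // with it, a plain Array on the external Row is also encoded, the element type being
    // determined at runtime in MapObjects.
    val fromSeq = encoder.toRow(Row("a", Seq(1, 2, 3)))
    val fromArray = encoder.toRow(Row("b", Array(4, 5, 6)))

    println(fromSeq.numFields)   // 2
    println(fromArray.numFields) // 2
  }
}
```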
spark git commit: [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags
Repository: spark Updated Branches: refs/heads/branch-2.0 6d10b2826 -> 1426235bf [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags ## What changes were proposed in this pull request? (See https://github.com/apache/spark/pull/12416 where most of this was already reviewed and committed; this is just the module structure and move part. This change does not move the annotations into test scope, which was the apparently problem last time.) Rename `spark-test-tags` -> `spark-tags`; move common annotations like `Since` to `spark-tags` ## How was this patch tested? Jenkins tests. Author: Sean OwenCloses #13074 from srowen/SPARK-15290. (cherry picked from commit 122302cbf5cbf1133067a5acdffd6ab96765dafe) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1426235b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1426235b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1426235b Branch: refs/heads/branch-2.0 Commit: 1426235bfb1ecfa55859930913ae45d085912bf7 Parents: 6d10b28 Author: Sean Owen Authored: Tue May 17 09:55:53 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:56:09 2016 +0100 -- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 6 +-- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 + .../apache/spark/annotation/Experimental.java | 36 + .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 ++ .../apache/spark/annotation/package-info.java | 23 +++ .../org/apache/spark/annotation/package.scala | 25 common/unsafe/pom.xml | 8 ++-- core/pom.xml| 2 +- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 - .../apache/spark/annotation/Experimental.java | 36 - .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 -- .../apache/spark/annotation/package-info.java | 23 --- .../org/apache/spark/annotation/package.scala | 25 dev/sparktestsupport/modules.py | 19 ++--- external/docker-integration-tests/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/java8-tests/pom.xml| 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl/pom.xml| 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib-local/pom.xml | 4 ++ mllib/pom.xml | 2 +- pom.xml | 3 +- project/MimaExcludes.scala | 8 project/SparkBuild.scala| 10 ++--- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- yarn/pom.xml| 2 +- 42 files changed, 282 insertions(+), 264 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1426235b/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index bd507c2..5444ae6 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -66,7 +66,7 @@ org.apache.spark - spark-test-tags_${scala.binary.version} + spark-tags_${scala.binary.version} org.mockito http://git-wip-us.apache.org/repos/asf/spark/blob/1426235b/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 810ec10..e736436 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@
spark git commit: [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags
Repository: spark Updated Branches: refs/heads/master 8ad9f08c9 -> 122302cbf [SPARK-15290][BUILD] Move annotations, like @Since / @DeveloperApi, into spark-tags ## What changes were proposed in this pull request? (See https://github.com/apache/spark/pull/12416 where most of this was already reviewed and committed; this is just the module structure and move part. This change does not move the annotations into test scope, which was the apparently problem last time.) Rename `spark-test-tags` -> `spark-tags`; move common annotations like `Since` to `spark-tags` ## How was this patch tested? Jenkins tests. Author: Sean OwenCloses #13074 from srowen/SPARK-15290. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/122302cb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/122302cb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/122302cb Branch: refs/heads/master Commit: 122302cbf5cbf1133067a5acdffd6ab96765dafe Parents: 8ad9f08 Author: Sean Owen Authored: Tue May 17 09:55:53 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:55:53 2016 +0100 -- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 6 +-- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 + .../apache/spark/annotation/Experimental.java | 36 + .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 ++ .../apache/spark/annotation/package-info.java | 23 +++ .../org/apache/spark/annotation/package.scala | 25 common/unsafe/pom.xml | 8 ++-- core/pom.xml| 2 +- .../apache/spark/annotation/AlphaComponent.java | 33 .../apache/spark/annotation/DeveloperApi.java | 35 - .../apache/spark/annotation/Experimental.java | 36 - .../org/apache/spark/annotation/Private.java| 41 .../org/apache/spark/annotation/Since.scala | 30 -- .../apache/spark/annotation/package-info.java | 23 --- .../org/apache/spark/annotation/package.scala | 25 dev/sparktestsupport/modules.py | 19 ++--- external/docker-integration-tests/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/java8-tests/pom.xml| 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl/pom.xml| 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib-local/pom.xml | 4 ++ mllib/pom.xml | 2 +- pom.xml | 3 +- project/MimaExcludes.scala | 8 project/SparkBuild.scala| 10 ++--- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- yarn/pom.xml| 2 +- 42 files changed, 282 insertions(+), 264 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/122302cb/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index bd507c2..5444ae6 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -66,7 +66,7 @@ org.apache.spark - spark-test-tags_${scala.binary.version} + spark-tags_${scala.binary.version} org.mockito http://git-wip-us.apache.org/repos/asf/spark/blob/122302cb/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 810ec10..e736436 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -80,7 +80,7 @@ org.apache.spark - spark-test-tags_${scala.binary.version} +
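As a reminder of what actually moved, the annotations themselves are applied like this; the Frobnicator class below is made up for illustration, and only the org.apache.spark.annotation import path is real (it is unchanged by this commit, the classes merely relocate to the spark-tags module):

```scala
// Placed under an org.apache.spark package because @Since is intended only for
// Spark's own source; compiling this sketch needs spark-tags on the classpath.
package org.apache.spark.sketch

import org.apache.spark.annotation.{DeveloperApi, Since}

// Illustrative class showing how Spark source marks developer APIs and the
// version in which a definition was added.
@DeveloperApi
class Frobnicator {

  @Since("2.0.0")
  def frobnicate(n: Int): Int = n * 2
}
```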
spark git commit: [SPARK-12972][CORE][TEST-MAVEN][TEST-HADOOP2.2] Update org.apache.httpcomponents.httpclient, commons-io
Repository: spark Updated Branches: refs/heads/branch-2.0 0d5e29655 -> 6d10b2826 [SPARK-12972][CORE][TEST-MAVEN][TEST-HADOOP2.2] Update org.apache.httpcomponents.httpclient, commons-io ## What changes were proposed in this pull request? This is sort of a hot-fix for https://github.com/apache/spark/pull/13117, but, the problem is limited to Hadoop 2.2. The change is to manage `commons-io` to 2.4 for all Hadoop builds, which is only a net change for Hadoop 2.2, which was using 2.1. ## How was this patch tested? Jenkins tests -- normal PR builder, then the `[test-hadoop2.2] [test-maven]` if successful. Author: Sean OwenCloses #13132 from srowen/SPARK-12972.3. (cherry picked from commit fabc8e5b128849a08d820d8c0b3425e39258e02e) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d10b282 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d10b282 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d10b282 Branch: refs/heads/branch-2.0 Commit: 6d10b28261e8f1c989d4cab9f59f5f082fd267de Parents: 0d5e296 Author: Sean Owen Authored: Mon May 16 16:27:04 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:53:38 2016 +0100 -- dev/deps/spark-deps-hadoop-2.2 | 2 +- pom.xml| 6 ++ 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6d10b282/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index c3be6b2..91b333f 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -30,7 +30,7 @@ commons-configuration-1.6.jar commons-dbcp-1.4.jar commons-digester-1.8.jar commons-httpclient-3.1.jar -commons-io-2.1.jar +commons-io-2.4.jar commons-lang-2.6.jar commons-lang3-3.3.2.jar commons-logging-1.1.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/6d10b282/pom.xml -- diff --git a/pom.xml b/pom.xml index 40d9bf5..864824d 100644 --- a/pom.xml +++ b/pom.xml @@ -166,6 +166,7 @@ 1.1.2 1.2.0-incubating 1.10 +2.4 2.6 @@ -377,6 +378,11 @@ ${commons-lang2.version} +commons-io +commons-io +${commons-io.version} + + commons-codec commons-codec ${commons-codec.version} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12972][CORE] Update org.apache.httpcomponents.httpclient
Repository: spark Updated Branches: refs/heads/branch-2.0 8e3ee683b -> 0d5e29655 [SPARK-12972][CORE] Update org.apache.httpcomponents.httpclient ## What changes were proposed in this pull request? (Retry of https://github.com/apache/spark/pull/13049) - update to httpclient 4.5 / httpcore 4.4 - remove some defunct exclusions - manage httpmime version to match - update selenium / httpunit to support 4.5 (possible now that Jetty 9 is used) ## How was this patch tested? Jenkins tests. Also, locally running the same test command of one Jenkins profile that failed: `mvn -Phadoop-2.6 -Pyarn -Phive -Phive-thriftserver -Pkinesis-asl ...` Author: Sean OwenCloses #13117 from srowen/SPARK-12972.2. (cherry picked from commit f5576a052da0bb59343bc2a6b6ce06c6abaac75b) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d5e2965 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d5e2965 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d5e2965 Branch: refs/heads/branch-2.0 Commit: 0d5e29655f9c3758393794367c0b5d3fd395d1f6 Parents: 8e3ee68 Author: Sean Owen Authored: Sun May 15 15:56:46 2016 +0100 Committer: Sean Owen Committed: Tue May 17 09:53:13 2016 +0100 -- core/pom.xml | 11 +++--- dev/deps/spark-deps-hadoop-2.2| 4 +- dev/deps/spark-deps-hadoop-2.3| 4 +- dev/deps/spark-deps-hadoop-2.4| 4 +- dev/deps/spark-deps-hadoop-2.6| 4 +- dev/deps/spark-deps-hadoop-2.7| 4 +- external/docker-integration-tests/pom.xml | 2 - pom.xml | 54 +++--- sql/hive-thriftserver/pom.xml | 11 +++--- streaming/pom.xml | 5 +++ 10 files changed, 40 insertions(+), 63 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index c985352..4b8fb4e 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -275,12 +275,11 @@ org.seleniumhq.selenium selenium-java - - - com.google.guava - guava - - + test + + + org.seleniumhq.selenium + selenium-htmlunit-driver test http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 2477312..c3be6b2 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -69,8 +69,8 @@ hadoop-yarn-server-web-proxy-2.2.0.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -httpclient-4.3.2.jar -httpcore-4.3.2.jar +httpclient-4.5.2.jar +httpcore-4.4.4.jar ivy-2.4.0.jar jackson-annotations-2.5.3.jar jackson-core-2.5.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 0181a47..61ed4c0 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -71,8 +71,8 @@ hadoop-yarn-server-web-proxy-2.3.0.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -httpclient-4.3.2.jar -httpcore-4.3.2.jar +httpclient-4.5.2.jar +httpcore-4.4.4.jar ivy-2.4.0.jar jackson-annotations-2.5.3.jar jackson-core-2.5.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index f7ff234..fb01492 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -71,8 +71,8 @@ hadoop-yarn-server-web-proxy-2.4.0.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -httpclient-4.3.2.jar 
-httpcore-4.3.2.jar +httpclient-4.5.2.jar +httpcore-4.4.4.jar ivy-2.4.0.jar jackson-annotations-2.5.3.jar jackson-core-2.5.3.jar http://git-wip-us.apache.org/repos/asf/spark/blob/0d5e2965/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 92db55d..0baf4e8 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -77,8 +77,8 @@ hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar htrace-core-3.0.4.jar -httpclient-4.3.2.jar
spark git commit: [SPARK-14906][ML] Copy linalg in PySpark to new ML package
Repository: spark Updated Branches: refs/heads/branch-2.0 0dd1f8720 -> 8e3ee683b [SPARK-14906][ML] Copy linalg in PySpark to new ML package ## What changes were proposed in this pull request? Copy the linalg (Vector/Matrix and VectorUDT/MatrixUDT) in PySpark to new ML package. ## How was this patch tested? Existing tests. Author: Xiangrui MengAuthor: Liang-Chi Hsieh Author: Liang-Chi Hsieh Closes #13099 from viirya/move-pyspark-vector-matrix-udt4. (cherry picked from commit 8ad9f08c94e98317a9095dd53d737c1b8df6e29c) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e3ee683 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e3ee683 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e3ee683 Branch: refs/heads/branch-2.0 Commit: 8e3ee683bb7ecc857480bc347e7a814e5a63ff28 Parents: 0dd1f87 Author: Xiangrui Meng Authored: Tue May 17 00:08:02 2016 -0700 Committer: Xiangrui Meng Committed: Tue May 17 00:08:15 2016 -0700 -- python/docs/pyspark.ml.rst |8 + python/pyspark/ml/linalg/__init__.py | 1145 + python/pyspark/ml/tests.py | 456 ++-- 3 files changed, 1564 insertions(+), 45 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e3ee683/python/docs/pyspark.ml.rst -- diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst index 86d4186..26f7415 100644 --- a/python/docs/pyspark.ml.rst +++ b/python/docs/pyspark.ml.rst @@ -41,6 +41,14 @@ pyspark.ml.clustering module :undoc-members: :inherited-members: +pyspark.ml.linalg module + + +.. automodule:: pyspark.ml.linalg +:members: +:undoc-members: +:inherited-members: + pyspark.ml.recommendation module http://git-wip-us.apache.org/repos/asf/spark/blob/8e3ee683/python/pyspark/ml/linalg/__init__.py -- diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py new file mode 100644 index 000..f42c589 --- /dev/null +++ b/python/pyspark/ml/linalg/__init__.py @@ -0,0 +1,1145 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +MLlib utilities for linear algebra. For dense vectors, MLlib +uses the NumPy C{array} type, so you can simply pass NumPy arrays +around. For sparse vectors, users can construct a L{SparseVector} +object from MLlib or pass SciPy C{scipy.sparse} column vectors if +SciPy is available in their environment. 
+""" + +import sys +import array +import struct + +if sys.version >= '3': +basestring = str +xrange = range +import copyreg as copy_reg +long = int +else: +from itertools import izip as zip +import copy_reg + +import numpy as np + +from pyspark import since +from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ +IntegerType, ByteType, BooleanType + + +__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', + 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] + + +if sys.version_info[:2] == (2, 7): +# speed up pickling array in Python 2.7 +def fast_pickle_array(ar): +return array.array, (ar.typecode, ar.tostring()) +copy_reg.pickle(array.array, fast_pickle_array) + + +# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, +# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. + +try: +import scipy.sparse +_have_scipy = True +except: +# No SciPy in environment, but that's okay +_have_scipy = False + + +def _convert_to_vector(l): +if isinstance(l, Vector): +return l +elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): +return DenseVector(l) +elif
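The new Python module mirrors the JVM-side org.apache.spark.ml.linalg API that lives in mllib-local; for comparison, a brief Scala sketch of the corresponding Vector and Matrix factory methods (values are illustrative):

```scala
import org.apache.spark.ml.linalg.{Matrices, Vectors}

object LinalgSketch {
  def main(args: Array[String]): Unit = {
    // Dense and sparse vectors describing the same values: [1.0, 0.0, 3.0].
    val dense = Vectors.dense(1.0, 0.0, 3.0)
    val sparse = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    println(dense.toArray.sameElements(sparse.toArray)) // true

    // 3x2 dense matrix, stored column-major.
    val m = Matrices.dense(3, 2, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))
    println(m(2, 1)) // 6.0
  }
}
```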