spark git commit: [SPARK-19925][SPARKR] Fix SparkR spark.getSparkFiles fails when it was called on executors.
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 c4d2b8338 -> 277ed375b

[SPARK-19925][SPARKR] Fix SparkR spark.getSparkFiles fails when it was called on executors.

## What changes were proposed in this pull request?

SparkR ```spark.getSparkFiles``` fails when it is called on executors; see details at [SPARK-19925](https://issues.apache.org/jira/browse/SPARK-19925).

## How was this patch tested?

Added unit tests, and verified the fix on standalone and YARN clusters.

Author: Yanbo Liang

Closes #17274 from yanboliang/spark-19925.

(cherry picked from commit 478fbc866fbfdb4439788583281863ecea14e8af)
Signed-off-by: Yanbo Liang

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/277ed375
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/277ed375
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/277ed375

Branch: refs/heads/branch-2.1
Commit: 277ed375b0af3e8fe2a8b9dee62997dcf16d5872
Parents: c4d2b83
Author: Yanbo Liang
Authored: Tue Mar 21 21:50:54 2017 -0700
Committer: Yanbo Liang
Committed: Tue Mar 21 22:12:55 2017 -0700

--
 R/pkg/R/context.R                                   | 16 ++--
 R/pkg/inst/tests/testthat/test_context.R            |  7 +++
 .../main/scala/org/apache/spark/api/r/RRunner.scala |  2 ++
 3 files changed, 23 insertions(+), 2 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/277ed375/R/pkg/R/context.R
--
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 1a0dd65..634bdcb 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -330,7 +330,13 @@ spark.addFile <- function(path, recursive = FALSE) {
 #'}
 #' @note spark.getSparkFilesRootDirectory since 2.1.0
 spark.getSparkFilesRootDirectory <- function() {
-  callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+  if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") {
+    # Running on driver.
+    callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+  } else {
+    # Running on worker.
+    Sys.getenv("SPARKR_SPARKFILES_ROOT_DIR")
+  }
 }

 #' Get the absolute path of a file added through spark.addFile.
@@ -345,7 +351,13 @@ spark.getSparkFilesRootDirectory <- function() {
 #'}
 #' @note spark.getSparkFiles since 2.1.0
 spark.getSparkFiles <- function(fileName) {
-  callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+  if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") {
+    # Running on driver.
+    callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+  } else {
+    # Running on worker.
+    file.path(spark.getSparkFilesRootDirectory(), as.character(fileName))
+  }
 }

 #' Run a function over a list of elements, distributing the computations with Spark

--
http://git-wip-us.apache.org/repos/asf/spark/blob/277ed375/R/pkg/inst/tests/testthat/test_context.R
--
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
index caca069..c847113 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -177,6 +177,13 @@ test_that("add and get file to be downloaded with Spark job on every node", {
   spark.addFile(path)
   download_path <- spark.getSparkFiles(filename)
   expect_equal(readLines(download_path), words)
+
+  # Test spark.getSparkFiles works well on executors.
+  seq <- seq(from = 1, to = 10, length.out = 5)
+  f <- function(seq) { spark.getSparkFiles(filename) }
+  results <- spark.lapply(seq, f)
+  for (i in 1:5) { expect_equal(basename(results[[i]]), filename) }
+
   unlink(path)

   # Test add directory recursively.

--
http://git-wip-us.apache.org/repos/asf/spark/blob/277ed375/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
index 29e21b3..8811839 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
@@ -347,6 +347,8 @@ private[r] object RRunner {
       pb.environment().put("SPARKR_RLIBDIR", rLibDir.mkString(","))
       pb.environment().put("SPARKR_WORKER_PORT", port.toString)
       pb.environment().put("SPARKR_BACKEND_CONNECTION_TIMEOUT", rConnectionTimeout.toString)
+      pb.environment().put("SPARKR_SPARKFILES_ROOT_DIR", SparkFiles.getRootDirectory())
+      pb.environment().put("SPARKR_IS_RUNNING_ON_WORKER", "TRUE")
       pb.redirectErrorStream(true) // redirect stderr into stdout
       val proc = pb.start()
       val
spark git commit: [SPARK-19925][SPARKR] Fix SparkR spark.getSparkFiles fails when it was called on executors.
Repository: spark
Updated Branches:
  refs/heads/master c1e87e384 -> 478fbc866

[SPARK-19925][SPARKR] Fix SparkR spark.getSparkFiles fails when it was called on executors.

## What changes were proposed in this pull request?

SparkR ```spark.getSparkFiles``` fails when it is called on executors; see details at [SPARK-19925](https://issues.apache.org/jira/browse/SPARK-19925).

## How was this patch tested?

Added unit tests, and verified the fix on standalone and YARN clusters.

Author: Yanbo Liang

Closes #17274 from yanboliang/spark-19925.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/478fbc86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/478fbc86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/478fbc86

Branch: refs/heads/master
Commit: 478fbc866fbfdb4439788583281863ecea14e8af
Parents: c1e87e3
Author: Yanbo Liang
Authored: Tue Mar 21 21:50:54 2017 -0700
Committer: Yanbo Liang
Committed: Tue Mar 21 21:50:54 2017 -0700

--
 R/pkg/R/context.R                                   | 16 ++--
 R/pkg/inst/tests/testthat/test_context.R            |  7 +++
 .../main/scala/org/apache/spark/api/r/RRunner.scala |  2 ++
 3 files changed, 23 insertions(+), 2 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/478fbc86/R/pkg/R/context.R
--
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 1ca573e..50856e3 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -330,7 +330,13 @@ spark.addFile <- function(path, recursive = FALSE) {
 #'}
 #' @note spark.getSparkFilesRootDirectory since 2.1.0
 spark.getSparkFilesRootDirectory <- function() {
-  callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+  if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") {
+    # Running on driver.
+    callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+  } else {
+    # Running on worker.
+    Sys.getenv("SPARKR_SPARKFILES_ROOT_DIR")
+  }
 }

 #' Get the absolute path of a file added through spark.addFile.
@@ -345,7 +351,13 @@ spark.getSparkFilesRootDirectory <- function() {
 #'}
 #' @note spark.getSparkFiles since 2.1.0
 spark.getSparkFiles <- function(fileName) {
-  callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+  if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") {
+    # Running on driver.
+    callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+  } else {
+    # Running on worker.
+    file.path(spark.getSparkFilesRootDirectory(), as.character(fileName))
+  }
 }

 #' Run a function over a list of elements, distributing the computations with Spark

--
http://git-wip-us.apache.org/repos/asf/spark/blob/478fbc86/R/pkg/inst/tests/testthat/test_context.R
--
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
index caca069..c847113 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -177,6 +177,13 @@ test_that("add and get file to be downloaded with Spark job on every node", {
   spark.addFile(path)
   download_path <- spark.getSparkFiles(filename)
   expect_equal(readLines(download_path), words)
+
+  # Test spark.getSparkFiles works well on executors.
+  seq <- seq(from = 1, to = 10, length.out = 5)
+  f <- function(seq) { spark.getSparkFiles(filename) }
+  results <- spark.lapply(seq, f)
+  for (i in 1:5) { expect_equal(basename(results[[i]]), filename) }
+
   unlink(path)

   # Test add directory recursively.

--
http://git-wip-us.apache.org/repos/asf/spark/blob/478fbc86/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
--
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
index 29e21b3..8811839 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala
@@ -347,6 +347,8 @@ private[r] object RRunner {
       pb.environment().put("SPARKR_RLIBDIR", rLibDir.mkString(","))
       pb.environment().put("SPARKR_WORKER_PORT", port.toString)
       pb.environment().put("SPARKR_BACKEND_CONNECTION_TIMEOUT", rConnectionTimeout.toString)
+      pb.environment().put("SPARKR_SPARKFILES_ROOT_DIR", SparkFiles.getRootDirectory())
+      pb.environment().put("SPARKR_IS_RUNNING_ON_WORKER", "TRUE")
       pb.redirectErrorStream(true) // redirect stderr into stdout
       val proc = pb.start()
       val errThread = startStdoutThread(proc)
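[Editorial note] A minimal, self-contained Scala sketch of the mechanism this fix relies on, as seen in the RRunner diff above: the driver resolves a value that only exists in the driver JVM and passes it to the worker process through its environment, so worker-side R code can read it with `Sys.getenv()` instead of making a JVM call. The worker script name and directory value below are hypothetical placeholders, not taken from the commit.

```
import java.io.File

object EnvPropagationSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical stand-in for SparkFiles.getRootDirectory(), resolved on the driver.
    val sparkFilesRoot = "/tmp/spark-files"

    val pb = new ProcessBuilder("Rscript", "worker.R") // hypothetical worker script
    // Export driver-side state into the child process environment, as RRunner does.
    pb.environment().put("SPARKR_SPARKFILES_ROOT_DIR", sparkFilesRoot)
    pb.environment().put("SPARKR_IS_RUNNING_ON_WORKER", "TRUE")
    pb.directory(new File("."))
    pb.redirectErrorStream(true)
    val proc = pb.start()
    proc.waitFor()
  }
}
```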
spark git commit: [SPARK-20030][SS] Event-time-based timeout for MapGroupsWithState
Repository: spark
Updated Branches:
  refs/heads/master 2d73fcced -> c1e87e384

[SPARK-20030][SS] Event-time-based timeout for MapGroupsWithState

## What changes were proposed in this pull request?

This PR adds an event-time-based timeout. The user sets the timeout timestamp directly using `KeyedState.setTimeoutTimestamp`. A key times out when the watermark crosses its timeout timestamp.

## How was this patch tested?

Unit tests

Author: Tathagata Das

Closes #17361 from tdas/SPARK-20030.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1e87e38
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1e87e38
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1e87e38

Branch: refs/heads/master
Commit: c1e87e384d1878308b42da80bb3d65be512aab55
Parents: 2d73fcc
Author: Tathagata Das
Authored: Tue Mar 21 21:27:08 2017 -0700
Committer: Tathagata Das
Committed: Tue Mar 21 21:27:08 2017 -0700

--
 .../spark/sql/streaming/KeyedStateTimeout.java |  22 +-
 .../analysis/UnsupportedOperationChecker.scala |  96 +++--
 .../sql/catalyst/plans/logical/object.scala    |   3 +-
 .../analysis/UnsupportedOperationsSuite.scala  |  16 +
 .../spark/sql/execution/SparkStrategies.scala  |   3 +-
 .../streaming/FlatMapGroupsWithStateExec.scala |  87 ++--
 .../streaming/IncrementalExecution.scala       |   5 +-
 .../execution/streaming/KeyedStateImpl.scala   | 139 +--
 .../execution/streaming/statefulOperators.scala | 14 +-
 .../apache/spark/sql/streaming/KeyedState.scala |  97 -
 .../streaming/FlatMapGroupsWithStateSuite.scala | 402 ---
 11 files changed, 616 insertions(+), 268 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c1e87e38/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/KeyedStateTimeout.java
--
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/KeyedStateTimeout.java b/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/KeyedStateTimeout.java
index cf112f2..e2e7ab1 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/KeyedStateTimeout.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/streaming/KeyedStateTimeout.java
@@ -19,9 +19,7 @@ package org.apache.spark.sql.streaming;

 import org.apache.spark.annotation.Experimental;
 import org.apache.spark.annotation.InterfaceStability;
-import org.apache.spark.sql.catalyst.plans.logical.NoTimeout$;
-import org.apache.spark.sql.catalyst.plans.logical.ProcessingTimeTimeout;
-import org.apache.spark.sql.catalyst.plans.logical.ProcessingTimeTimeout$;
+import org.apache.spark.sql.catalyst.plans.logical.*;

 /**
  * Represents the type of timeouts possible for the Dataset operations
@@ -34,9 +32,23 @@ import org.apache.spark.sql.catalyst.plans.logical.ProcessingTimeTimeout$;
 @InterfaceStability.Evolving
 public class KeyedStateTimeout {

-  /** Timeout based on processing time. */
+  /**
+   * Timeout based on processing time. The duration of timeout can be set for each group in
+   * `map/flatMapGroupsWithState` by calling `KeyedState.setTimeoutDuration()`. See documentation
+   * on `KeyedState` for more details.
+   */
   public static KeyedStateTimeout ProcessingTimeTimeout() { return ProcessingTimeTimeout$.MODULE$; }

-  /** No timeout */
+  /**
+   * Timeout based on event-time. The event-time timestamp for timeout can be set for each
+   * group in `map/flatMapGroupsWithState` by calling `KeyedState.setTimeoutTimestamp()`.
+   * In addition, you have to define the watermark in the query using `Dataset.withWatermark`.
+   * When the watermark advances beyond the set timestamp of a group and the group has not
+   * received any data, then the group times out. See documentation on
+   * `KeyedState` for more details.
+   */
+  public static KeyedStateTimeout EventTimeTimeout() { return EventTimeTimeout$.MODULE$; }
+
+  /** No timeout. */
   public static KeyedStateTimeout NoTimeout() { return NoTimeout$.MODULE$; }
 }

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c1e87e38/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
index a9ff61e..7da7f55 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
+++
spark git commit: [SPARK-20051][SS] Fix StreamSuite flaky test - recover from v2.1 checkpoint
Repository: spark
Updated Branches:
  refs/heads/master 9281a3d50 -> 2d73fcced

[SPARK-20051][SS] Fix StreamSuite flaky test - recover from v2.1 checkpoint

## What changes were proposed in this pull request?

There is a race condition between calling stop on a streaming query and deleting directories in `withTempDir`, which causes the test to fail. The fix is to delete lazily, using a delete-on-JVM-shutdown hook.

## How was this patch tested?

- Unit test
- Repeated 300 runs with no failure

Author: Kunal Khamar

Closes #17382 from kunalkhamar/partition-bugfix.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d73fcce
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d73fcce
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d73fcce

Branch: refs/heads/master
Commit: 2d73fcced0492c606feab8fe84f62e8318ebcaa1
Parents: 9281a3d
Author: Kunal Khamar
Authored: Tue Mar 21 18:56:14 2017 -0700
Committer: Tathagata Das
Committed: Tue Mar 21 18:56:14 2017 -0700

--
 .../spark/sql/streaming/StreamSuite.scala | 77 ++--
 1 file changed, 37 insertions(+), 40 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/2d73fcce/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
index e867fc4..f01211e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.StreamSourceProvider
 import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
+import org.apache.spark.util.Utils

 class StreamSuite extends StreamTest {

@@ -438,52 +439,48 @@ class StreamSuite extends StreamTest {

     // 1 - Test if recovery from the checkpoint is successful.
     prepareMemoryStream()
-    withTempDir { dir =>
-      // Copy the checkpoint to a temp dir to prevent changes to the original.
-      // Not doing this will lead to the test passing on the first run, but fail subsequent runs.
-      FileUtils.copyDirectory(checkpointDir, dir)
-
-      // Checkpoint data was generated by a query with 10 shuffle partitions.
-      // In order to test reading from the checkpoint, the checkpoint must have two or more batches,
-      // since the last batch may be rerun.
-      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "10") {
-        var streamingQuery: StreamingQuery = null
-        try {
-          streamingQuery =
-            query.queryName("counts").option("checkpointLocation", dir.getCanonicalPath).start()
-          streamingQuery.processAllAvailable()
-          inputData.addData(9)
-          streamingQuery.processAllAvailable()
-
-          QueryTest.checkAnswer(spark.table("counts").toDF(),
-            Row("1", 1) :: Row("2", 1) :: Row("3", 2) :: Row("4", 2) ::
-            Row("5", 2) :: Row("6", 2) :: Row("7", 1) :: Row("8", 1) :: Row("9", 1) :: Nil)
-        } finally {
-          if (streamingQuery ne null) {
-            streamingQuery.stop()
-          }
+    val dir1 = Utils.createTempDir().getCanonicalFile // not using withTempDir {}, makes test flaky
+    // Copy the checkpoint to a temp dir to prevent changes to the original.
+    // Not doing this will lead to the test passing on the first run, but fail subsequent runs.
+    FileUtils.copyDirectory(checkpointDir, dir1)
+    // Checkpoint data was generated by a query with 10 shuffle partitions.
+    // In order to test reading from the checkpoint, the checkpoint must have two or more batches,
+    // since the last batch may be rerun.
+    withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "10") {
+      var streamingQuery: StreamingQuery = null
+      try {
+        streamingQuery =
+          query.queryName("counts").option("checkpointLocation", dir1.getCanonicalPath).start()
+        streamingQuery.processAllAvailable()
+        inputData.addData(9)
+        streamingQuery.processAllAvailable()
+
+        QueryTest.checkAnswer(spark.table("counts").toDF(),
+          Row("1", 1) :: Row("2", 1) :: Row("3", 2) :: Row("4", 2) ::
+          Row("5", 2) :: Row("6", 2) :: Row("7", 1) :: Row("8", 1) :: Row("9", 1) :: Nil)
+      } finally {
+        if (streamingQuery ne null) {
+          streamingQuery.stop()
         }
       }
     }

     // 2 - Check recovery with wrong num shuffle partitions
     prepareMemoryStream()
-    withTempDir { dir
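[Editorial note] A minimal sketch of the lazy-deletion idea, using plain JDK and Scala APIs rather than Spark's internal `Utils.createTempDir` helper (whose exact behavior is an assumption here): the directory is created eagerly, but its removal is deferred to a JVM shutdown hook, so a streaming query that is still shutting down cannot race with the cleanup.

```
import java.io.File
import java.nio.file.Files

object TempDirSketch {
  /** Create a temp dir whose deletion is deferred to JVM exit. */
  def createTempDirDeletedOnExit(prefix: String): File = {
    val dir = Files.createTempDirectory(prefix).toFile
    sys.addShutdownHook {
      // Runs once at JVM shutdown, after all tests and their queries have stopped.
      deleteRecursively(dir)
    }
    dir
  }

  private def deleteRecursively(f: File): Unit = {
    Option(f.listFiles()).toSeq.flatten.foreach(deleteRecursively)
    f.delete()
  }
}
```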
[2/2] spark git commit: Preparing development version 2.1.2-SNAPSHOT
Preparing development version 2.1.2-SNAPSHOT

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c4d2b833
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c4d2b833
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c4d2b833

Branch: refs/heads/branch-2.1
Commit: c4d2b83389ad57c803860d73f00c27efe30c00b6
Parents: 30abb95
Author: Patrick Wendell
Authored: Tue Mar 21 18:30:07 2017 -0700
Committer: Patrick Wendell
Committed: Tue Mar 21 18:30:07 2017 -0700

--
 R/pkg/DESCRIPTION                          | 2 +-
 assembly/pom.xml                           | 2 +-
 common/network-common/pom.xml              | 2 +-
 common/network-shuffle/pom.xml             | 2 +-
 common/network-yarn/pom.xml                | 2 +-
 common/sketch/pom.xml                      | 2 +-
 common/tags/pom.xml                        | 2 +-
 common/unsafe/pom.xml                      | 2 +-
 core/pom.xml                               | 2 +-
 docs/_config.yml                           | 4 ++--
 examples/pom.xml                           | 2 +-
 external/docker-integration-tests/pom.xml  | 2 +-
 external/flume-assembly/pom.xml            | 2 +-
 external/flume-sink/pom.xml                | 2 +-
 external/flume/pom.xml                     | 2 +-
 external/java8-tests/pom.xml               | 2 +-
 external/kafka-0-10-assembly/pom.xml       | 2 +-
 external/kafka-0-10-sql/pom.xml            | 2 +-
 external/kafka-0-10/pom.xml                | 2 +-
 external/kafka-0-8-assembly/pom.xml        | 2 +-
 external/kafka-0-8/pom.xml                 | 2 +-
 external/kinesis-asl-assembly/pom.xml      | 2 +-
 external/kinesis-asl/pom.xml               | 2 +-
 external/spark-ganglia-lgpl/pom.xml        | 2 +-
 graphx/pom.xml                             | 2 +-
 launcher/pom.xml                           | 2 +-
 mesos/pom.xml                              | 2 +-
 mllib-local/pom.xml                        | 2 +-
 mllib/pom.xml                              | 2 +-
 pom.xml                                    | 2 +-
 python/pyspark/version.py                  | 2 +-
 repl/pom.xml                               | 2 +-
 sql/catalyst/pom.xml                       | 2 +-
 sql/core/pom.xml                           | 2 +-
 sql/hive-thriftserver/pom.xml              | 2 +-
 sql/hive/pom.xml                           | 2 +-
 streaming/pom.xml                          | 2 +-
 tools/pom.xml                              | 2 +-
 yarn/pom.xml                               | 2 +-
 39 files changed, 40 insertions(+), 40 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c4d2b833/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 1ceda7b..2d461ca 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.1.1
+Version: 2.1.2
 Title: R Frontend for Apache Spark
 Description: The SparkR package provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c4d2b833/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index cc290c0..6e092ef 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.11</artifactId>
-    <version>2.1.1</version>
+    <version>2.1.2-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c4d2b833/common/network-common/pom.xml
--
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index ccf4b27..77a4b64 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1</version>
+   <version>2.1.2-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c4d2b833/common/network-shuffle/pom.xml
--
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 98a2324..1a2d85a 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1</version>
+   <version>2.1.2-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/c4d2b833/common/network-yarn/pom.xml
--
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index dc1ad14..7a57e89 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1</version>
+
[spark] Git Push Summary
Repository: spark
Updated Tags: refs/tags/v2.1.1-rc1 [created] 30abb95c9
[1/2] spark git commit: Preparing Spark release v2.1.1-rc1
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 a04428fe2 -> c4d2b8338

Preparing Spark release v2.1.1-rc1

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30abb95c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30abb95c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30abb95c

Branch: refs/heads/branch-2.1
Commit: 30abb95c9ca1632d98ec9773b19b7b374c3688ff
Parents: a04428f
Author: Patrick Wendell
Authored: Tue Mar 21 18:30:02 2017 -0700
Committer: Patrick Wendell
Committed: Tue Mar 21 18:30:02 2017 -0700

--
 assembly/pom.xml                           | 2 +-
 common/network-common/pom.xml              | 2 +-
 common/network-shuffle/pom.xml             | 2 +-
 common/network-yarn/pom.xml                | 2 +-
 common/sketch/pom.xml                      | 2 +-
 common/tags/pom.xml                        | 2 +-
 common/unsafe/pom.xml                      | 2 +-
 core/pom.xml                               | 2 +-
 docs/_config.yml                           | 2 +-
 examples/pom.xml                           | 2 +-
 external/docker-integration-tests/pom.xml  | 2 +-
 external/flume-assembly/pom.xml            | 2 +-
 external/flume-sink/pom.xml                | 2 +-
 external/flume/pom.xml                     | 2 +-
 external/java8-tests/pom.xml               | 2 +-
 external/kafka-0-10-assembly/pom.xml       | 2 +-
 external/kafka-0-10-sql/pom.xml            | 2 +-
 external/kafka-0-10/pom.xml                | 2 +-
 external/kafka-0-8-assembly/pom.xml        | 2 +-
 external/kafka-0-8/pom.xml                 | 2 +-
 external/kinesis-asl-assembly/pom.xml      | 2 +-
 external/kinesis-asl/pom.xml               | 2 +-
 external/spark-ganglia-lgpl/pom.xml        | 2 +-
 graphx/pom.xml                             | 2 +-
 launcher/pom.xml                           | 2 +-
 mesos/pom.xml                              | 2 +-
 mllib-local/pom.xml                        | 2 +-
 mllib/pom.xml                              | 2 +-
 pom.xml                                    | 2 +-
 python/pyspark/version.py                  | 2 +-
 repl/pom.xml                               | 2 +-
 sql/catalyst/pom.xml                       | 2 +-
 sql/core/pom.xml                           | 2 +-
 sql/hive-thriftserver/pom.xml              | 2 +-
 sql/hive/pom.xml                           | 2 +-
 streaming/pom.xml                          | 2 +-
 tools/pom.xml                              | 2 +-
 yarn/pom.xml                               | 2 +-
 38 files changed, 38 insertions(+), 38 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/30abb95c/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 29522fd..cc290c0 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1-SNAPSHOT</version>
+   <version>2.1.1</version>
    <relativePath>../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/30abb95c/common/network-common/pom.xml
--
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 793f6c7..ccf4b27 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1-SNAPSHOT</version>
+   <version>2.1.1</version>
    <relativePath>../../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/30abb95c/common/network-shuffle/pom.xml
--
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index d8ab265..98a2324 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1-SNAPSHOT</version>
+   <version>2.1.1</version>
    <relativePath>../../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/30abb95c/common/network-yarn/pom.xml
--
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index ec23a33..dc1ad14 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1-SNAPSHOT</version>
+   <version>2.1.1</version>
    <relativePath>../../pom.xml</relativePath>

--
http://git-wip-us.apache.org/repos/asf/spark/blob/30abb95c/common/sketch/pom.xml
--
diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml
index 1cefe88..250b696 100644
--- a/common/sketch/pom.xml
+++ b/common/sketch/pom.xml
@@ -22,7 +22,7 @@
   <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.11</artifactId>
-   <version>2.1.1-SNAPSHOT</version>
+   <version>2.1.1</version>
    <relativePath>../../pom.xml</relativePath>
spark git commit: [SPARK-19919][SQL] Defer throwing the exception for empty paths in CSV datasource into `DataSource`
Repository: spark
Updated Branches:
  refs/heads/master a04dcde8c -> 9281a3d50

[SPARK-19919][SQL] Defer throwing the exception for empty paths in CSV datasource into `DataSource`

## What changes were proposed in this pull request?

This PR proposes to defer throwing the exception within `DataSource`. Currently, if other datasources fail to infer the schema, they return `None` and this is then validated in `DataSource` as below:

```
scala> spark.read.json("emptydir")
org.apache.spark.sql.AnalysisException: Unable to infer schema for JSON. It must be specified manually.;
```

```
scala> spark.read.orc("emptydir")
org.apache.spark.sql.AnalysisException: Unable to infer schema for ORC. It must be specified manually.;
```

```
scala> spark.read.parquet("emptydir")
org.apache.spark.sql.AnalysisException: Unable to infer schema for Parquet. It must be specified manually.;
```

However, CSV checks this within the datasource implementation and throws a different exception message, as below:

```
scala> spark.read.csv("emptydir")
java.lang.IllegalArgumentException: requirement failed: Cannot infer schema from an empty set of files
```

We could remove this duplicated check and validate it in one place, in the same way and with the same message.

## How was this patch tested?

Unit test in `CSVSuite` and manual test.

Author: hyukjinkwon

Closes #17256 from HyukjinKwon/SPARK-19919.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9281a3d5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9281a3d5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9281a3d5

Branch: refs/heads/master
Commit: 9281a3d504d526440c1d445075e38a6d9142ac93
Parents: a04dcde
Author: hyukjinkwon
Authored: Wed Mar 22 08:41:46 2017 +0800
Committer: Wenchen Fan
Committed: Wed Mar 22 08:41:46 2017 +0800

--
 .../datasources/csv/CSVDataSource.scala        | 25 ++--
 .../datasources/csv/CSVFileFormat.scala        |  4 +---
 .../sql/test/DataFrameReaderWriterSuite.scala  |  6 +++--
 3 files changed, 23 insertions(+), 12 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/9281a3d5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 63af18e..83bdf6f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -54,10 +54,21 @@ abstract class CSVDataSource extends Serializable {
   /**
    * Infers the schema from `inputPaths` files.
    */
-  def infer(
+  final def inferSchema(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
-      parsedOptions: CSVOptions): Option[StructType]
+      parsedOptions: CSVOptions): Option[StructType] = {
+    if (inputPaths.nonEmpty) {
+      Some(infer(sparkSession, inputPaths, parsedOptions))
+    } else {
+      None
+    }
+  }
+
+  protected def infer(
+      sparkSession: SparkSession,
+      inputPaths: Seq[FileStatus],
+      parsedOptions: CSVOptions): StructType

   /**
    * Generates a header from the given row which is null-safe and duplicate-safe.
@@ -131,10 +142,10 @@ object TextInputCSVDataSource extends CSVDataSource {
   override def infer(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
-      parsedOptions: CSVOptions): Option[StructType] = {
+      parsedOptions: CSVOptions): StructType = {
     val csv = createBaseDataset(sparkSession, inputPaths, parsedOptions)
     val maybeFirstLine = CSVUtils.filterCommentAndEmpty(csv, parsedOptions).take(1).headOption
-    Some(inferFromDataset(sparkSession, csv, maybeFirstLine, parsedOptions))
+    inferFromDataset(sparkSession, csv, maybeFirstLine, parsedOptions)
   }

   /**
@@ -203,7 +214,7 @@ object WholeFileCSVDataSource extends CSVDataSource {
   override def infer(
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
-      parsedOptions: CSVOptions): Option[StructType] = {
+      parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
     csv.flatMap { lines =>
       UnivocityParser.tokenizeStream(
@@ -222,10 +233,10 @@ object WholeFileCSVDataSource {
           parsedOptions.headerFlag,
           new CsvParser(parsedOptions.asParserSettings))
       }
-    Some(CSVInferSchema.infer(tokenRDD, header, parsedOptions))
+
spark git commit: [SPARK-19980][SQL][BACKPORT-2.1] Add NULL checks in Bean serializer
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 9dfdd2adf -> a04428fe2

[SPARK-19980][SQL][BACKPORT-2.1] Add NULL checks in Bean serializer

## What changes were proposed in this pull request?

A Bean serializer in `ExpressionEncoder` could change values when Beans have NULL fields. A concrete example is as follows:

```
scala> :paste
class Outer extends Serializable {
  private var cls: Inner = _
  def setCls(c: Inner): Unit = cls = c
  def getCls(): Inner = cls
}
class Inner extends Serializable {
  private var str: String = _
  def setStr(s: String): Unit = str = str
  def getStr(): String = str
}

scala> Seq("""{"cls":null}""", """{"cls": {"str":null}}""").toDF().write.text("data")
scala> val encoder = Encoders.bean(classOf[Outer])
scala> val schema = encoder.schema
scala> val df = spark.read.schema(schema).json("data").as[Outer](encoder)
scala> df.show
+------+
|   cls|
+------+
|[null]|
|  null|
+------+

scala> df.map(x => x)(encoder).show()
+------+
|   cls|
+------+
|[null]|
|[null]|   // <-- Value changed
+------+
```

This is because the Bean serializer does not have the NULL-check expressions that the serializer of Scala's product types has. This value change does not happen for Scala's product types:

```
scala> :paste
case class Outer(cls: Inner)
case class Inner(str: String)

scala> val encoder = Encoders.product[Outer]
scala> val schema = encoder.schema
scala> val df = spark.read.schema(schema).json("data").as[Outer](encoder)
scala> df.show
+------+
|   cls|
+------+
|[null]|
|  null|
+------+

scala> df.map(x => x)(encoder).show()
+------+
|   cls|
+------+
|[null]|
|  null|
+------+
```

This PR adds the NULL-check expressions to the Bean serializer, matching the serializer for Scala's product types.

## How was this patch tested?

Added tests in `JavaDatasetSuite`.

Author: Takeshi Yamamuro

Closes #17372 from maropu/SPARK-19980-BACKPORT2.1.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a04428fe
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a04428fe
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a04428fe

Branch: refs/heads/branch-2.1
Commit: a04428fe26b5b3ad998a88c81c829050fe4a0256
Parents: 9dfdd2a
Author: Takeshi Yamamuro
Authored: Wed Mar 22 08:37:54 2017 +0800
Committer: Wenchen Fan
Committed: Wed Mar 22 08:37:54 2017 +0800

--
 .../spark/sql/catalyst/JavaTypeInference.scala | 11 +++--
 .../org/apache/spark/sql/JavaDatasetSuite.java | 24 
 2 files changed, 33 insertions(+), 2 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/a04428fe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
index 61c153c..2de066f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
@@ -334,7 +334,11 @@ object JavaTypeInference {
    */
   def serializerFor(beanClass: Class[_]): CreateNamedStruct = {
     val inputObject = BoundReference(0, ObjectType(beanClass), nullable = true)
-    serializerFor(inputObject, TypeToken.of(beanClass)).asInstanceOf[CreateNamedStruct]
+    val nullSafeInput = AssertNotNull(inputObject, Seq("top level input bean"))
+    serializerFor(nullSafeInput, TypeToken.of(beanClass)) match {
+      case expressions.If(_, _, s: CreateNamedStruct) => s
+      case other => CreateNamedStruct(expressions.Literal("value") :: other :: Nil)
+    }
   }

   private def serializerFor(inputObject: Expression, typeToken: TypeToken[_]): Expression = {
@@ -417,7 +421,7 @@ object JavaTypeInference {
       case other =>
         val properties = getJavaBeanProperties(other)
         if (properties.length > 0) {
-          CreateNamedStruct(properties.flatMap { p =>
+          val nonNullOutput = CreateNamedStruct(properties.flatMap { p =>
             val fieldName = p.getName
             val fieldType = typeToken.method(p.getReadMethod).getReturnType
             val fieldValue = Invoke(
@@ -426,6 +430,9 @@ object JavaTypeInference {
               inferExternalType(fieldType.getRawType))
             expressions.Literal(fieldName) :: serializerFor(fieldValue, fieldType) :: Nil
           })
+
+          val nullOutput = expressions.Literal.create(null, nonNullOutput.dataType)
+          expressions.If(IsNull(inputObject), nullOutput, nonNullOutput)
         } else {
           throw new
spark git commit: clarify array_contains function description
Repository: spark
Updated Branches:
  refs/heads/master a8877bdbb -> a04dcde8c

clarify array_contains function description

## What changes were proposed in this pull request?

The description in the comment for array_contains is vague/incomplete (i.e., it doesn't mention that the function returns `null` if the array is `null`); this PR fixes that.

## How was this patch tested?

No testing, since it merely changes a comment.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Will Manning

Closes #17380 from lwwmanning/patch-1.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a04dcde8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a04dcde8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a04dcde8

Branch: refs/heads/master
Commit: a04dcde8cb191e591a5f5d7a67a5371e31e7343c
Parents: a8877bd
Author: Will Manning
Authored: Wed Mar 22 00:40:48 2017 +0100
Committer: Reynold Xin
Committed: Wed Mar 22 00:40:48 2017 +0100

--
 sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/a04dcde8/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index a9f089c..66bb881 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2896,7 +2896,7 @@ object functions {

   /**
-   * Returns true if the array contains `value`
+   * Returns null if the array is null, true if the array contains `value`, and false otherwise.
    * @group collection_funcs
    * @since 1.5.0
    */
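[Editorial note] The documented three-valued behavior can be seen in a spark-shell-style session like the following sketch; the column name and data are invented for illustration, and a `SparkSession` named `spark` is assumed:

```
import org.apache.spark.sql.functions.array_contains
import spark.implicits._ // assumes a spark-shell style SparkSession named `spark`

// Two non-null arrays and one null array.
val df = Seq(Some(Seq(1, 2, 3)), Some(Seq(4, 5)), None).toDF("xs")
df.select(array_contains($"xs", 2)).show()
// true   <- array contains the value
// false  <- array does not contain the value
// null   <- the array itself is null
```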
spark git commit: clarify array_contains function description
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 5c18b6c31 -> 9dfdd2adf

clarify array_contains function description

## What changes were proposed in this pull request?

The description in the comment for array_contains is vague/incomplete (i.e., it doesn't mention that the function returns `null` if the array is `null`); this PR fixes that.

## How was this patch tested?

No testing, since it merely changes a comment.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Will Manning

Closes #17380 from lwwmanning/patch-1.

(cherry picked from commit a04dcde8cb191e591a5f5d7a67a5371e31e7343c)
Signed-off-by: Reynold Xin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9dfdd2ad
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9dfdd2ad
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9dfdd2ad

Branch: refs/heads/branch-2.1
Commit: 9dfdd2adff508d256ae392ebe1b29f721931cf5e
Parents: 5c18b6c
Author: Will Manning
Authored: Wed Mar 22 00:40:48 2017 +0100
Committer: Reynold Xin
Committed: Wed Mar 22 00:41:09 2017 +0100

--
 sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/9dfdd2ad/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 9a080fd..fab9202 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2854,7 +2854,7 @@ object functions {

   /**
-   * Returns true if the array contains `value`
+   * Returns null if the array is null, true if the array contains `value`, and false otherwise.
    * @group collection_funcs
    * @since 1.5.0
    */
spark git commit: [SPARK-19237][SPARKR][CORE] On Windows spark-submit should handle when java is not installed
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 a88c88aac -> 5c18b6c31

[SPARK-19237][SPARKR][CORE] On Windows spark-submit should handle when java is not installed

## What changes were proposed in this pull request?

When SparkR is installed as an R package, there might not be any Java runtime. If Java is missing, SparkR's `sparkR.session()` blocks waiting for the connection timeout, hanging the R IDE/shell without any notification or message.

## How was this patch tested?

manually

- [x] need to test on Windows

Author: Felix Cheung

Closes #16596 from felixcheung/rcheckjava.

(cherry picked from commit a8877bdbba6df105740f909bc87a13cdd4440757)
Signed-off-by: Shivaram Venkataraman

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c18b6c3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c18b6c3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c18b6c3

Branch: refs/heads/branch-2.1
Commit: 5c18b6c316509430823f4edfabe834d8143481e3
Parents: a88c88a
Author: Felix Cheung
Authored: Tue Mar 21 14:24:41 2017 -0700
Committer: Shivaram Venkataraman
Committed: Tue Mar 21 14:25:07 2017 -0700

--
 R/pkg/inst/tests/testthat/test_Windows.R |  1 +
 bin/spark-class2.cmd                     | 11 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/5c18b6c3/R/pkg/inst/tests/testthat/test_Windows.R
--
diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R
index e8d9834..1d777dd 100644
--- a/R/pkg/inst/tests/testthat/test_Windows.R
+++ b/R/pkg/inst/tests/testthat/test_Windows.R
@@ -20,6 +20,7 @@ test_that("sparkJars tag in SparkContext", {
   if (.Platform$OS.type != "windows") {
     skip("This test is only for Windows, skipped")
   }
+
   testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE)
   abcPath <- testOutput[1]
   expect_equal(abcPath, "a\\b\\c")

--
http://git-wip-us.apache.org/repos/asf/spark/blob/5c18b6c3/bin/spark-class2.cmd
--
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index 869c0b2..9faa7d6 100644
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -50,7 +50,16 @@ if not "x%SPARK_PREPEND_CLASSES%"=="x" (

 rem Figure out where java is.
 set RUNNER=java
-if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
+if not "x%JAVA_HOME%"=="x" (
+  set RUNNER="%JAVA_HOME%\bin\java"
+) else (
+  where /q "%RUNNER%"
+  if ERRORLEVEL 1 (
+    echo Java not found and JAVA_HOME environment variable is not set.
+    echo Install Java and set JAVA_HOME to point to the Java installation directory.
+    exit /b 1
+  )
+)

 rem The launcher library prints the command to be executed in a single line suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher into a variable.
spark git commit: [SPARK-19237][SPARKR][CORE] On Windows spark-submit should handle when java is not installed
Repository: spark
Updated Branches:
  refs/heads/master 7dbc162f1 -> a8877bdbb

[SPARK-19237][SPARKR][CORE] On Windows spark-submit should handle when java is not installed

## What changes were proposed in this pull request?

When SparkR is installed as an R package, there might not be any Java runtime. If Java is missing, SparkR's `sparkR.session()` blocks waiting for the connection timeout, hanging the R IDE/shell without any notification or message.

## How was this patch tested?

manually

- [x] need to test on Windows

Author: Felix Cheung

Closes #16596 from felixcheung/rcheckjava.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8877bdb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8877bdb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8877bdb

Branch: refs/heads/master
Commit: a8877bdbba6df105740f909bc87a13cdd4440757
Parents: 7dbc162
Author: Felix Cheung
Authored: Tue Mar 21 14:24:41 2017 -0700
Committer: Shivaram Venkataraman
Committed: Tue Mar 21 14:24:41 2017 -0700

--
 R/pkg/inst/tests/testthat/test_Windows.R |  1 +
 bin/spark-class2.cmd                     | 11 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/a8877bdb/R/pkg/inst/tests/testthat/test_Windows.R
--
diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R
index e8d9834..1d777dd 100644
--- a/R/pkg/inst/tests/testthat/test_Windows.R
+++ b/R/pkg/inst/tests/testthat/test_Windows.R
@@ -20,6 +20,7 @@ test_that("sparkJars tag in SparkContext", {
   if (.Platform$OS.type != "windows") {
     skip("This test is only for Windows, skipped")
   }
+
   testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE)
   abcPath <- testOutput[1]
   expect_equal(abcPath, "a\\b\\c")

--
http://git-wip-us.apache.org/repos/asf/spark/blob/a8877bdb/bin/spark-class2.cmd
--
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index 869c0b2..9faa7d6 100644
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -50,7 +50,16 @@ if not "x%SPARK_PREPEND_CLASSES%"=="x" (

 rem Figure out where java is.
 set RUNNER=java
-if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
+if not "x%JAVA_HOME%"=="x" (
+  set RUNNER="%JAVA_HOME%\bin\java"
+) else (
+  where /q "%RUNNER%"
+  if ERRORLEVEL 1 (
+    echo Java not found and JAVA_HOME environment variable is not set.
+    echo Install Java and set JAVA_HOME to point to the Java installation directory.
+    exit /b 1
+  )
+)

 rem The launcher library prints the command to be executed in a single line suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher into a variable.
spark git commit: [SPARK-20017][SQL] change the nullability of function 'StringToMap' from 'false' to 'true'
Repository: spark
Updated Branches:
  refs/heads/master ae4b91d1f -> 7dbc162f1

[SPARK-20017][SQL] change the nullability of function 'StringToMap' from 'false' to 'true'

## What changes were proposed in this pull request?

Change the nullability of function `StringToMap` from `false` to `true`.

Author: zhaorongsheng <334362...@qq.com>

Closes #17350 from zhaorongsheng/bug-fix_strToMap_NPE.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7dbc162f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7dbc162f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7dbc162f

Branch: refs/heads/master
Commit: 7dbc162f12cc1a447c85a1a2c20d32ebb5cbeacf
Parents: ae4b91d
Author: zhaorongsheng <334362...@qq.com>
Authored: Tue Mar 21 11:30:55 2017 -0700
Committer: Xiao Li
Committed: Tue Mar 21 11:30:55 2017 -0700

--
 .../spark/sql/catalyst/expressions/complexTypeCreator.scala | 4 +++-
 .../spark/sql/catalyst/expressions/ComplexTypeSuite.scala   | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/7dbc162f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 22277ad..b6675a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -390,6 +390,8 @@ case class CreateNamedStructUnsafe(children: Seq[Expression]) extends CreateNamedStruct {
     Examples:
       > SELECT _FUNC_('a:1,b:2,c:3', ',', ':');
        map("a":"1","b":"2","c":"3")
+      > SELECT _FUNC_('a');
+       map("a":null)
   """)
 // scalastyle:on line.size.limit
 case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: Expression)
@@ -407,7 +409,7 @@
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType)

-  override def dataType: DataType = MapType(StringType, StringType, valueContainsNull = false)
+  override def dataType: DataType = MapType(StringType, StringType)

   override def checkInputDataTypes(): TypeCheckResult = {
     if (Seq(pairDelim, keyValueDelim).exists(! _.foldable)) {

--
http://git-wip-us.apache.org/repos/asf/spark/blob/7dbc162f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index abe1d2b..5f8a8f4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -251,6 +251,9 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
   }

   test("StringToMap") {
+    val expectedDataType = MapType(StringType, StringType, valueContainsNull = true)
+    assert(new StringToMap("").dataType === expectedDataType)
+
     val s0 = Literal("a:1,b:2,c:3")
     val m0 = Map("a" -> "1", "b" -> "2", "c" -> "3")
     checkEvaluation(new StringToMap(s0), m0)
@@ -271,6 +274,10 @@
     val m4 = Map("a" -> "1", "b" -> "2", "c" -> "3")
     checkEvaluation(new StringToMap(s4, Literal("_")), m4)

+    val s5 = Literal("a")
+    val m5 = Map("a" -> null)
+    checkEvaluation(new StringToMap(s5), m5)
+
     // arguments checking
     assert(new StringToMap(Literal("a:1,b:2,c:3")).checkInputDataTypes().isSuccess)
     assert(new StringToMap(Literal(null)).checkInputDataTypes().isFailure)
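[Editorial note] As an illustrative spark-shell snippet (not taken from the commit), the now-nullable value type means a pair without a key-value delimiter yields a null value instead of contradicting the map's declared schema; a `SparkSession` named `spark` is assumed:

```
// str_to_map is the SQL-facing name of StringToMap.
spark.sql("SELECT str_to_map('a:1,b:2,c:3', ',', ':')").show(false)
// a three-entry map: a -> 1, b -> 2, c -> 3
spark.sql("SELECT str_to_map('a')").show(false)
// a one-entry map with a null value: a -> null
// Legal only because valueContainsNull is now true in the result's MapType.
```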
spark git commit: [SPARK-20017][SQL] change the nullability of function 'StringToMap' from 'false' to 'true'
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 c4c7b1857 -> a88c88aac

[SPARK-20017][SQL] change the nullability of function 'StringToMap' from 'false' to 'true'

## What changes were proposed in this pull request?

Change the nullability of function `StringToMap` from `false` to `true`.

Author: zhaorongsheng <334362...@qq.com>

Closes #17350 from zhaorongsheng/bug-fix_strToMap_NPE.

(cherry picked from commit 7dbc162f12cc1a447c85a1a2c20d32ebb5cbeacf)
Signed-off-by: Xiao Li

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a88c88aa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a88c88aa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a88c88aa

Branch: refs/heads/branch-2.1
Commit: a88c88aacc6f659fc4086caf74c03cd500068b94
Parents: c4c7b18
Author: zhaorongsheng <334362...@qq.com>
Authored: Tue Mar 21 11:30:55 2017 -0700
Committer: Xiao Li
Committed: Tue Mar 21 11:31:05 2017 -0700

--
 .../spark/sql/catalyst/expressions/complexTypeCreator.scala | 4 +++-
 .../spark/sql/catalyst/expressions/ComplexTypeSuite.scala   | 7 +++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/a88c88aa/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 599fb63..3df2ed8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -346,6 +346,8 @@ case class CreateNamedStructUnsafe(children: Seq[Expression]) extends CreateNamedStruct {
     Examples:
       > SELECT _FUNC_('a:1,b:2,c:3', ',', ':');
        map("a":"1","b":"2","c":"3")
+      > SELECT _FUNC_('a');
+       map("a":null)
   """)
 // scalastyle:on line.size.limit
 case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: Expression)
@@ -363,7 +365,7 @@
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType)

-  override def dataType: DataType = MapType(StringType, StringType, valueContainsNull = false)
+  override def dataType: DataType = MapType(StringType, StringType)

   override def checkInputDataTypes(): TypeCheckResult = {
     if (Seq(pairDelim, keyValueDelim).exists(! _.foldable)) {

--
http://git-wip-us.apache.org/repos/asf/spark/blob/a88c88aa/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index c21c6de..5f12472 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -247,6 +247,9 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
   }

   test("StringToMap") {
+    val expectedDataType = MapType(StringType, StringType, valueContainsNull = true)
+    assert(new StringToMap("").dataType === expectedDataType)
+
     val s0 = Literal("a:1,b:2,c:3")
     val m0 = Map("a" -> "1", "b" -> "2", "c" -> "3")
     checkEvaluation(new StringToMap(s0), m0)
@@ -267,6 +270,10 @@
     val m4 = Map("a" -> "1", "b" -> "2", "c" -> "3")
     checkEvaluation(new StringToMap(s4, Literal("_")), m4)

+    val s5 = Literal("a")
+    val m5 = Map("a" -> null)
+    checkEvaluation(new StringToMap(s5), m5)
+
     // arguments checking
     assert(new StringToMap(Literal("a:1,b:2,c:3")).checkInputDataTypes().isSuccess)
     assert(new StringToMap(Literal(null)).checkInputDataTypes().isFailure)
spark git commit: [SPARK-20039][ML] rename ChiSquare to ChiSquareTest
Repository: spark
Updated Branches:
  refs/heads/master 4c0ff5f58 -> ae4b91d1f

[SPARK-20039][ML] rename ChiSquare to ChiSquareTest

## What changes were proposed in this pull request?

Since ChiSquare lives in the package `stat`, it is unclear whether it refers to the hypothesis test, the distribution, or something else. This PR renames it to ChiSquareTest to clarify this.

## How was this patch tested?

Existing unit tests

Author: Joseph K. Bradley

Closes #17368 from jkbradley/SPARK-20039.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ae4b91d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ae4b91d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ae4b91d1

Branch: refs/heads/master
Commit: ae4b91d1f5734b9d66f3b851b71b3c179f3cdd76
Parents: 4c0ff5f
Author: Joseph K. Bradley
Authored: Tue Mar 21 11:01:25 2017 -0700
Committer: Joseph K. Bradley
Committed: Tue Mar 21 11:01:25 2017 -0700

--
 .../org/apache/spark/ml/stat/ChiSquare.scala   | 81 ----
 .../apache/spark/ml/stat/ChiSquareTest.scala   | 81 ++++
 .../apache/spark/ml/stat/ChiSquareSuite.scala  | 98 ----
 .../spark/ml/stat/ChiSquareTestSuite.scala     | 98 ++++
 4 files changed, 179 insertions(+), 179 deletions(-)

--
http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
deleted file mode 100644
index c3865ce..000
--- a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml.stat
-
-import org.apache.spark.annotation.{Experimental, Since}
-import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
-import org.apache.spark.ml.util.SchemaUtils
-import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
-import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
-import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.col
-
-
-/**
- * :: Experimental ::
- *
- * Chi-square hypothesis testing for categorical data.
- *
- * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information
- * on the Chi-squared test.
- */
-@Experimental
-@Since("2.2.0")
-object ChiSquare {
-
-  /** Used to construct output schema of tests */
-  private case class ChiSquareResult(
-      pValues: Vector,
-      degreesOfFreedom: Array[Int],
-      statistics: Vector)
-
-  /**
-   * Conduct Pearson's independence test for every feature against the label across the input RDD.
-   * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
-   * the Chi-squared statistic is computed. All label and feature values must be categorical.
-   *
-   * The null hypothesis is that the occurrence of the outcomes is statistically independent.
-   *
-   * @param dataset DataFrame of categorical labels and categorical features.
-   *                Real-valued features will be treated as categorical for each distinct value.
-   * @param featuresCol Name of features column in dataset, of type `Vector` (`VectorUDT`)
-   * @param labelCol Name of label column in dataset, of any numerical type
-   * @return DataFrame containing the test result for every feature against the label.
-   *         This DataFrame will contain a single Row with the following fields:
-   *         - `pValues: Vector`
-   *         - `degreesOfFreedom: Array[Int]`
-   *         - `statistics: Vector`
-   *         Each of these fields has one value per feature.
-   */
-  @Since("2.2.0")
-  def test(dataset: DataFrame, featuresCol: String, labelCol:
spark git commit: [SPARK-19261][SQL] Alter add columns for Hive serde and some datasource tables
Repository: spark Updated Branches: refs/heads/master 63f077fbe -> 4c0ff5f58 [SPARK-19261][SQL] Alter add columns for Hive serde and some datasource tables ## What changes were proposed in this pull request? Support `ALTER TABLE ADD COLUMNS (...)` syntax for Hive serde and some datasource tables (a usage sketch follows the diff below). In this PR, we consider a few aspects:
1. Views are not supported for `ALTER ADD COLUMNS`.
2. Since tables created in SparkSQL with Hive DDL syntax will populate table properties with schema information, we need to make sure the schema is consistent before and after the ALTER operation, for future use.
3. For embedded-schema formats such as `parquet`, we need to make sure that predicates on the newly-added columns can be evaluated or pushed down properly. In case a data file does not contain the newly-added columns, such predicates should behave as if the column values were NULL.
4. For datasource tables, this feature does not support the following: 4.1 TEXT format, since only one default column `value` is inferred for text format data. 4.2 ORC format, since the SparkSQL native ORC reader does not support a difference between the user-specified schema and the schema inferred from ORC files. 4.3 Third-party datasource types that implement `RelationProvider`, including the built-in JDBC format, since different vendor implementations may deal with schemas differently. 4.4 Other datasource types, such as `parquet`, `json`, `csv`, and `hive`, are supported.
5. Column names being added cannot duplicate any existing data column or partition column names. Case sensitivity is taken into account according to the SQL configuration.
6. This feature also supports the in-memory catalog, when Hive support is turned off.
## How was this patch tested? Added new test cases. Author: Xin Wu Closes #16626 from xwu0226/alter_add_columns. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c0ff5f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c0ff5f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c0ff5f5 Branch: refs/heads/master Commit: 4c0ff5f58565f811b65f1a11b6121da007bcbd5f Parents: 63f077f Author: Xin Wu Authored: Tue Mar 21 08:49:54 2017 -0700 Committer: Xiao Li Committed: Tue Mar 21 08:49:54 2017 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 3 +- .../sql/catalyst/catalog/SessionCatalog.scala | 56 + .../catalyst/catalog/SessionCatalogSuite.scala | 29 + .../spark/sql/execution/SparkSqlParser.scala| 16 +++ .../spark/sql/execution/command/tables.scala| 76 +++- .../sql/execution/command/DDLCommandSuite.scala | 8 +- .../spark/sql/execution/command/DDLSuite.scala | 122 +++ .../spark/sql/hive/execution/HiveDDLSuite.scala | 100 ++- 8 files changed, 400 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c0ff5f5/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index cc3b8fd..c4a590e 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -85,6 +85,8 @@ statement LIKE source=tableIdentifier locationSpec? #createTableLike | ANALYZE TABLE tableIdentifier partitionSpec?
COMPUTE STATISTICS (identifier | FOR COLUMNS identifierSeq)? #analyze +| ALTER TABLE tableIdentifier +ADD COLUMNS '(' columns=colTypeList ')' #addTableColumns | ALTER (TABLE | VIEW) from=tableIdentifier RENAME TO to=tableIdentifier #renameTable | ALTER (TABLE | VIEW) tableIdentifier @@ -198,7 +200,6 @@ unsupportedHiveNativeCommands | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=COMPACT | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CONCATENATE | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=SET kw4=FILEFORMAT -| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=ADD kw4=COLUMNS | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=REPLACE kw4=COLUMNS | kw1=START kw2=TRANSACTION | kw1=COMMIT http://git-wip-us.apache.org/repos/asf/spark/blob/4c0ff5f5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
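As referenced above, a brief usage sketch of the new syntax. The table `t1` is hypothetical, and a `SparkSession` named `spark` with this parser rule in place is assumed:

```scala
// Create a datasource table of a supported format and add columns to it.
spark.sql("CREATE TABLE t1 (c1 INT, c2 STRING) USING parquet")
spark.sql("INSERT INTO t1 VALUES (1, 'a')")

spark.sql("ALTER TABLE t1 ADD COLUMNS (c3 INT, c4 STRING)")

// Old data files lack c3/c4, so predicates on them behave as if the values were NULL.
spark.sql("SELECT c1, c3 FROM t1 WHERE c3 IS NULL").show()
```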
spark git commit: [SPARK-20041][DOC] Update docs for NaN handling in approxQuantile
Repository: spark Updated Branches: refs/heads/master 14865d7ff -> 63f077fbe [SPARK-20041][DOC] Update docs for NaN handling in approxQuantile ## What changes were proposed in this pull request? Update docs for NaN handling in approxQuantile. ## How was this patch tested? Existing tests. Author: Zheng RuiFeng Closes #17369 from zhengruifeng/doc_quantiles_nan. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63f077fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63f077fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63f077fb Branch: refs/heads/master Commit: 63f077fbe50b4094340e9915db41d7dbdba52975 Parents: 14865d7 Author: Zheng RuiFeng Authored: Tue Mar 21 08:45:59 2017 -0700 Committer: Xiao Li Committed: Tue Mar 21 08:45:59 2017 -0700 -- R/pkg/R/stats.R | 3 ++- .../scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala | 4 ++-- python/pyspark/sql/dataframe.py | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63f077fb/R/pkg/R/stats.R -- diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 8d1d165..d78a108 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -149,7 +149,8 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' This method implements a variation of the Greenwald-Khanna algorithm (with some speed #' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 #' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. -#' Note that rows containing any NA values will be removed before calculation. +#' Note that NA values will be ignored in numerical columns before calculation. For +#' columns only containing NA values, an empty list is returned. #' #' @param x A SparkDataFrame. #' @param cols A single column name, or a list of names for multiple columns. http://git-wip-us.apache.org/repos/asf/spark/blob/63f077fb/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 80c7f55..feceeba 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -93,8 +93,8 @@ private[feature] trait QuantileDiscretizerBase extends Params * are too few distinct values of the input to create enough distinct quantiles. * * NaN handling: - * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will - * produce a `Bucketizer` model for making predictions. During the transformation, + * null and NaN values will be ignored from the column during `QuantileDiscretizer` fitting. This + * will produce a `Bucketizer` model for making predictions. During the transformation, * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
* If the user chooses to keep NaN values, they will be handled specially and placed into their own bucket. http://git-wip-us.apache.org/repos/asf/spark/blob/63f077fb/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index bb6df22..a24512f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1384,7 +1384,8 @@ class DataFrame(object): Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. -Note that rows containing any null values will be removed before calculation. +Note that null values will be ignored in numerical columns before calculation. +For columns only containing null values, an empty list is returned. :param col: str, list. Can be a single column name, or a list of names for multiple columns.
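A small sketch of the documented behaviour, assuming a `SparkSession` named `spark`; the column names and values are illustrative:

```scala
import spark.implicits._

val df = Seq(Some(1.0), Some(2.0), None, Some(4.0)).toDF("x")

// Null values are ignored before the quantile computation.
val median = df.stat.approxQuantile("x", Array(0.5), 0.0)
println(median.mkString(", "))

// A column containing only nulls yields an empty result.
val allNull = Seq[Option[Double]](None, None).toDF("y")
println(allNull.stat.approxQuantile("y", Array(0.5), 0.0).length) // 0
```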
spark git commit: [SPARK-17080][SQL][FOLLOWUP] Improve documentation, change buildJoin method structure and add a debug log
Repository: spark Updated Branches: refs/heads/master 650d03cfc -> 14865d7ff [SPARK-17080][SQL][FOLLOWUP] Improve documentation, change buildJoin method structure and add a debug log ## What changes were proposed in this pull request? 1. Improve documentation for class `Cost` and `JoinReorderDP` and method `buildJoin()`. 2. Change the code structure of `buildJoin()` to make the logic clearer. 3. Add a debug-level log to record information for join reordering, including the time cost, the number of items, and the number of plans in the memo. ## How was this patch tested? Not related. Author: wangzhenhua Closes #17353 from wzhfy/reorderFollow. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14865d7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14865d7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14865d7f Branch: refs/heads/master Commit: 14865d7ff78db5cf9a3e8626204c8e7ed059c353 Parents: 650d03c Author: wangzhenhua Authored: Tue Mar 21 08:44:09 2017 -0700 Committer: Xiao Li Committed: Tue Mar 21 08:44:09 2017 -0700 -- .../optimizer/CostBasedJoinReorder.scala| 109 --- .../org/apache/spark/sql/internal/SQLConf.scala | 1 + 2 files changed, 68 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/14865d7f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala index 521c468..fc37720 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable +import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeSet, Expression, PredicateHelper} import org.apache.spark.sql.catalyst.plans.{Inner, InnerLike} import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, Join, LogicalPlan, Project} @@ -51,7 +52,7 @@ case class CostBasedJoinReorder(conf: SQLConf) extends Rule[LogicalPlan] with Pr } } - def reorder(plan: LogicalPlan, output: AttributeSet): LogicalPlan = { + private def reorder(plan: LogicalPlan, output: AttributeSet): LogicalPlan = { val (items, conditions) = extractInnerJoins(plan) // TODO: Compute the set of star-joins and use them in the join enumeration // algorithm to prune un-optimal plan choices. @@ -69,7 +70,7 @@ case class CostBasedJoinReorder(conf: SQLConf) extends Rule[LogicalPlan] with Pr } /** - * Extract consecutive inner joinable items and join conditions. + * Extracts items of consecutive inner joins and join conditions. * This method works for bushy trees and left/right deep trees. */ private def extractInnerJoins(plan: LogicalPlan): (Seq[LogicalPlan], Set[Expression]) = { @@ -119,18 +120,21 @@ case class CostBasedJoinReorder(conf: SQLConf) extends Rule[LogicalPlan] with Pr * When building m-way joins, we only keep the best plan (with the lowest cost) for the same set * of m items. E.g., for 3-way joins, we keep only the best plan for items {A, B, C} among * plans (A J B) J C, (A J C) J B and (B J C) J A.
- * - * Thus the plans maintained for each level when reordering four items A, B, C, D are as follows: + * We also prune cartesian product candidates when building a new plan if there exists no join + * condition involving references from both left and right. This pruning strategy significantly + * reduces the search space. + * E.g., given A J B J C J D with join conditions A.k1 = B.k1 and B.k2 = C.k2 and C.k3 = D.k3, + * plans maintained for each level are as follows: * level 0: p({A}), p({B}), p({C}), p({D}) - * level 1: p({A, B}), p({A, C}), p({A, D}), p({B, C}), p({B, D}), p({C, D}) - * level 2: p({A, B, C}), p({A, B, D}), p({A, C, D}), p({B, C, D}) + * level 1: p({A, B}), p({B, C}), p({C, D}) + * level 2: p({A, B, C}), p({B, C, D}) * level 3: p({A, B, C, D}) * where p({A, B, C, D}) is the final output plan. * * For cost evaluation, since physical costs for operators are not available currently, we use * cardinalities and sizes to compute costs. */ -object JoinReorderDP extends PredicateHelper { +object JoinReorderDP extends PredicateHelper with Logging { def search( conf: SQLConf, @@ -138,6 +142,7 @@ object
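To make the pruning concrete, here is a toy enumeration (not the actual `JoinReorderDP` code) over the four-item example in the documentation above, keeping only combinations connected by a join condition:

```scala
import scala.collection.mutable

// Join graph from the example: A.k1 = B.k1, B.k2 = C.k2, C.k3 = D.k3.
val items = Set("A", "B", "C", "D")
val edges = Set(("A", "B"), ("B", "C"), ("C", "D"))

// A combination is joinable only if some edge crosses the two sides;
// otherwise it would be a cartesian product and is pruned.
def connected(l: Set[String], r: Set[String]): Boolean =
  edges.exists { case (x, y) => (l(x) && r(y)) || (l(y) && r(x)) }

val levels = mutable.ArrayBuffer(items.map(Set(_)))
for (level <- 1 until items.size) {
  val found = levels.flatten.toSet
  levels += (for {
    left  <- found
    right <- found
    if left.size + right.size == level + 1
    if (left & right).isEmpty && connected(left, right)
  } yield left ++ right)
}
// Prints the same level sets as in the doc comment:
// level 1 keeps {A,B}, {B,C}, {C,D}; level 2 keeps {A,B,C}, {B,C,D}.
levels.zipWithIndex.foreach { case (sets, k) =>
  println(s"level $k: " + sets.map(_.toSeq.sorted.mkString("{", ",", "}")).toSeq.sorted.mkString(", "))
}
```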
spark git commit: [SPARK-19998][BLOCK MANAGER] Change the exception log to add RDD id of the related the block
Repository: spark Updated Branches: refs/heads/master 7620aed82 -> 650d03cfc [SPARK-19998][BLOCK MANAGER] Change the exception log to add the RDD id of the related block ## What changes were proposed in this pull request? The exception "java.lang.Exception: Could not compute split, block $blockId not found" does not carry the RDD id, while the log message "BlockManager: Removing RDD $id" carries only the RDD id, so the exception could not be traced back to the removal. It is better for the block-not-found exception to include the RDD id as well. ## How was this patch tested? Existing tests Author: jianran.tfh Author: jianran Closes #17334 from jianran/SPARK-19998. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/650d03cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/650d03cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/650d03cf Branch: refs/heads/master Commit: 650d03cfc9a609a2c603f9ced452d03ec8429b0d Parents: 7620aed Author: jianran.tfh Authored: Tue Mar 21 15:15:19 2017 +0000 Committer: Sean Owen Committed: Tue Mar 21 15:15:19 2017 +0000 -- core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/650d03cf/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala index d47b755..4e036c2 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala @@ -47,7 +47,7 @@ class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[Blo blockManager.get[T](blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => -throw new Exception("Could not compute split, block " + blockId + " not found") +throw new Exception(s"Could not compute split, block $blockId of RDD $id not found") } }
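With the id included, the removal log line and the failure can now be correlated directly. Hypothetical output (the IDs are invented for illustration; the message templates come from the sources above):

```
INFO BlockManager: Removing RDD 5
java.lang.Exception: Could not compute split, block rdd_5_0 of RDD 5 not found
```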
spark git commit: [SPARK-19994][HOTFIX][BRANCH-2.0] Change InnerLike to Inner
Repository: spark Updated Branches: refs/heads/branch-2.0 3983b3dcd -> 72a0ee3ab [SPARK-19994][HOTFIX][BRANCH-2.0] Change InnerLike to Inner ## What changes were proposed in this pull request? InnerLike => Inner (branch-2.0 predates the `InnerLike` join type hierarchy, so the pattern must match `Inner` directly). ## How was this patch tested? Existing tests. Author: wangzhenhua Closes #17376 from wzhfy/hotFixWrongOrdering. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/72a0ee3a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/72a0ee3a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/72a0ee3a Branch: refs/heads/branch-2.0 Commit: 72a0ee3ab420a81903d63cea0a9584a534849dba Parents: 3983b3d Author: wangzhenhua Authored: Tue Mar 21 14:38:28 2017 +0100 Committer: Herman van Hovell Committed: Tue Mar 21 14:38:28 2017 +0100 -- .../org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/72a0ee3a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 50fd8f3..7a7340d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -84,7 +84,7 @@ case class SortMergeJoinExec( case RightOuter => requiredOrders(rightKeys) // There are null rows in both streams, so there is no order. case FullOuter => Nil -case _: InnerLike | LeftExistence(_) => requiredOrders(leftKeys) +case Inner | LeftExistence(_) => requiredOrders(leftKeys) case x => throw new IllegalArgumentException( s"${getClass.getSimpleName} should not take $x as the JoinType")
spark git commit: [SPARK-20011][ML][DOCS] Clarify documentation for ALS 'rank' parameter
Repository: spark Updated Branches: refs/heads/master d2dcd6792 -> 7620aed82 [SPARK-20011][ML][DOCS] Clarify documentation for ALS 'rank' parameter ## What changes were proposed in this pull request? API documentation and collaborative filtering documentation page changes to clarify the inconsistent description of the ALS `rank` parameter.
- [DOCS] was previously: "rank is the number of latent factors in the model."
- [API] was previously: "rank - number of features to use"
This change describes rank in both places consistently as:
- "Number of features to use (also referred to as the number of latent factors)"
Author: Chris Snow Author: christopher snow Closes #17345 from snowch/SPARK-20011. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7620aed8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7620aed8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7620aed8 Branch: refs/heads/master Commit: 7620aed828d8baefc425b54684a83c81f1507b02 Parents: d2dcd67 Author: christopher snow Authored: Tue Mar 21 13:23:59 2017 +0000 Committer: Sean Owen Committed: Tue Mar 21 13:23:59 2017 +0000 -- docs/mllib-collaborative-filtering.md | 2 +- .../org/apache/spark/mllib/recommendation/ALS.scala | 16 python/pyspark/mllib/recommendation.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7620aed8/docs/mllib-collaborative-filtering.md -- diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index 0f891a0..d1bb6d6 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -20,7 +20,7 @@ algorithm to learn these latent factors. The implementation in `spark.mllib` has following parameters: * *numBlocks* is the number of blocks used to parallelize computation (set to -1 to auto-configure). -* *rank* is the number of latent factors in the model. +* *rank* is the number of features to use (also referred to as the number of latent factors). * *iterations* is the number of iterations of ALS to run. ALS typically converges to a reasonable solution in 20 iterations or less. * *lambda* specifies the regularization parameter in ALS. http://git-wip-us.apache.org/repos/asf/spark/blob/7620aed8/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index 76b1bc1..1428822 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -301,7 +301,7 @@ object ALS { * level of parallelism. * * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS * @param lambda regularization parameter * @param blocks level of parallelism to split computation into @@ -326,7 +326,7 @@ object ALS { * level of parallelism.
* * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS * @param lambda regularization parameter * @param blocks level of parallelism to split computation into @@ -349,7 +349,7 @@ object ALS { * parallelism automatically based on the number of partitions in `ratings`. * * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS * @param lambda regularization parameter */ @@ -366,7 +366,7 @@ object ALS { * parallelism automatically based on the number of partitions in `ratings`. * * @param ratingsRDD of [[Rating]] objects with userID, productID, and rating - * @param rank number of features to use + * @param rank number of features to use (also referred to as the number of latent factors) * @param iterations number of iterations of ALS */ @Since("0.8.0")
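A minimal sketch of the parameter in use, assuming an existing `SparkContext` named `sc`; the ratings and parameter values are illustrative:

```scala
import org.apache.spark.mllib.recommendation.{ALS, Rating}

val ratings = sc.parallelize(Seq(
  Rating(1, 10, 4.0),
  Rating(1, 20, 1.0),
  Rating(2, 10, 5.0),
  Rating(2, 30, 3.0)
))

// rank: number of features to use (also referred to as the number of latent factors).
val model = ALS.train(ratings, /* rank */ 10, /* iterations */ 10, /* lambda */ 0.01)
println(model.predict(2, 20))
```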