spark git commit: [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9dad3a7b0 -> a37238b06


[SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

## What changes were proposed in this pull request?

When running a SparkR job in yarn-cluster mode, SparkR downloads the Spark package from the Apache website, which is not necessary.

```
./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R
```

The output is:

```
Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect, rank,
    rbind, sample, startsWith, subset, summary, transform, union

Spark not found in SPARK_HOME:
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
```

There is no ```SPARK_HOME``` in yarn-cluster mode, since the R process runs on a remote host of the YARN cluster rather than on the client host. The JVM comes up first and the R process then connects to it, so in this case we should never have to download Spark, as Spark is already running.

## How was this patch tested?

Offline test.

Author: Yanbo Liang

Closes #15888 from yanboliang/spark-18444.

(cherry picked from commit acb97157796231fef74aba985825b05b607b9279)
Signed-off-by: Yanbo Liang


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a37238b0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a37238b0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a37238b0

Branch: refs/heads/branch-2.0
Commit: a37238b06f525a1e870750650cf1a4f2885ea265
Parents: 9dad3a7
Author: Yanbo Liang
Authored: Tue Nov 22 00:05:30 2016 -0800
Committer: Yanbo Liang
Committed: Tue Nov 22 00:08:51 2016 -0800

----------------------------------------------------------------------
 R/pkg/R/sparkR.R                        | 20 ++++++++++++++------
 R/pkg/R/utils.R                         |  4 +++
 R/pkg/inst/tests/testthat/test_sparkR.R | 46 ++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a37238b0/R/pkg/R/sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index cc6d591..6476693 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -369,8 +369,13 @@ sparkR.session <- function(
     overrideEnvs(sparkConfigMap, paramMap)
   }
 
+  deployMode <- ""
+  if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
+    deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
-    retHome <- sparkCheckInstall(sparkHome, master)
+    retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome
     sparkExecutorEnvMap <- new.env()
     sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap,
@@ -546,24 +551,27 @@ processSparkPackages <- function(packages) {
 #
 # @param sparkHome directory to find Spark package.
 # @param master the Spark master URL, used to check local or remote mode.
+# @param deployMode whether to deploy your driver on the worker nodes (cluster)
+#                   or locally as an external client (client).
 # @return NULL if no need to update sparkHome, and new sparkHome otherwise.
-sparkCheckInstall <- function(sparkHome, master) {
+sparkCheckInstall <- function(sparkHome, master, deployMode) {
   if (!isSparkRShell()) {
     if (!is.na(file.info(sparkHome)$isdir)) {
       msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome)
       message(msg)
       NULL
     } else {
-      if (!nzchar(master) || isMasterLocal(master)) {
-        msg <- paste0("Spark not found in SPARK_HOME: ",
-                      sparkHome)
+      if (isMasterLocal(master)) {
+        msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome)
         message(msg)
         packageLocalDir <- install.spark()
         packageLocalDir
-      } else {
+      } else if (isClientMode(master) || deployMode == "client") {
         msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome, "\n",
                       installInstruction("remote"))
         stop(msg)
+      } else {
+        NULL
       }
     }
   } else {


http://git-wip-us.apache.org/repos/asf/spark/blob/a37238b0/R/pkg/R/utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 248c575..581a9a4 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -694,6 +694,10 @@ isMasterLocal <- function(master) {
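For readers following the sparkR.session() hunk above: the new deployMode lookup simply reads spark.submit.deployMode out of the R environment that holds the Spark config. Below is a minimal standalone sketch of that lookup; the sparkConfigMap here is a toy environment standing in for the one sparkR.session() actually builds from sparkConfig.

```
# Toy stand-in for the config environment built inside sparkR.session();
# the real map also carries every other "spark.*" setting.
sparkConfigMap <- new.env()
assign("spark.submit.deployMode", "cluster", envir = sparkConfigMap)

deployMode <- ""
if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
  deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
}
deployMode  # "cluster"; passed on as the new third argument to sparkCheckInstall()
```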
spark git commit: [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 aaa2a173a -> c70214075


[SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

## What changes were proposed in this pull request?

When running a SparkR job in yarn-cluster mode, SparkR downloads the Spark package from the Apache website, which is not necessary.

```
./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R
```

The output is:

```
Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect, rank,
    rbind, sample, startsWith, subset, summary, transform, union

Spark not found in SPARK_HOME:
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
```

There is no ```SPARK_HOME``` in yarn-cluster mode, since the R process runs on a remote host of the YARN cluster rather than on the client host. The JVM comes up first and the R process then connects to it, so in this case we should never have to download Spark, as Spark is already running.

## How was this patch tested?

Offline test.

Author: Yanbo Liang

Closes #15888 from yanboliang/spark-18444.

(cherry picked from commit acb97157796231fef74aba985825b05b607b9279)
Signed-off-by: Yanbo Liang


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7021407
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7021407
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7021407

Branch: refs/heads/branch-2.1
Commit: c7021407597480bddf226ffa6d1d3f682408dfeb
Parents: aaa2a17
Author: Yanbo Liang
Authored: Tue Nov 22 00:05:30 2016 -0800
Committer: Yanbo Liang
Committed: Tue Nov 22 00:05:54 2016 -0800

----------------------------------------------------------------------
 R/pkg/R/sparkR.R                        | 20 ++++++++++++++------
 R/pkg/R/utils.R                         |  4 +++
 R/pkg/inst/tests/testthat/test_sparkR.R | 46 ++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c7021407/R/pkg/R/sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 6b4a2f2..a7152b4 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -373,8 +373,13 @@ sparkR.session <- function(
     overrideEnvs(sparkConfigMap, paramMap)
   }
 
+  deployMode <- ""
+  if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
+    deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
-    retHome <- sparkCheckInstall(sparkHome, master)
+    retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome
     sparkExecutorEnvMap <- new.env()
     sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap,
@@ -550,24 +555,27 @@ processSparkPackages <- function(packages) {
 #
 # @param sparkHome directory to find Spark package.
 # @param master the Spark master URL, used to check local or remote mode.
+# @param deployMode whether to deploy your driver on the worker nodes (cluster)
+#                   or locally as an external client (client).
 # @return NULL if no need to update sparkHome, and new sparkHome otherwise.
-sparkCheckInstall <- function(sparkHome, master) {
+sparkCheckInstall <- function(sparkHome, master, deployMode) {
   if (!isSparkRShell()) {
     if (!is.na(file.info(sparkHome)$isdir)) {
       msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome)
       message(msg)
       NULL
     } else {
-      if (!nzchar(master) || isMasterLocal(master)) {
-        msg <- paste0("Spark not found in SPARK_HOME: ",
-                      sparkHome)
+      if (isMasterLocal(master)) {
+        msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome)
         message(msg)
         packageLocalDir <- install.spark()
         packageLocalDir
-      } else {
+      } else if (isClientMode(master) || deployMode == "client") {
         msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome, "\n",
                       installInstruction("remote"))
         stop(msg)
+      } else {
+        NULL
       }
     }
   } else {


http://git-wip-us.apache.org/repos/asf/spark/blob/c7021407/R/pkg/R/utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 2000454..098c0e3 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -777,6 +777,10 @@ isMasterLocal <- function(master) {
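The diffstat shows 46 lines added to R/pkg/inst/tests/testthat/test_sparkR.R, but the test body is not included in this message. A hypothetical testthat-style check of the new behavior might look like the sketch below; it assumes the SparkR package built from this commit is installed and, for the first expectation, that SPARK_HOME points at an existing Spark directory.

```
# Hypothetical sketch only; not the actual tests added by this commit.
library(testthat)

test_that("sparkCheckInstall skips the download in cluster deploy mode", {
  # Assumes SPARK_HOME is set to a real Spark installation.
  sparkHome <- Sys.getenv("SPARK_HOME")
  expect_true(nchar(sparkHome) > 0)

  # Existing SPARK_HOME: the function reports the package and returns NULL.
  expect_message(SparkR:::sparkCheckInstall(sparkHome, "local[2]", ""),
                 "Spark package found in SPARK_HOME")

  # New behavior from this patch: with no local Spark and cluster deploy mode,
  # sparkCheckInstall() should neither install nor raise an error.
  expect_null(SparkR:::sparkCheckInstall("/no/such/spark/home", "yarn-cluster", "cluster"))
})
```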
spark git commit: [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.
Repository: spark
Updated Branches:
  refs/heads/master ebeb0830a -> acb971577


[SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

## What changes were proposed in this pull request?

When running a SparkR job in yarn-cluster mode, SparkR downloads the Spark package from the Apache website, which is not necessary.

```
./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R
```

The output is:

```
Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect, rank,
    rbind, sample, startsWith, subset, summary, transform, union

Spark not found in SPARK_HOME:
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
```

There is no ```SPARK_HOME``` in yarn-cluster mode, since the R process runs on a remote host of the YARN cluster rather than on the client host. The JVM comes up first and the R process then connects to it, so in this case we should never have to download Spark, as Spark is already running.

## How was this patch tested?

Offline test.

Author: Yanbo Liang

Closes #15888 from yanboliang/spark-18444.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acb97157
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acb97157
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acb97157

Branch: refs/heads/master
Commit: acb97157796231fef74aba985825b05b607b9279
Parents: ebeb083
Author: Yanbo Liang
Authored: Tue Nov 22 00:05:30 2016 -0800
Committer: Yanbo Liang
Committed: Tue Nov 22 00:05:30 2016 -0800

----------------------------------------------------------------------
 R/pkg/R/sparkR.R                        | 20 ++++++++++++++------
 R/pkg/R/utils.R                         |  4 +++
 R/pkg/inst/tests/testthat/test_sparkR.R | 46 ++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/acb97157/R/pkg/R/sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 6b4a2f2..a7152b4 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -373,8 +373,13 @@ sparkR.session <- function(
     overrideEnvs(sparkConfigMap, paramMap)
   }
 
+  deployMode <- ""
+  if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
+    deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
-    retHome <- sparkCheckInstall(sparkHome, master)
+    retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome
     sparkExecutorEnvMap <- new.env()
     sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap,
@@ -550,24 +555,27 @@ processSparkPackages <- function(packages) {
 #
 # @param sparkHome directory to find Spark package.
 # @param master the Spark master URL, used to check local or remote mode.
+# @param deployMode whether to deploy your driver on the worker nodes (cluster)
+#                   or locally as an external client (client).
 # @return NULL if no need to update sparkHome, and new sparkHome otherwise.
-sparkCheckInstall <- function(sparkHome, master) {
+sparkCheckInstall <- function(sparkHome, master, deployMode) {
   if (!isSparkRShell()) {
     if (!is.na(file.info(sparkHome)$isdir)) {
       msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome)
       message(msg)
       NULL
     } else {
-      if (!nzchar(master) || isMasterLocal(master)) {
-        msg <- paste0("Spark not found in SPARK_HOME: ",
-                      sparkHome)
+      if (isMasterLocal(master)) {
+        msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome)
         message(msg)
         packageLocalDir <- install.spark()
         packageLocalDir
-      } else {
+      } else if (isClientMode(master) || deployMode == "client") {
         msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome, "\n",
                       installInstruction("remote"))
         stop(msg)
+      } else {
+        NULL
       }
     }
   } else {


http://git-wip-us.apache.org/repos/asf/spark/blob/acb97157/R/pkg/R/utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 2000454..098c0e3 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -777,6 +777,10 @@ isMasterLocal <- function(master) {
   grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE)
 }
 
+isClientMode <- function(master) {
+  grepl("([a-z]+)-client$",
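The new isClientMode() helper is cut off at the end of the hunk above. Assuming it completes by matching the pattern against the full master string with perl = TRUE (an assumption, since the tail of the hunk is missing here), the two utils.R helpers classify master URLs roughly as in the sketch below.

```
# Sketch of how the two utils.R helpers classify master URLs.
isMasterLocal <- function(master) {
  grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE)
}
isClientMode <- function(master) {
  grepl("([a-z]+)-client$", master, perl = TRUE)  # assumed completion of the truncated helper
}

masters <- c("local", "local[4]", "local[*]", "yarn-client", "yarn-cluster", "spark://host:7077")
data.frame(master = masters,
           local  = vapply(masters, isMasterLocal, logical(1)),
           client = vapply(masters, isClientMode, logical(1)))
# Only the local masters trigger install.spark(); "yarn-client" (or an explicit
# deployMode of "client") raises the install-manually error; "yarn-cluster"
# falls into the new final else branch of sparkCheckInstall() and skips the
# download entirely, which is the point of this patch.
```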