spark git commit: [SPARK-17210][SPARKR] sparkr.zip is not distributed to executors when running sparkr in RStudio
Repository: spark Updated Branches: refs/heads/branch-2.0 d3f90e71a -> 1a8ea000e [SPARK-17210][SPARKR] sparkr.zip is not distributed to executors when running sparkr in RStudio ## What changes were proposed in this pull request? Spark will add sparkr.zip to archive only when it is yarn mode (SparkSubmit.scala). ``` if (args.isR && clusterManager == YARN) { val sparkRPackagePath = RUtils.localSparkRPackagePath if (sparkRPackagePath.isEmpty) { printErrorAndExit("SPARK_HOME does not exist for R application in YARN mode.") } val sparkRPackageFile = new File(sparkRPackagePath.get, SPARKR_PACKAGE_ARCHIVE) if (!sparkRPackageFile.exists()) { printErrorAndExit(s"$SPARKR_PACKAGE_ARCHIVE does not exist for R application in YARN mode.") } val sparkRPackageURI = Utils.resolveURI(sparkRPackageFile.getAbsolutePath).toString // Distribute the SparkR package. // Assigns a symbol link name "sparkr" to the shipped package. args.archives = mergeFileLists(args.archives, sparkRPackageURI + "#sparkr") // Distribute the R package archive containing all the built R packages. if (!RUtils.rPackages.isEmpty) { val rPackageFile = RPackageUtils.zipRLibraries(new File(RUtils.rPackages.get), R_PACKAGE_ARCHIVE) if (!rPackageFile.exists()) { printErrorAndExit("Failed to zip all the built R packages.") } val rPackageURI = Utils.resolveURI(rPackageFile.getAbsolutePath).toString // Assigns a symbol link name "rpkg" to the shipped package. args.archives = mergeFileLists(args.archives, rPackageURI + "#rpkg") } } ``` So it is necessary to pass spark.master from the R process to the JVM. Otherwise sparkr.zip won't be distributed to executors. Besides that I also pass spark.yarn.keytab/spark.yarn.principal to the Spark side, because the JVM process needs them to access a secured cluster. ## How was this patch tested? Verified manually in RStudio using the following code. 
``` Sys.setenv(SPARK_HOME="/Users/jzhang/github/spark") .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths())) library(SparkR) sparkR.session(master="yarn-client", sparkConfig = list(spark.executor.instances="1")) df <- as.DataFrame(mtcars) head(df) ``` … Author: Jeff Zhang Closes #14784 from zjffdu/SPARK-17210. (cherry picked from commit f62ddc5983a08d4d54c0a9a8210dd6cbec555671) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a8ea000 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a8ea000 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a8ea000 Branch: refs/heads/branch-2.0 Commit: 1a8ea000e7e16bdee54c47ab0f5e197c15f200a6 Parents: d3f90e7 Author: Jeff Zhang Authored: Fri Sep 23 11:37:43 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 23 11:38:21 2016 -0700 -- R/pkg/R/sparkR.R | 4 docs/sparkr.md | 15 +++ 2 files changed, 19 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a8ea000/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 0601536..cc6d591 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -491,6 +491,10 @@ sparkConfToSubmitOps[["spark.driver.memory"]] <- "--driver-memory" sparkConfToSubmitOps[["spark.driver.extraClassPath"]] <- "--driver-class-path" sparkConfToSubmitOps[["spark.driver.extraJavaOptions"]] <- "--driver-java-options" sparkConfToSubmitOps[["spark.driver.extraLibraryPath"]] <- "--driver-library-path" +sparkConfToSubmitOps[["spark.master"]] <- "--master" +sparkConfToSubmitOps[["spark.yarn.keytab"]] <- "--keytab" +sparkConfToSubmitOps[["spark.yarn.principal"]] <- "--principal" + # Utility function that returns Spark Submit arguments as a string # http://git-wip-us.apache.org/repos/asf/spark/blob/1a8ea000/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index b881119..340e7f7 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -63,6 +63,21 @@ The following 
Spark driver properties can be set in `sparkConfig` with `sparkR.session` Property NameProperty groupspark-submit equivalent +spark.master +Application Properties +--master + + +spark.yarn.keytab +Application Properties +--keytab + + +spark.yarn.principal +Application Properties +--principal + + spark.driver.memory Application Properties --driver-memory - To unsubscribe, e-mail:
spark git commit: [SPARK-17210][SPARKR] sparkr.zip is not distributed to executors when running sparkr in RStudio
Repository: spark Updated Branches: refs/heads/master f89808b0f -> f62ddc598 [SPARK-17210][SPARKR] sparkr.zip is not distributed to executors when running sparkr in RStudio ## What changes were proposed in this pull request? Spark will add sparkr.zip to archive only when it is yarn mode (SparkSubmit.scala). ``` if (args.isR && clusterManager == YARN) { val sparkRPackagePath = RUtils.localSparkRPackagePath if (sparkRPackagePath.isEmpty) { printErrorAndExit("SPARK_HOME does not exist for R application in YARN mode.") } val sparkRPackageFile = new File(sparkRPackagePath.get, SPARKR_PACKAGE_ARCHIVE) if (!sparkRPackageFile.exists()) { printErrorAndExit(s"$SPARKR_PACKAGE_ARCHIVE does not exist for R application in YARN mode.") } val sparkRPackageURI = Utils.resolveURI(sparkRPackageFile.getAbsolutePath).toString // Distribute the SparkR package. // Assigns a symbol link name "sparkr" to the shipped package. args.archives = mergeFileLists(args.archives, sparkRPackageURI + "#sparkr") // Distribute the R package archive containing all the built R packages. if (!RUtils.rPackages.isEmpty) { val rPackageFile = RPackageUtils.zipRLibraries(new File(RUtils.rPackages.get), R_PACKAGE_ARCHIVE) if (!rPackageFile.exists()) { printErrorAndExit("Failed to zip all the built R packages.") } val rPackageURI = Utils.resolveURI(rPackageFile.getAbsolutePath).toString // Assigns a symbol link name "rpkg" to the shipped package. args.archives = mergeFileLists(args.archives, rPackageURI + "#rpkg") } } ``` So it is necessary to pass spark.master from the R process to the JVM. Otherwise sparkr.zip won't be distributed to executors. Besides that I also pass spark.yarn.keytab/spark.yarn.principal to the Spark side, because the JVM process needs them to access a secured cluster. ## How was this patch tested? Verified manually in RStudio using the following code. 
``` Sys.setenv(SPARK_HOME="/Users/jzhang/github/spark") .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths())) library(SparkR) sparkR.session(master="yarn-client", sparkConfig = list(spark.executor.instances="1")) df <- as.DataFrame(mtcars) head(df) ``` … Author: Jeff Zhang Closes #14784 from zjffdu/SPARK-17210. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f62ddc59 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f62ddc59 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f62ddc59 Branch: refs/heads/master Commit: f62ddc5983a08d4d54c0a9a8210dd6cbec555671 Parents: f89808b Author: Jeff Zhang Authored: Fri Sep 23 11:37:43 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 23 11:37:43 2016 -0700 -- R/pkg/R/sparkR.R | 4 docs/sparkr.md | 15 +++ 2 files changed, 19 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f62ddc59/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 0601536..cc6d591 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -491,6 +491,10 @@ sparkConfToSubmitOps[["spark.driver.memory"]] <- "--driver-memory" sparkConfToSubmitOps[["spark.driver.extraClassPath"]] <- "--driver-class-path" sparkConfToSubmitOps[["spark.driver.extraJavaOptions"]] <- "--driver-java-options" sparkConfToSubmitOps[["spark.driver.extraLibraryPath"]] <- "--driver-library-path" +sparkConfToSubmitOps[["spark.master"]] <- "--master" +sparkConfToSubmitOps[["spark.yarn.keytab"]] <- "--keytab" +sparkConfToSubmitOps[["spark.yarn.principal"]] <- "--principal" + # Utility function that returns Spark Submit arguments as a string # http://git-wip-us.apache.org/repos/asf/spark/blob/f62ddc59/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index b881119..340e7f7 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -63,6 +63,21 @@ The following Spark driver properties can be set in `sparkConfig` with `sparkR.session` Property NameProperty 
groupspark-submit equivalent +spark.master +Application Properties +--master + + +spark.yarn.keytab +Application Properties +--keytab + + +spark.yarn.principal +Application Properties +--principal + + spark.driver.memory Application Properties --driver-memory - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org