spark git commit: [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

2016-11-22 Thread yliang
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9dad3a7b0 -> a37238b06


[SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

## What changes were proposed in this pull request?
When running a SparkR job in yarn-cluster mode, SparkR downloads the Spark package from the Apache website, which is not necessary.
```
./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R
```
The output is as follows:
```
Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect,
    rank, rbind, sample, startsWith, subset, summary, transform, union

Spark not found in SPARK_HOME:
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
..
```
There's no ```SPARK_HOME``` in yarn-cluster mode because the R process runs on a remote host inside the YARN cluster rather than on the client host. The JVM comes up first and the R process then connects to it, so in such cases we should never have to download Spark, since Spark is already running.
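
For context, the check this patch introduces can be summarized roughly as follows. This is a simplified sketch that mirrors the `sparkCheckInstall` change in the diff below, with the `isMasterLocal`/`isClientMode` regexes inlined; it is not the shipped SparkR code:
```
# Simplified sketch of the install decision after this patch (not the exact SparkR code).
# Only a local master triggers a download; client mode without SPARK_HOME is an error;
# cluster mode (e.g. yarn-cluster) skips the download because Spark is already running.
checkInstallSketch <- function(sparkHome, master, deployMode) {
  if (!is.na(file.info(sparkHome)$isdir)) {
    return(NULL)                                    # valid SPARK_HOME, nothing to do
  }
  if (grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE)) {
    SparkR::install.spark()                         # local mode: download/cache Spark
  } else if (grepl("([a-z]+)-client$", master, perl = TRUE) || deployMode == "client") {
    stop("Spark not found in SPARK_HOME: ", sparkHome)  # client mode: user must install
  } else {
    NULL                                            # cluster mode: no download needed
  }
}
```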

## How was this patch tested?
Offline test.
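
A minimal check along these lines could look like the snippet below. It is illustrative only, not the author's offline test and not the added `test_sparkR.R`; it reaches the internal function via `:::` and uses a hypothetical nonexistent directory:
```
library(SparkR)

# Illustrative check only: with a nonexistent SPARK_HOME, a non-local master and
# cluster deploy mode, sparkCheckInstall should now return NULL instead of
# trying to download the Spark package (previously an empty master triggered a download).
fakeSparkHome <- file.path(tempdir(), "no-spark-here")   # hypothetical path
result <- SparkR:::sparkCheckInstall(fakeSparkHome, master = "", deployMode = "cluster")
stopifnot(is.null(result))
```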

Author: Yanbo Liang 

Closes #15888 from yanboliang/spark-18444.

(cherry picked from commit acb97157796231fef74aba985825b05b607b9279)
Signed-off-by: Yanbo Liang 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a37238b0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a37238b0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a37238b0

Branch: refs/heads/branch-2.0
Commit: a37238b06f525a1e870750650cf1a4f2885ea265
Parents: 9dad3a7
Author: Yanbo Liang 
Authored: Tue Nov 22 00:05:30 2016 -0800
Committer: Yanbo Liang 
Committed: Tue Nov 22 00:08:51 2016 -0800

--
 R/pkg/R/sparkR.R| 20 
 R/pkg/R/utils.R |  4 +++
 R/pkg/inst/tests/testthat/test_sparkR.R | 46 
 3 files changed, 64 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a37238b0/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index cc6d591..6476693 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -369,8 +369,13 @@ sparkR.session <- function(
     overrideEnvs(sparkConfigMap, paramMap)
   }
 
+  deployMode <- ""
+  if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
+    deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
-    retHome <- sparkCheckInstall(sparkHome, master)
+    retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome
     sparkExecutorEnvMap <- new.env()
     sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap,
@@ -546,24 +551,27 @@ processSparkPackages <- function(packages) {
 #
 # @param sparkHome directory to find Spark package.
 # @param master the Spark master URL, used to check local or remote mode.
+# @param deployMode whether to deploy your driver on the worker nodes (cluster)
+#        or locally as an external client (client).
 # @return NULL if no need to update sparkHome, and new sparkHome otherwise.
-sparkCheckInstall <- function(sparkHome, master) {
+sparkCheckInstall <- function(sparkHome, master, deployMode) {
   if (!isSparkRShell()) {
     if (!is.na(file.info(sparkHome)$isdir)) {
       msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome)
       message(msg)
       NULL
     } else {
-      if (!nzchar(master) || isMasterLocal(master)) {
-        msg <- paste0("Spark not found in SPARK_HOME: ",
-                      sparkHome)
+      if (isMasterLocal(master)) {
+        msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome)
         message(msg)
         packageLocalDir <- install.spark()
         packageLocalDir
-      } else {
+      } else if (isClientMode(master) || deployMode == "client") {
         msg <- paste0("Spark not found in SPARK_HOME: ",
                       sparkHome, "\n", installInstruction("remote"))
         stop(msg)
+      } else {
+        NULL
       }
     }
   } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/a37238b0/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 248c575..581a9a4 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -694,6 +694,10 @@ isMasterLocal <- function(master) {
   

spark git commit: [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

2016-11-22 Thread yliang
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 aaa2a173a -> c70214075


[SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

## What changes were proposed in this pull request?
When running a SparkR job in yarn-cluster mode, SparkR downloads the Spark package from the Apache website, which is not necessary.
```
./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R
```
The output is as follows:
```
Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect,
    rank, rbind, sample, startsWith, subset, summary, transform, union

Spark not found in SPARK_HOME:
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
..
```
There's no ```SPARK_HOME``` in yarn-cluster mode because the R process runs on a remote host inside the YARN cluster rather than on the client host. The JVM comes up first and the R process then connects to it, so in such cases we should never have to download Spark, since Spark is already running.
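
As a quick illustration of the `isClientMode` helper this patch adds to `R/pkg/R/utils.R` (assuming it is the one-line `grepl` on the pattern `([a-z]+)-client$` shown at the end of this thread), only masters of the form `<name>-client` are treated as client deployments:
```
# Illustration of the master-URL check used by the new isClientMode helper
# (assumed here to be a one-line grepl on the pattern "([a-z]+)-client$").
isClientMode <- function(master) {
  grepl("([a-z]+)-client$", master, perl = TRUE)
}

isClientMode("yarn-client")    # TRUE  -> client deployment, missing SPARK_HOME is an error
isClientMode("yarn-cluster")   # FALSE -> cluster deployment, no download attempted
isClientMode("local[*]")       # FALSE -> handled earlier by isMasterLocal
isClientMode("")               # FALSE -> e.g. master resolved by the already running JVM
```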

## How was this patch tested?
Offline test.

Author: Yanbo Liang 

Closes #15888 from yanboliang/spark-18444.

(cherry picked from commit acb97157796231fef74aba985825b05b607b9279)
Signed-off-by: Yanbo Liang 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7021407
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7021407
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7021407

Branch: refs/heads/branch-2.1
Commit: c7021407597480bddf226ffa6d1d3f682408dfeb
Parents: aaa2a17
Author: Yanbo Liang 
Authored: Tue Nov 22 00:05:30 2016 -0800
Committer: Yanbo Liang 
Committed: Tue Nov 22 00:05:54 2016 -0800

--
 R/pkg/R/sparkR.R| 20 
 R/pkg/R/utils.R |  4 +++
 R/pkg/inst/tests/testthat/test_sparkR.R | 46 
 3 files changed, 64 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c7021407/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 6b4a2f2..a7152b4 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -373,8 +373,13 @@ sparkR.session <- function(
     overrideEnvs(sparkConfigMap, paramMap)
   }
 
+  deployMode <- ""
+  if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
+    deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
-    retHome <- sparkCheckInstall(sparkHome, master)
+    retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome
     sparkExecutorEnvMap <- new.env()
     sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap,
@@ -550,24 +555,27 @@ processSparkPackages <- function(packages) {
 #
 # @param sparkHome directory to find Spark package.
 # @param master the Spark master URL, used to check local or remote mode.
+# @param deployMode whether to deploy your driver on the worker nodes (cluster)
+#        or locally as an external client (client).
 # @return NULL if no need to update sparkHome, and new sparkHome otherwise.
-sparkCheckInstall <- function(sparkHome, master) {
+sparkCheckInstall <- function(sparkHome, master, deployMode) {
   if (!isSparkRShell()) {
     if (!is.na(file.info(sparkHome)$isdir)) {
       msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome)
       message(msg)
       NULL
     } else {
-      if (!nzchar(master) || isMasterLocal(master)) {
-        msg <- paste0("Spark not found in SPARK_HOME: ",
-                      sparkHome)
+      if (isMasterLocal(master)) {
+        msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome)
         message(msg)
         packageLocalDir <- install.spark()
         packageLocalDir
-      } else {
+      } else if (isClientMode(master) || deployMode == "client") {
         msg <- paste0("Spark not found in SPARK_HOME: ",
                       sparkHome, "\n", installInstruction("remote"))
         stop(msg)
+      } else {
+        NULL
       }
     }
   } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/c7021407/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 2000454..098c0e3 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -777,6 +777,10 @@ isMasterLocal <- function(master) {
   

spark git commit: [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

2016-11-22 Thread yliang
Repository: spark
Updated Branches:
  refs/heads/master ebeb0830a -> acb971577


[SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package.

## What changes were proposed in this pull request?
When running a SparkR job in yarn-cluster mode, SparkR downloads the Spark package from the Apache website, which is not necessary.
```
./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R
```
The output is as follows:
```
Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect,
    rank, rbind, sample, startsWith, subset, summary, transform, union

Spark not found in SPARK_HOME:
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
..
```
There's no ```SPARK_HOME``` in yarn-cluster mode because the R process runs on a remote host inside the YARN cluster rather than on the client host. The JVM comes up first and the R process then connects to it, so in such cases we should never have to download Spark, since Spark is already running.
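
The `spark.submit.deployMode` lookup that this patch adds to `sparkR.session` (see the diff below) simply reads the value from the config environment; a minimal standalone illustration, using a plain R environment in place of SparkR's `sparkConfigMap`:
```
# Minimal illustration of the deployMode lookup pattern added to sparkR.session,
# using a plain environment in place of SparkR's internal sparkConfigMap.
sparkConfigMap <- new.env()
assign("spark.submit.deployMode", "cluster", envir = sparkConfigMap)

deployMode <- ""
if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
  deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
}
deployMode   # "cluster" -> sparkCheckInstall will skip the download
```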

## How was this patch tested?
Offline test.

Author: Yanbo Liang 

Closes #15888 from yanboliang/spark-18444.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acb97157
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acb97157
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acb97157

Branch: refs/heads/master
Commit: acb97157796231fef74aba985825b05b607b9279
Parents: ebeb083
Author: Yanbo Liang 
Authored: Tue Nov 22 00:05:30 2016 -0800
Committer: Yanbo Liang 
Committed: Tue Nov 22 00:05:30 2016 -0800

--
 R/pkg/R/sparkR.R| 20 
 R/pkg/R/utils.R |  4 +++
 R/pkg/inst/tests/testthat/test_sparkR.R | 46 
 3 files changed, 64 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/acb97157/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 6b4a2f2..a7152b4 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -373,8 +373,13 @@ sparkR.session <- function(
     overrideEnvs(sparkConfigMap, paramMap)
   }
 
+  deployMode <- ""
+  if (exists("spark.submit.deployMode", envir = sparkConfigMap)) {
+    deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
+  }
+
   if (!exists(".sparkRjsc", envir = .sparkREnv)) {
-    retHome <- sparkCheckInstall(sparkHome, master)
+    retHome <- sparkCheckInstall(sparkHome, master, deployMode)
     if (!is.null(retHome)) sparkHome <- retHome
     sparkExecutorEnvMap <- new.env()
     sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap,
@@ -550,24 +555,27 @@ processSparkPackages <- function(packages) {
 #
 # @param sparkHome directory to find Spark package.
 # @param master the Spark master URL, used to check local or remote mode.
+# @param deployMode whether to deploy your driver on the worker nodes (cluster)
+#        or locally as an external client (client).
 # @return NULL if no need to update sparkHome, and new sparkHome otherwise.
-sparkCheckInstall <- function(sparkHome, master) {
+sparkCheckInstall <- function(sparkHome, master, deployMode) {
   if (!isSparkRShell()) {
     if (!is.na(file.info(sparkHome)$isdir)) {
       msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome)
       message(msg)
       NULL
     } else {
-      if (!nzchar(master) || isMasterLocal(master)) {
-        msg <- paste0("Spark not found in SPARK_HOME: ",
-                      sparkHome)
+      if (isMasterLocal(master)) {
+        msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome)
         message(msg)
         packageLocalDir <- install.spark()
         packageLocalDir
-      } else {
+      } else if (isClientMode(master) || deployMode == "client") {
         msg <- paste0("Spark not found in SPARK_HOME: ",
                       sparkHome, "\n", installInstruction("remote"))
         stop(msg)
+      } else {
+        NULL
       }
     }
   } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/acb97157/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 2000454..098c0e3 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -777,6 +777,10 @@ isMasterLocal <- function(master) {
   grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE)
 }
 
+isClientMode <- function(master) {
+  grepl("([a-z]+)-client$",