Repository: spark
Updated Branches:
  refs/heads/branch-2.1 6a8a726f3 -> ae4e8ae41


http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/R/pkg/tests/fulltests/test_take.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_take.R b/R/pkg/tests/fulltests/test_take.R
new file mode 100644
index 0000000..aaa5328
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_take.R
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("tests RDD function take()")
+
+# Mock data
+numVector <- c(-10:97)
+numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
+strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
+               "violated, but I'm not. No, in fact, I think this is a friendly",
+               "message, like \"Hey, wanna play?\" and yes, I want to play. ",
+               "I really, really do.")
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
+                "other times it helps me control the chaos.",
+                "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
+                "raising me. But they're both dead now. I didn't kill them. Honest.")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+test_that("take() gives back the original elements in correct count and order", {
+  numVectorRDD <- parallelize(sc, numVector, 10)
+  # case: number of elements to take is less than the size of the first partition
+  expect_equal(takeRDD(numVectorRDD, 1), as.list(head(numVector, n = 1)))
+  # case: number of elements to take is the same as the size of the first partition
+  expect_equal(takeRDD(numVectorRDD, 11), as.list(head(numVector, n = 11)))
+  # case: number of elements to take is greater than all elements
+  expect_equal(takeRDD(numVectorRDD, length(numVector)), as.list(numVector))
+  expect_equal(takeRDD(numVectorRDD, length(numVector) + 1), as.list(numVector))
+
+  numListRDD <- parallelize(sc, numList, 1)
+  numListRDD2 <- parallelize(sc, numList, 4)
+  expect_equal(takeRDD(numListRDD, 3), takeRDD(numListRDD2, 3))
+  expect_equal(takeRDD(numListRDD, 5), takeRDD(numListRDD2, 5))
+  expect_equal(takeRDD(numListRDD, 1), as.list(head(numList, n = 1)))
+  expect_equal(takeRDD(numListRDD2, 999), numList)
+
+  strVectorRDD <- parallelize(sc, strVector, 2)
+  strVectorRDD2 <- parallelize(sc, strVector, 3)
+  expect_equal(takeRDD(strVectorRDD, 4), as.list(strVector))
+  expect_equal(takeRDD(strVectorRDD2, 2), as.list(head(strVector, n = 2)))
+
+  strListRDD <- parallelize(sc, strList, 4)
+  strListRDD2 <- parallelize(sc, strList, 1)
+  expect_equal(takeRDD(strListRDD, 3), as.list(head(strList, n = 3)))
+  expect_equal(takeRDD(strListRDD2, 1), as.list(head(strList, n = 1)))
+
+  expect_equal(length(takeRDD(strListRDD, 0)), 0)
+  expect_equal(length(takeRDD(strVectorRDD, 0)), 0)
+  expect_equal(length(takeRDD(numListRDD, 0)), 0)
+  expect_equal(length(takeRDD(numVectorRDD, 0)), 0)
+})
+
+sparkR.session.stop()
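
As a quick reference, here is a minimal sketch of the take() semantics the test above exercises, built from the same helpers the file uses (parallelize, takeRDD). These are package internals, so outside the test harness they would presumably need the SparkR::: prefix; a built Spark and SparkR installation are assumed.

    library(SparkR)
    sparkSession <- sparkR.session(enableHiveSupport = FALSE)
    sc <- SparkR:::callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                               "getJavaSparkContext", sparkSession)

    nums <- c(-10:97)                                # 108 elements
    rdd <- SparkR:::parallelize(sc, nums, 10)        # split across 10 partitions

    # take() walks partitions from the front until it has n elements,
    # so the original order is preserved.
    SparkR:::takeRDD(rdd, 3)                         # list(-10, -9, -8)

    # Asking for more elements than the RDD holds simply returns everything.
    length(SparkR:::takeRDD(rdd, length(nums) + 1))  # 108

    sparkR.session.stop()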

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/R/pkg/tests/fulltests/test_textFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_textFile.R b/R/pkg/tests/fulltests/test_textFile.R
new file mode 100644
index 0000000..3b46606
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_textFile.R
@@ -0,0 +1,164 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("the textFile() function")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("textFile() on a local file returns an RDD", {
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName)
+
+  rdd <- textFile(sc, fileName)
+  expect_is(rdd, "RDD")
+  expect_true(countRDD(rdd) > 0)
+  expect_equal(countRDD(rdd), 2)
+
+  unlink(fileName)
+})
+
+test_that("textFile() followed by a collect() returns the same content", {
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName)
+
+  rdd <- textFile(sc, fileName)
+  expect_equal(collectRDD(rdd), as.list(mockFile))
+
+  unlink(fileName)
+})
+
+test_that("textFile() word count works as expected", {
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName)
+
+  rdd <- textFile(sc, fileName)
+
+  words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+  wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+  counts <- reduceByKey(wordCount, "+", 2L)
+  output <- collectRDD(counts)
+  expected <- list(list("pretty.", 1), list("is", 2), list("awesome.", 1),
+                   list("Spark", 2))
+  expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
+
+  unlink(fileName)
+})
+
+test_that("several transformations on RDD created by textFile()", {
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName)
+
+  rdd <- textFile(sc, fileName) # RDD
+  for (i in 1:10) {
+    # PipelinedRDD initially created from RDD
+    rdd <- lapply(rdd, function(x) paste(x, x))
+  }
+  collectRDD(rdd)
+
+  unlink(fileName)
+})
+
+test_that("textFile() followed by a saveAsTextFile() returns the same content", {
+  fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName1)
+
+  rdd <- textFile(sc, fileName1, 1L)
+  saveAsTextFile(rdd, fileName2)
+  rdd <- textFile(sc, fileName2)
+  expect_equal(collectRDD(rdd), as.list(mockFile))
+
+  unlink(fileName1)
+  unlink(fileName2)
+})
+
+test_that("saveAsTextFile() on a parallelized list works as expected", {
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  l <- list(1, 2, 3)
+  rdd <- parallelize(sc, l, 1L)
+  saveAsTextFile(rdd, fileName)
+  rdd <- textFile(sc, fileName)
+  expect_equal(collectRDD(rdd), lapply(l, function(x) {toString(x)}))
+
+  unlink(fileName)
+})
+
+test_that("textFile() and saveAsTextFile() word count works as expected", {
+  fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName1)
+
+  rdd <- textFile(sc, fileName1)
+
+  words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+  wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+  counts <- reduceByKey(wordCount, "+", 2L)
+
+  saveAsTextFile(counts, fileName2)
+  rdd <- textFile(sc, fileName2)
+
+  output <- collectRDD(rdd)
+  expected <- list(list("awesome.", 1), list("Spark", 2),
+                   list("pretty.", 1), list("is", 2))
+  expectedStr <- lapply(expected, function(x) { toString(x) })
+  expect_equal(sortKeyValueList(output), sortKeyValueList(expectedStr))
+
+  unlink(fileName1)
+  unlink(fileName2)
+})
+
+test_that("textFile() on multiple paths", {
+  fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines("Spark is pretty.", fileName1)
+  writeLines("Spark is awesome.", fileName2)
+
+  rdd <- textFile(sc, c(fileName1, fileName2))
+  expect_equal(countRDD(rdd), 2)
+
+  unlink(fileName1)
+  unlink(fileName2)
+})
+
+test_that("Pipelined operations on RDDs created using textFile", {
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName)
+
+  rdd <- textFile(sc, fileName)
+
+  lengths <- lapply(rdd, function(x) { length(x) })
+  expect_equal(collectRDD(lengths), list(1, 1))
+
+  lengthsPipelined <- lapply(lengths, function(x) { x + 10 })
+  expect_equal(collectRDD(lengthsPipelined), list(11, 11))
+
+  lengths30 <- lapply(lengthsPipelined, function(x) { x + 20 })
+  expect_equal(collectRDD(lengths30), list(31, 31))
+
+  lengths20 <- lapply(lengths, function(x) { x + 20 })
+  expect_equal(collectRDD(lengths20), list(21, 21))
+
+  unlink(fileName)
+})
+
+sparkR.session.stop()
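
For context on the round trip several of the tests above rely on: saveAsTextFile() writes each element in its toString() form, one line per element, into a directory of part files, and textFile() reads those lines back as strings. A hedged sketch under the same assumptions as the file (package internals via SparkR:::, a running session bound to sc):

    fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
    rdd <- SparkR:::parallelize(sc, list(1, 2, 3), 1L)
    SparkR:::saveAsTextFile(rdd, fileName)       # creates a directory of part files
    SparkR:::collectRDD(SparkR:::textFile(sc, fileName))
    # -> list("1", "2", "3"): numeric elements come back as character strings,
    #    which is why the test compares against lapply(l, toString)
    unlink(fileName, recursive = TRUE)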

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/R/pkg/tests/fulltests/test_utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R
new file mode 100644
index 0000000..c875248
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_utils.R
@@ -0,0 +1,242 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in utils.R")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+test_that("convertJListToRList() gives back (deserializes) the original JLists
+          of strings and integers", {
+  # It's hard to manually create a Java List using rJava, since it does not
+  # support generics well. Instead, we rely on collectRDD() returning a
+  # JList.
+  nums <- as.list(1:10)
+  rdd <- parallelize(sc, nums, 1L)
+  jList <- callJMethod(rdd@jrdd, "collect")
+  rList <- convertJListToRList(jList, flatten = TRUE)
+  expect_equal(rList, nums)
+
+  strs <- as.list("hello", "spark")
+  rdd <- parallelize(sc, strs, 2L)
+  jList <- callJMethod(rdd@jrdd, "collect")
+  rList <- convertJListToRList(jList, flatten = TRUE)
+  expect_equal(rList, strs)
+})
+
+test_that("serializeToBytes on RDD", {
+  # File content
+  mockFile <- c("Spark is pretty.", "Spark is awesome.")
+  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+  writeLines(mockFile, fileName)
+
+  text.rdd <- textFile(sc, fileName)
+  expect_equal(getSerializedMode(text.rdd), "string")
+  ser.rdd <- serializeToBytes(text.rdd)
+  expect_equal(collectRDD(ser.rdd), as.list(mockFile))
+  expect_equal(getSerializedMode(ser.rdd), "byte")
+
+  unlink(fileName)
+})
+
+test_that("cleanClosure on R functions", {
+  y <- c(1, 2, 3)
+  g <- function(x) { x + 1 }
+  f <- function(x) { g(x) + y }
+  newF <- cleanClosure(f)
+  env <- environment(newF)
+  expect_equal(length(ls(env)), 2)  # y, g
+  actual <- get("y", envir = env, inherits = FALSE)
+  expect_equal(actual, y)
+  actual <- get("g", envir = env, inherits = FALSE)
+  expect_equal(actual, g)
+
+  # Test for nested enclosures and package variables.
+  env2 <- new.env()
+  funcEnv <- new.env(parent = env2)
+  f <- function(x) { log(g(x) + y) }
+  environment(f) <- funcEnv  # enclosing relationship: f -> funcEnv -> env2 -> .GlobalEnv
+  newF <- cleanClosure(f)
+  env <- environment(newF)
+  expect_equal(length(ls(env)), 2)  # "min" should not be included
+  actual <- get("y", envir = env, inherits = FALSE)
+  expect_equal(actual, y)
+  actual <- get("g", envir = env, inherits = FALSE)
+  expect_equal(actual, g)
+
+  base <- c(1, 2, 3)
+  l <- list(field = matrix(1))
+  field <- matrix(2)
+  defUse <- 3
+  g <- function(x) { x + y }
+  f <- function(x) {
+    defUse <- base::as.integer(x) + 1  # Test for access operators `::`.
+    lapply(x, g) + 1  # Test for capturing function call "g"'s closure as an argument of lapply.
+    l$field[1, 1] <- 3  # Test for access operators `$`.
+    res <- defUse + l$field[1, ]  # Test for def-use chain of "defUse", and "" symbol.
+    f(res)  # Test for recursive calls.
+  }
+  newF <- cleanClosure(f)
+  env <- environment(newF)
+  # TODO(shivaram): length(ls(env)) is 4 here for some reason and `lapply` is included in `env`.
+  # Disabling this test till we debug this.
+  #
+  # nolint start
+  # expect_equal(length(ls(env)), 3)  # Only "g", "l" and "f". No "base", "field" or "defUse".
+  # nolint end
+  expect_true("g" %in% ls(env))
+  expect_true("l" %in% ls(env))
+  expect_true("f" %in% ls(env))
+  expect_equal(get("l", envir = env, inherits = FALSE), l)
+  # "y" should be in the environment of g.
+  newG <- get("g", envir = env, inherits = FALSE)
+  env <- environment(newG)
+  expect_equal(length(ls(env)), 1)
+  actual <- get("y", envir = env, inherits = FALSE)
+  expect_equal(actual, y)
+
+  # Test for function (and variable) definitions.
+  f <- function(x) {
+    g <- function(y) { y * 2 }
+    g(x)
+  }
+  newF <- cleanClosure(f)
+  env <- environment(newF)
+  expect_equal(length(ls(env)), 0)  # "y" and "g" should not be included.
+
+  # Test for overriding variables in base namespace (Issue: SparkR-196).
+  nums <- as.list(1:10)
+  rdd <- parallelize(sc, nums, 2L)
+  t <- 4  # Override base::t in .GlobalEnv.
+  f <- function(x) { x > t }
+  newF <- cleanClosure(f)
+  env <- environment(newF)
+  expect_equal(ls(env), "t")
+  expect_equal(get("t", envir = env, inherits = FALSE), t)
+  actual <- collectRDD(lapply(rdd, f))
+  expected <- as.list(c(rep(FALSE, 4), rep(TRUE, 6)))
+  expect_equal(actual, expected)
+
+  # Test for broadcast variables.
+  a <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+  aBroadcast <- broadcast(sc, a)
+  normMultiply <- function(x) { norm(aBroadcast$value) * x }
+  newnormMultiply <- SparkR:::cleanClosure(normMultiply)
+  env <- environment(newnormMultiply)
+  expect_equal(ls(env), "aBroadcast")
+  expect_equal(get("aBroadcast", envir = env, inherits = FALSE), aBroadcast)
+})
+
+test_that("varargsToJProperties", {
+  jprops <- newJObject("java.util.Properties")
+  expect_true(class(jprops) == "jobj")
+
+  jprops <- varargsToJProperties(abc = "123")
+  expect_true(class(jprops) == "jobj")
+  expect_equal(callJMethod(jprops, "getProperty", "abc"), "123")
+
+  jprops <- varargsToJProperties(abc = "abc", b = 1)
+  expect_equal(callJMethod(jprops, "getProperty", "abc"), "abc")
+  expect_equal(callJMethod(jprops, "getProperty", "b"), "1")
+
+  jprops <- varargsToJProperties()
+  expect_equal(callJMethod(jprops, "size"), 0L)
+})
+
+test_that("convertToJSaveMode", {
+  s <- convertToJSaveMode("error")
+  expect_true(class(s) == "jobj")
+  expect_match(capture.output(print.jobj(s)), "Java ref type org.apache.spark.sql.SaveMode id ")
+  expect_error(convertToJSaveMode("foo"),
+    'mode should be one of "append", "overwrite", "error", "ignore"') #nolint
+})
+
+test_that("captureJVMException", {
+  method <- "getSQLDataType"
+  expect_error(tryCatch(callJStatic("org.apache.spark.sql.api.r.SQLUtils", method,
+                                    "unknown"),
+                        error = function(e) {
+                          captureJVMException(e, method)
+                        }),
+               "Error in getSQLDataType : illegal argument - Invalid type unknown")
+})
+
+test_that("hashCode", {
+  expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA)
+})
+
+test_that("overrideEnvs", {
+  config <- new.env()
+  config[["spark.master"]] <- "foo"
+  config[["config_only"]] <- "ok"
+  param <- new.env()
+  param[["spark.master"]] <- "local"
+  param[["param_only"]] <- "blah"
+  overrideEnvs(config, param)
+  expect_equal(config[["spark.master"]], "local")
+  expect_equal(config[["param_only"]], "blah")
+  expect_equal(config[["config_only"]], "ok")
+})
+
+test_that("rbindRaws", {
+
+  # Mixed Column types
+  r <- serialize(1:5, connection = NULL)
+  r1 <- serialize(1, connection = NULL)
+  r2 <- serialize(letters, connection = NULL)
+  r3 <- serialize(1:10, connection = NULL)
+  inputData <- list(list(1L, r1, "a", r), list(2L, r2, "b", r),
+                    list(3L, r3, "c", r))
+  expected <- data.frame(V1 = 1:3)
+  expected$V2 <- list(r1, r2, r3)
+  expected$V3 <- c("a", "b", "c")
+  expected$V4 <- list(r, r, r)
+  result <- rbindRaws(inputData)
+  expect_equal(expected, result)
+
+  # Single binary column
+  input <- list(list(r1), list(r2), list(r3))
+  expected <- subset(expected, select = "V2")
+  result <- setNames(rbindRaws(input), "V2")
+  expect_equal(expected, result)
+
+})
+
+test_that("varargsToStrEnv", {
+  strenv <- varargsToStrEnv(a = 1, b = 1.1, c = TRUE, d = "abcd")
+  env <- varargsToEnv(a = "1", b = "1.1", c = "true", d = "abcd")
+  expect_equal(strenv, env)
+  expect_error(varargsToStrEnv(a = list(1, "a")),
+               paste0("Unsupported type for a : list. Supported types are logical, ",
+                      "numeric, character and NULL."))
+  expect_warning(varargsToStrEnv(a = 1, 2, 3, 4), "Unnamed arguments ignored: 2, 3, 4.")
+  expect_warning(varargsToStrEnv(1, 2, 3, 4), "Unnamed arguments ignored: 1, 2, 3, 4.")
+})
+
+test_that("basenameSansExtFromUrl", {
+  x <- paste0("http://people.apache.org/~pwendell/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-",
+              "SNAPSHOT-2016_12_09_11_08-eb2d9bf-bin/spark-2.1.1-SNAPSHOT-bin-hadoop2.7.tgz")
+  y <- paste0("http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc2-bin/spark-2.1.0-",
+              "bin-hadoop2.4-without-hive.tgz")
+  expect_equal(basenameSansExtFromUrl(x), "spark-2.1.1-SNAPSHOT-bin-hadoop2.7")
+  expect_equal(basenameSansExtFromUrl(y), "spark-2.1.0-bin-hadoop2.4-without-hive")
+  z <- "http://people.apache.org/~pwendell/spark-releases/spark-2.1.0--hive.tar.gz"
+  expect_equal(basenameSansExtFromUrl(z), "spark-2.1.0--hive")
+})
+
+sparkR.session.stop()
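
The cleanClosure() checks above hinge on one property worth spelling out: the cleaned function gets a fresh environment containing only the free variables it actually references, so unreferenced (possibly large) objects are not serialized and shipped along with the closure. A minimal hedged sketch, again using the package internal via SparkR::::

    y <- 10
    big <- matrix(rnorm(1e6), nrow = 1000)     # large object that f never references
    f <- function(x) { x + y }

    newF <- SparkR:::cleanClosure(f)
    ls(environment(newF))                      # "y" only; "big" is not captured
    get("y", envir = environment(newF))        # 10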

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/R/pkg/tests/run-all.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index 9b75d95..0aefd80 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -21,13 +21,31 @@ library(SparkR)
 # Turn all warnings into errors
 options("warn" = 2)
 
-install.spark()
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
 
 # Setup global test environment
+# Install Spark first to set SPARK_HOME
+install.spark()
+
 sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
-sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
 invisible(lapply(sparkRWhitelistSQLDirs,
                  function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
+sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+
+sparkRTestMaster <- "local[1]"
+if (identical(Sys.getenv("NOT_CRAN"), "true")) {
+  sparkRTestMaster <- ""
+}
 
 test_package("SparkR")
+
+if (identical(Sys.getenv("NOT_CRAN"), "true")) {
+  # for testthat 1.0.2 later, change reporter from "summary" to default_reporter()
+  testthat:::run_tests("SparkR",
+                       file.path(sparkRDir, "pkg", "tests", "fulltests"),
+                       NULL,
+                       "summary")
+}
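
The effect of the gating above: test_package() still runs the lightweight, CRAN-visible tests, while the suites under R/pkg/tests/fulltests only run when NOT_CRAN=true, which the run-tests.sh and AppVeyor changes below set for CI. A hedged sketch of reproducing that locally from an R session, assuming SPARK_HOME points at a built Spark with SparkR installed:

    # Mirror what R/run-tests.sh does: mark the run as non-CRAN so that
    # run-all.R also executes the fulltests directory.
    Sys.setenv(NOT_CRAN = "true", SPARK_TESTING = "1")
    source(file.path(Sys.getenv("SPARK_HOME"), "R", "pkg", "tests", "run-all.R"))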

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/R/pkg/vignettes/sparkr-vignettes.Rmd
----------------------------------------------------------------------
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index d16526d..0e344dd 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -27,6 +27,17 @@ vignette: >
  limitations under the License.
 -->
 
+```{r setup, include=FALSE}
+library(knitr)
+opts_hooks$set(eval = function(options) {
+  # override eval to FALSE only on windows
+  if (.Platform$OS.type == "windows") {
+    options$eval = FALSE
+  }
+  options
+})
+```
+
 ## Overview
 
 SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/).
@@ -65,7 +76,7 @@ We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` fun
 head(carsDF)
 ```
 
-Common data processing operations such as `filter`, `select` are supported on the `SparkDataFrame`.
+Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`.
 ```{r}
 carsSubDF <- select(carsDF, "model", "mpg", "hp")
 carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200)
@@ -364,7 +375,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema)
 head(collect(out))
 ```
 
-Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory.
+Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory.
 
 ```{r}
 out <- dapplyCollect(
@@ -390,7 +401,7 @@ result <- gapply(
 head(arrange(result, "max_mpg", decreasing = TRUE))
 ```
 
-Like gapply, `gapplyCollect` applies a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory.
+Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory.
 
 ```{r}
 result <- gapplyCollect(
@@ -443,20 +454,20 @@ options(ops)
 
 
 ### SQL Queries
-A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.
+A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`.
 
 ```{r}
 people <- read.df(paste0(sparkR.conf("spark.home"),
                          "/examples/src/main/resources/people.json"), "json")
 ```
 
-Register this SparkDataFrame as a temporary view.
+Register this `SparkDataFrame` as a temporary view.
 
 ```{r}
 createOrReplaceTempView(people, "people")
 ```
 
-SQL statements can be run by using the sql method.
+SQL statements can be run using the sql method.
 ```{r}
 teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 head(teenagers)
@@ -718,7 +729,7 @@ head(predict(isoregModel, newDF))
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions:
+We use the `longley` dataset to train a gradient-boosted tree and make predictions:
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
@@ -745,7 +756,7 @@ predictions <- predict(rfModel, df)
 
 `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model.
 
-We use a simulated example to demostrate the usage.
+We use a simulated example to demonstrate the usage.
 ```{r}
 X1 <- data.frame(V1 = rnorm(4), V2 = rnorm(4))
 X2 <- data.frame(V1 = rnorm(6, 3), V2 = rnorm(6, 4))
@@ -776,9 +787,9 @@ head(select(kmeansPredictions, "model", "mpg", "hp", "wt", "prediction"), n = 20
 
 * Topics and documents both exist in a feature space, where feature vectors are vectors of word counts (bag of words).
 
-* Rather than estimating a clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated.
+* Rather than clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated.
 
-To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two type options for the column:
+To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two options for the column:
 
 * character string: This can be a string of the whole document. It will be parsed automatically. Additional stop words can be added in `customizedStopWords`.
 
@@ -826,7 +837,7 @@ perplexity
 
 `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614).
 
-There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, `nonnegative`. For a complete list, refer to the help file.
+There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, and `nonnegative`. For a complete list, refer to the help file.
 
 ```{r}
 ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
@@ -875,7 +886,7 @@ testSummary
 
 
 ### Model Persistence
-The following example shows how to save/load an ML model by SparkR.
+The following example shows how to save/load an ML model in SparkR.
 ```{r, warning=FALSE}
 irisDF <- createDataFrame(iris)
 gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
@@ -906,19 +917,19 @@ There are three main object classes in SparkR you may be working with.
     + `sdf` stores a reference to the corresponding Spark Dataset in the Spark JVM backend.
     + `env` saves the meta-information of the object such as `isCached`.
 
-It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms.
+    It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms.
 
-* `Column`: an S4 class representing column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding Column object in the Spark JVM backend.
+* `Column`: an S4 class representing a column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding `Column` object in the Spark JVM backend.
 
-It can be obtained from a `SparkDataFrame` by `$` operator, `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group.
+    It can be obtained from a `SparkDataFrame` by `$` operator, e.g., `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group.
 
-* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a RelationalGroupedDataset object in the backend.
+* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a `RelationalGroupedDataset` object in the backend.
 
-This is often an intermediate object with group information and followed up by aggregation operations.
+    This is often an intermediate object with group information and followed up by aggregation operations.
 
 ### Architecture
 
-A complete description of architecture can be seen in reference, in particular the paper *SparkR: Scaling R Programs with Spark*.
+A complete description of architecture can be seen in the references, in particular the paper *SparkR: Scaling R Programs with Spark*.
 
 Under the hood of SparkR is Spark SQL engine. This avoids the overheads of running interpreted R code, and the optimized SQL execution engine in Spark uses structural information about data and computation flow to perform a bunch of optimizations to speed up the computation.
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/R/run-tests.sh
----------------------------------------------------------------------
diff --git a/R/run-tests.sh b/R/run-tests.sh
index 5e4dafa..dab6d2d 100755
--- a/R/run-tests.sh
+++ b/R/run-tests.sh
@@ -23,7 +23,7 @@ FAILED=0
 LOGFILE=$FWDIR/unit-tests.out
 rm -f $LOGFILE
 
-SPARK_TESTING=1 $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
 FAILED=$((PIPESTATUS[0]||$FAILED))
 
 NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)"

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/appveyor.yml
----------------------------------------------------------------------
diff --git a/appveyor.yml b/appveyor.yml
index 5e75683..cec6e29 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -45,6 +45,9 @@ install:
 build_script:
   - cmd: mvn -DskipTests -Phadoop-2.6 -Psparkr -Phive -Phive-thriftserver package
 
+environment:
+  NOT_CRAN: true
+
 test_script:
   - cmd: .\bin\spark-submit2.cmd --conf spark.hadoop.fs.default.name="file:///" R\pkg\tests\run-all.R
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4e8ae4/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index d9e176a..499c38a 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -428,8 +428,8 @@ class SparkSubmitSuite
     assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.")
     val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
     val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
-    val rScriptDir =
-      Seq(sparkHome, "R", "pkg", "inst", "tests", "packageInAJarTest.R").mkString(File.separator)
+    val rScriptDir = Seq(
+      sparkHome, "R", "pkg", "tests", "fulltests", "packageInAJarTest.R").mkString(File.separator)
     assert(new File(rScriptDir).exists)
     IvyTestUtils.withRepository(main, None, None, withR = true) { repo =>
       val args = Seq(
@@ -450,7 +450,7 @@ class SparkSubmitSuite
     // Check if the SparkR package is installed
     assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.")
     val rScriptDir =
-      Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", "jarTest.R").mkString(File.separator)
+      Seq(sparkHome, "R", "pkg", "tests", "fulltests", "jarTest.R").mkString(File.separator)
     assert(new File(rScriptDir).exists)
 
     // compile a small jar containing a class that will be called from R code.

