spark git commit: [SPARK-20550][SPARKR] R wrapper for Dataset.alias

2017-05-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 500436b43 -> 1f73d3589


[SPARK-20550][SPARKR] R wrapper for Dataset.alias

## What changes were proposed in this pull request?

- Add SparkR wrapper for `Dataset.alias`.
- Adjust roxygen annotations for `functions.alias` (including example usage).
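
A usage sketch of the new wrapper, mirroring the roxygen example added in `DataFrame.R` below (assumes an active SparkR session created with `sparkR.session()`):

```r
library(SparkR)

# Alias the SparkDataFrame so its columns can be referenced with a qualified
# name, as with SQL "AS".
df <- alias(createDataFrame(mtcars), "mtcars")
avg_mpg <- alias(agg(groupBy(df, df$cyl), avg(df$mpg)), "avg_mpg")

# Qualified references resolve against the aliases.
head(select(df, column("mtcars.mpg")))
head(join(df, avg_mpg, column("mtcars.cyl") == column("avg_mpg.cyl")))
```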

## How was this patch tested?

Unit tests, `check-cran.sh`.

Author: zero323 

Closes #17825 from zero323/SPARK-20550.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1f73d358
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1f73d358
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1f73d358

Branch: refs/heads/master
Commit: 1f73d3589a84b78473598c17ac328a9805896778
Parents: 500436b
Author: zero323 
Authored: Sun May 7 16:24:42 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 7 16:24:42 2017 -0700

--
 R/pkg/R/DataFrame.R   | 24 
 R/pkg/R/column.R  | 16 
 R/pkg/R/generics.R| 11 +++
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 10 ++
 4 files changed, 53 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 1c88692..b56dddc 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -3745,3 +3745,27 @@ setMethod("hint",
 jdf <- callJMethod(x@sdf, "hint", name, parameters)
 dataFrame(jdf)
   })
+
+#' alias
+#'
+#' @aliases alias,SparkDataFrame-method
+#' @family SparkDataFrame functions
+#' @rdname alias
+#' @name alias
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- alias(createDataFrame(mtcars), "mtcars")
+#' avg_mpg <- alias(agg(groupBy(df, df$cyl), avg(df$mpg)), "avg_mpg")
+#'
+#' head(select(df, column("mtcars.mpg")))
+#' head(join(df, avg_mpg, column("mtcars.cyl") == column("avg_mpg.cyl")))
+#' }
+#' @note alias(SparkDataFrame) since 2.3.0
+setMethod("alias",
+  signature(object = "SparkDataFrame"),
+  function(object, data) {
+stopifnot(is.character(data))
+sdf <- callJMethod(object@sdf, "alias", data)
+dataFrame(sdf)
+  })

http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/R/column.R
--
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 147ee4b..5740780 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -130,19 +130,19 @@ createMethods <- function() {
 
 createMethods()
 
-#' alias
-#'
-#' Set a new name for a column
-#'
-#' @param object Column to rename
-#' @param data new name to use
-#'
 #' @rdname alias
 #' @name alias
 #' @aliases alias,Column-method
 #' @family colum_func
 #' @export
-#' @note alias since 1.4.0
+#' @examples \dontrun{
+#' df <- createDataFrame(iris)
+#'
+#' head(select(
+#'   df, alias(df$Sepal_Length, "slength"), alias(df$Petal_Length, "plength")
+#' ))
+#' }
+#' @note alias(Column) since 1.4.0
 setMethod("alias",
   signature(object = "Column"),
   function(object, data) {

http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e835ef3..3c84bf8 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -387,6 +387,17 @@ setGeneric("value", function(bcast) { 
standardGeneric("value") })
 #' @export
 setGeneric("agg", function (x, ...) { standardGeneric("agg") })
 
+#' alias
+#'
+#' Returns a new SparkDataFrame or a Column with an alias set. Equivalent to 
SQL "AS" keyword.
+#'
+#' @name alias
+#' @rdname alias
+#' @param object x a SparkDataFrame or a Column
+#' @param data new name to use
+#' @return a SparkDataFrame or a Column
+NULL
+
 #' @rdname arrange
 #' @export
 setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 232246d..0856bab 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1223,6 +1223,16 @@ test_that("select with column", {
   expect_equal(columns(df4), c("name", "age"))
   expect_equal(count(df4), 3)
 
+  # Test select with alias
+  df5 <- alias(df, "table")
+
+  expect_equal(columns(select(df5, column("table.name"))), "name")
+  expect_equal(columns(select(df5, "table.name")), "name")
+
+  # Test that stats::alias is not masked

spark git commit: [SPARK-20626][SPARKR] address date test warning with timezone on windows

2017-05-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 22691556e -> c24bdaab5


[SPARK-20626][SPARKR] address date test warning with timezone on windows

## What changes were proposed in this pull request?

set timezone on windows
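
The whole change is the guard below; a sketch of it in isolation (GMT is simply the fixed zone the tests standardize on):

```r
# Pin a fixed timezone on Windows so the date/timestamp expectations in the
# SQL tests do not depend on the machine's local zone, which is the apparent
# source of the warning this change addresses.
if (.Platform$OS.type == "windows") {
  Sys.setenv(TZ = "GMT")
}
```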

## How was this patch tested?

unit test, AppVeyor

Author: Felix Cheung 

Closes #17892 from felixcheung/rtimestamptest.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c24bdaab
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c24bdaab
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c24bdaab

Branch: refs/heads/master
Commit: c24bdaab5a234d18b273544cefc44cc4005bf8fc
Parents: 2269155
Author: Felix Cheung 
Authored: Sun May 7 23:10:18 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 7 23:10:18 2017 -0700

--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 
 1 file changed, 4 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c24bdaab/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 0856bab..f517ce6 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -96,6 +96,10 @@ mockLinesMapType <- 
c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}
 mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
 writeLines(mockLinesMapType, mapTypeJsonPath)
 
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
+
 test_that("calling sparkRSQL.init returns existing SQL context", {
   skip_on_cran()
 





spark git commit: [SPARK-20626][SPARKR] address date test warning with timezone on windows

2017-05-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 048e9890c -> 6c5b7e106


[SPARK-20626][SPARKR] address date test warning with timezone on windows

## What changes were proposed in this pull request?

set timezone on windows

## How was this patch tested?

unit test, AppVeyor

Author: Felix Cheung 

Closes #17892 from felixcheung/rtimestamptest.

(cherry picked from commit c24bdaab5a234d18b273544cefc44cc4005bf8fc)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6c5b7e10
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6c5b7e10
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6c5b7e10

Branch: refs/heads/branch-2.2
Commit: 6c5b7e106895302a87cf6522d3c64c3badac699f
Parents: 048e989
Author: Felix Cheung 
Authored: Sun May 7 23:10:18 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 7 23:10:42 2017 -0700

--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 
 1 file changed, 4 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6c5b7e10/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 3c985f2..3f445e2 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -96,6 +96,10 @@ mockLinesMapType <- 
c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}
 mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
 writeLines(mockLinesMapType, mapTypeJsonPath)
 
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
+
 test_that("calling sparkRSQL.init returns existing SQL context", {
   skip_on_cran()
 





spark git commit: [SPARKR][DOC] fix typo in vignettes

2017-05-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 6c5b7e106 -> d8a5a0d34


[SPARKR][DOC] fix typo in vignettes

## What changes were proposed in this pull request?
Fix typo in vignettes

Author: Wayne Zhang 

Closes #17884 from actuaryzhang/typo.

(cherry picked from commit 2fdaeb52bbe2ed1a9127ac72917286e505303c85)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8a5a0d3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8a5a0d3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8a5a0d3

Branch: refs/heads/branch-2.2
Commit: d8a5a0d3420abbb911d8a80dc7165762eb08d779
Parents: 6c5b7e1
Author: Wayne Zhang 
Authored: Sun May 7 23:16:30 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 7 23:16:44 2017 -0700

--
 R/pkg/vignettes/sparkr-vignettes.Rmd | 36 +++
 1 file changed, 18 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d8a5a0d3/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index b933c59..0f6d5c2 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by 
`head` or `showDF` fun
 head(carsDF)
 ```
 
-Common data processing operations such as `filter`, `select` are supported on 
the `SparkDataFrame`.
+Common data processing operations such as `filter` and `select` are supported 
on the `SparkDataFrame`.
 ```{r}
 carsSubDF <- select(carsDF, "model", "mpg", "hp")
 carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200)
@@ -364,7 +364,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg 
* 1.61) }, schema)
 head(collect(out))
 ```
 
-Like `dapply`, apply a function to each partition of a `SparkDataFrame` and 
collect the result back. The output of function should be a `data.frame`, but 
no schema is required in this case. Note that `dapplyCollect` can fail if the 
output of UDF run on all the partition cannot be pulled to the driver and fit 
in driver memory.
+Like `dapply`, `dapplyCollect` can apply a function to each partition of a 
`SparkDataFrame` and collect the result back. The output of the function should 
be a `data.frame`, but no schema is required in this case. Note that 
`dapplyCollect` can fail if the output of the UDF on all partitions cannot be 
pulled into the driver's memory.
 
 ```{r}
 out <- dapplyCollect(
@@ -390,7 +390,7 @@ result <- gapply(
 head(arrange(result, "max_mpg", decreasing = TRUE))
 ```
 
-Like gapply, `gapplyCollect` applies a function to each partition of a 
`SparkDataFrame` and collect the result back to R `data.frame`. The output of 
the function should be a `data.frame` but no schema is required in this case. 
Note that `gapplyCollect` can fail if the output of UDF run on all the 
partition cannot be pulled to the driver and fit in driver memory.
+Like `gapply`, `gapplyCollect` can apply a function to each partition of a 
`SparkDataFrame` and collect the result back to R `data.frame`. The output of 
the function should be a `data.frame` but no schema is required in this case. 
Note that `gapplyCollect` can fail if the output of the UDF on all partitions 
cannot be pulled into the driver's memory.
 
 ```{r}
 result <- gapplyCollect(
@@ -443,20 +443,20 @@ options(ops)
 
 
 ### SQL Queries
-A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and 
that allows you to run SQL queries over its data. The sql function enables 
applications to run SQL queries programmatically and returns the result as a 
`SparkDataFrame`.
+A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so 
that one can run SQL queries over its data. The sql function enables 
applications to run SQL queries programmatically and returns the result as a 
`SparkDataFrame`.
 
 ```{r}
 people <- read.df(paste0(sparkR.conf("spark.home"),
  "/examples/src/main/resources/people.json"), "json")
 ```
 
-Register this SparkDataFrame as a temporary view.
+Register this `SparkDataFrame` as a temporary view.
 
 ```{r}
 createOrReplaceTempView(people, "people")
 ```
 
-SQL statements can be run by using the sql method.
+SQL statements can be run using the sql method.
 ```{r}
 teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 head(teenagers)
@@ -765,7 +765,7 @@ head(predict(isoregModel, newDF))
 `spark.gbt` fits a [gradient-boosted 
tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or 
regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to 
 make predictions, and `write.ml`/`read.ml` to save/load fitted models.

spark git commit: [SPARKR][DOC] fix typo in vignettes

2017-05-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 42cc6d13e -> 2fdaeb52b


[SPARKR][DOC] fix typo in vignettes

## What changes were proposed in this pull request?
Fix typo in vignettes

Author: Wayne Zhang 

Closes #17884 from actuaryzhang/typo.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2fdaeb52
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2fdaeb52
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2fdaeb52

Branch: refs/heads/master
Commit: 2fdaeb52bbe2ed1a9127ac72917286e505303c85
Parents: 42cc6d1
Author: Wayne Zhang 
Authored: Sun May 7 23:16:30 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 7 23:16:30 2017 -0700

--
 R/pkg/vignettes/sparkr-vignettes.Rmd | 36 +++
 1 file changed, 18 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2fdaeb52/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index d38ec4f..49f4ab8 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by 
`head` or `showDF` fun
 head(carsDF)
 ```
 
-Common data processing operations such as `filter`, `select` are supported on 
the `SparkDataFrame`.
+Common data processing operations such as `filter` and `select` are supported 
on the `SparkDataFrame`.
 ```{r}
 carsSubDF <- select(carsDF, "model", "mpg", "hp")
 carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200)
@@ -379,7 +379,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg 
* 1.61) }, schema)
 head(collect(out))
 ```
 
-Like `dapply`, apply a function to each partition of a `SparkDataFrame` and 
collect the result back. The output of function should be a `data.frame`, but 
no schema is required in this case. Note that `dapplyCollect` can fail if the 
output of UDF run on all the partition cannot be pulled to the driver and fit 
in driver memory.
+Like `dapply`, `dapplyCollect` can apply a function to each partition of a 
`SparkDataFrame` and collect the result back. The output of the function should 
be a `data.frame`, but no schema is required in this case. Note that 
`dapplyCollect` can fail if the output of the UDF on all partitions cannot be 
pulled into the driver's memory.
 
 ```{r}
 out <- dapplyCollect(
@@ -405,7 +405,7 @@ result <- gapply(
 head(arrange(result, "max_mpg", decreasing = TRUE))
 ```
 
-Like gapply, `gapplyCollect` applies a function to each partition of a 
`SparkDataFrame` and collect the result back to R `data.frame`. The output of 
the function should be a `data.frame` but no schema is required in this case. 
Note that `gapplyCollect` can fail if the output of UDF run on all the 
partition cannot be pulled to the driver and fit in driver memory.
+Like `gapply`, `gapplyCollect` can apply a function to each partition of a 
`SparkDataFrame` and collect the result back to R `data.frame`. The output of 
the function should be a `data.frame` but no schema is required in this case. 
Note that `gapplyCollect` can fail if the output of the UDF on all partitions 
cannot be pulled into the driver's memory.
 
 ```{r}
 result <- gapplyCollect(
@@ -458,20 +458,20 @@ options(ops)
 
 
 ### SQL Queries
-A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and 
that allows you to run SQL queries over its data. The sql function enables 
applications to run SQL queries programmatically and returns the result as a 
`SparkDataFrame`.
+A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so 
that one can run SQL queries over its data. The sql function enables 
applications to run SQL queries programmatically and returns the result as a 
`SparkDataFrame`.
 
 ```{r}
 people <- read.df(paste0(sparkR.conf("spark.home"),
  "/examples/src/main/resources/people.json"), "json")
 ```
 
-Register this SparkDataFrame as a temporary view.
+Register this `SparkDataFrame` as a temporary view.
 
 ```{r}
 createOrReplaceTempView(people, "people")
 ```
 
-SQL statements can be run by using the sql method.
+SQL statements can be run using the sql method.
 ```{r}
 teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 head(teenagers)
@@ -780,7 +780,7 @@ head(predict(isoregModel, newDF))
 `spark.gbt` fits a [gradient-boosted 
tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or 
regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to 
make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-Similar to the random forest example above, we use the `longl

spark git commit: [SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails

2017-05-08 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 2abfee18b -> b952b44af


[SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails

## What changes were proposed in this pull request?

Change it to check for a relative count, like this test for the catalog APIs: https://github.com/apache/spark/blame/master/R/pkg/inst/tests/testthat/test_sparkSQL.R#L3355
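
A sketch of the relative-count pattern applied in the diff below (`jsonPath` is a fixture already defined by the test suite):

```r
# Record how many tables the catalog already holds, then assert on deltas, so
# tables left behind by other suites (or a persistent catalog) cannot break
# the absolute counts this test used to rely on.
count <- count(listTables())

df <- read.json(jsonPath)
createOrReplaceTempView(df, "table1")

expect_equal(length(tableNames()), count + 1)
expect_equal(count(listTables()), count + 1)
```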

## How was this patch tested?

Unit tests; this needs to be combined with another commit containing the SQL change to verify.

Author: Felix Cheung 

Closes #17905 from felixcheung/rtabletests.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b952b44a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b952b44a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b952b44a

Branch: refs/heads/master
Commit: b952b44af4d243f1e3ad88bccf4af7d04df3fc81
Parents: 2abfee1
Author: Felix Cheung 
Authored: Mon May 8 22:49:40 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 8 22:49:40 2017 -0700

--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b952b44a/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index ab6888e..19aa61e 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -677,26 +677,27 @@ test_that("jsonRDD() on a RDD with json string", {
 })
 
 test_that("test tableNames and tables", {
-  # Making sure there are no registered temp tables from previous tests
-  suppressWarnings(sapply(tableNames(), function(tname) { dropTempTable(tname) 
}))
+  count <- count(listTables())
+
   df <- read.json(jsonPath)
   createOrReplaceTempView(df, "table1")
-  expect_equal(length(tableNames()), 1)
-  expect_equal(length(tableNames("default")), 1)
+  expect_equal(length(tableNames()), count + 1)
+  expect_equal(length(tableNames("default")), count + 1)
+
   tables <- listTables()
-  expect_equal(count(tables), 1)
+  expect_equal(count(tables), count + 1)
   expect_equal(count(tables()), count(tables))
   expect_true("tableName" %in% colnames(tables()))
   expect_true(all(c("tableName", "database", "isTemporary") %in% 
colnames(tables(
 
   suppressWarnings(registerTempTable(df, "table2"))
   tables <- listTables()
-  expect_equal(count(tables), 2)
+  expect_equal(count(tables), count + 2)
   suppressWarnings(dropTempTable("table1"))
   expect_true(dropTempView("table2"))
 
   tables <- listTables()
-  expect_equal(count(tables), 0)
+  expect_equal(count(tables), count + 0)
 })
 
 test_that(





spark git commit: [SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails

2017-05-08 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 4179ffc03 -> 54e074349


[SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails

## What changes were proposed in this pull request?

Change it to check for a relative count, like this test for the catalog APIs: https://github.com/apache/spark/blame/master/R/pkg/inst/tests/testthat/test_sparkSQL.R#L3355

## How was this patch tested?

Unit tests; this needs to be combined with another commit containing the SQL change to verify.

Author: Felix Cheung 

Closes #17905 from felixcheung/rtabletests.

(cherry picked from commit b952b44af4d243f1e3ad88bccf4af7d04df3fc81)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54e07434
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54e07434
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54e07434

Branch: refs/heads/branch-2.2
Commit: 54e07434968624dbb0fb80773356e614b954e52f
Parents: 4179ffc
Author: Felix Cheung 
Authored: Mon May 8 22:49:40 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 8 22:49:53 2017 -0700

--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/54e07434/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 58cd259..ae2969f 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -668,26 +668,27 @@ test_that("jsonRDD() on a RDD with json string", {
 })
 
 test_that("test tableNames and tables", {
-  # Making sure there are no registered temp tables from previous tests
-  suppressWarnings(sapply(tableNames(), function(tname) { dropTempTable(tname) 
}))
+  count <- count(listTables())
+
   df <- read.json(jsonPath)
   createOrReplaceTempView(df, "table1")
-  expect_equal(length(tableNames()), 1)
-  expect_equal(length(tableNames("default")), 1)
+  expect_equal(length(tableNames()), count + 1)
+  expect_equal(length(tableNames("default")), count + 1)
+
   tables <- listTables()
-  expect_equal(count(tables), 1)
+  expect_equal(count(tables), count + 1)
   expect_equal(count(tables()), count(tables))
   expect_true("tableName" %in% colnames(tables()))
   expect_true(all(c("tableName", "database", "isTemporary") %in% 
colnames(tables(
 
   suppressWarnings(registerTempTable(df, "table2"))
   tables <- listTables()
-  expect_equal(count(tables), 2)
+  expect_equal(count(tables), count + 2)
   suppressWarnings(dropTempTable("table1"))
   expect_true(dropTempView("table2"))
 
   tables <- listTables()
-  expect_equal(count(tables), 0)
+  expect_equal(count(tables), count + 0)
 })
 
 test_that(





spark git commit: [SPARK-20670][ML] Simplify FPGrowth transform

2017-05-09 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master a90c5cd82 -> a819dab66


[SPARK-20670][ML] Simplify FPGrowth transform

## What changes were proposed in this pull request?

jira: https://issues.apache.org/jira/browse/SPARK-20670
As suggested by Sean Owen in https://github.com/apache/spark/pull/17130, the transform code in FPGrowthModel can be simplified.

In my tests on public datasets (http://fimi.ua.ac.be/data/), the performance of the new transform code is on par with or better than the old implementation.

## How was this patch tested?

Existing unit test.

Author: Yuhao Yang 

Closes #17912 from hhbyyh/fpgrowthTransform.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a819dab6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a819dab6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a819dab6

Branch: refs/heads/master
Commit: a819dab6681f3a16615039227865af188b3c3f2a
Parents: a90c5cd
Author: Yuhao Yang 
Authored: Tue May 9 23:39:26 2017 -0700
Committer: Felix Cheung 
Committed: Tue May 9 23:39:26 2017 -0700

--
 mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a819dab6/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala 
b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
index 8f00daa..12804d0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala
@@ -269,12 +269,8 @@ class FPGrowthModel private[ml] (
 val predictUDF = udf((items: Seq[_]) => {
   if (items != null) {
 val itemset = items.toSet
-brRules.value.flatMap(rule =>
-  if (items != null && rule._1.forall(item => itemset.contains(item))) 
{
-rule._2.filter(item => !itemset.contains(item))
-  } else {
-Seq.empty
-  }).distinct
+brRules.value.filter(_._1.forall(itemset.contains))
+  .flatMap(_._2.filter(!itemset.contains(_))).distinct
   } else {
 Seq.empty
   }}, dt)





spark git commit: [SPARK-20704][SPARKR] change CRAN test to run single thread

2017-05-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master c8da53560 -> 888b84abe


[SPARK-20704][SPARKR] change CRAN test to run single thread

## What changes were proposed in this pull request?

- [x] need to test by running R CMD check --as-cran
- [x] sanity check vignettes
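
A sketch of the pattern the test files switch to (assuming `sparkRTestMaster` is set to `"local[1]"` in `R/pkg/tests/run-all.R`, which this change also touches):

```r
library(SparkR)

# Run the test session on a single local thread so `R CMD check --as-cran`
# stays deterministic and light on resources.
sparkRTestMaster <- "local[1]"
sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
```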

## How was this patch tested?

Jenkins

Author: Felix Cheung 

Closes #17945 from felixcheung/rchangesforpackage.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/888b84ab
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/888b84ab
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/888b84ab

Branch: refs/heads/master
Commit: 888b84abe8d3fd36c5c2226aeb9e202029936f94
Parents: c8da535
Author: Felix Cheung 
Authored: Thu May 11 23:10:04 2017 -0700
Committer: Felix Cheung 
Committed: Thu May 11 23:10:04 2017 -0700

--
 R/pkg/inst/tests/testthat/jarTest.R |  2 +-
 R/pkg/inst/tests/testthat/packageInAJarTest.R   |  2 +-
 R/pkg/inst/tests/testthat/test_Serde.R  |  2 +-
 R/pkg/inst/tests/testthat/test_binaryFile.R |  2 +-
 R/pkg/inst/tests/testthat/test_binary_function.R|  2 +-
 R/pkg/inst/tests/testthat/test_broadcast.R  |  2 +-
 R/pkg/inst/tests/testthat/test_context.R| 16 
 R/pkg/inst/tests/testthat/test_includePackage.R |  2 +-
 R/pkg/inst/tests/testthat/test_jvm_api.R|  2 +-
 .../inst/tests/testthat/test_mllib_classification.R |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_clustering.R   |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_fpm.R  |  2 +-
 .../inst/tests/testthat/test_mllib_recommendation.R |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_regression.R   |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_stat.R |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_tree.R |  2 +-
 .../inst/tests/testthat/test_parallelize_collect.R  |  2 +-
 R/pkg/inst/tests/testthat/test_rdd.R|  2 +-
 R/pkg/inst/tests/testthat/test_shuffle.R|  2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R   |  2 +-
 R/pkg/inst/tests/testthat/test_streaming.R  |  2 +-
 R/pkg/inst/tests/testthat/test_take.R   |  2 +-
 R/pkg/inst/tests/testthat/test_textFile.R   |  2 +-
 R/pkg/inst/tests/testthat/test_utils.R  |  2 +-
 R/pkg/tests/run-all.R   |  5 +
 R/pkg/vignettes/sparkr-vignettes.Rmd|  3 ++-
 26 files changed, 38 insertions(+), 32 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/jarTest.R
--
diff --git a/R/pkg/inst/tests/testthat/jarTest.R 
b/R/pkg/inst/tests/testthat/jarTest.R
index c9615c8..e2241e0 100644
--- a/R/pkg/inst/tests/testthat/jarTest.R
+++ b/R/pkg/inst/tests/testthat/jarTest.R
@@ -16,7 +16,7 @@
 #
 library(SparkR)
 
-sc <- sparkR.session()
+sc <- sparkR.session(master = "local[1]")
 
 helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass",
   "helloWorld",

http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/packageInAJarTest.R
--
diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R 
b/R/pkg/inst/tests/testthat/packageInAJarTest.R
index 4bc935c..ac70626 100644
--- a/R/pkg/inst/tests/testthat/packageInAJarTest.R
+++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R
@@ -17,7 +17,7 @@
 library(SparkR)
 library(sparkPackageTest)
 
-sparkR.session()
+sparkR.session(master = "local[1]")
 
 run1 <- myfunc(5L)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/test_Serde.R
--
diff --git a/R/pkg/inst/tests/testthat/test_Serde.R 
b/R/pkg/inst/tests/testthat/test_Serde.R
index 518fb7b..6e160fa 100644
--- a/R/pkg/inst/tests/testthat/test_Serde.R
+++ b/R/pkg/inst/tests/testthat/test_Serde.R
@@ -17,7 +17,7 @@
 
 context("SerDe functionality")
 
-sparkSession <- sparkR.session(enableHiveSupport = FALSE)
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = 
FALSE)
 
 test_that("SerDe of primitive types", {
   skip_on_cran()

http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/test_binaryFile.R
--
diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R 
b/R/pkg/inst/tests/testthat/test_binaryFile.R
index 63f54e1..00954fa 100644
--- a/R/pkg/inst/tests/testthat/test_binaryFile.R
+++ b/R/pkg/inst/tests/testthat/test_binaryFile.R
@@ -18,7 +18,7 @@
 context("fu

spark git commit: [SPARK-20704][SPARKR] change CRAN test to run single thread

2017-05-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 a8d981dc5 -> c1e5ac267


[SPARK-20704][SPARKR] change CRAN test to run single thread

## What changes were proposed in this pull request?

- [x] need to test by running R CMD check --as-cran
- [x] sanity check vignettes

## How was this patch tested?

Jenkins

Author: Felix Cheung 

Closes #17945 from felixcheung/rchangesforpackage.

(cherry picked from commit 888b84abe8d3fd36c5c2226aeb9e202029936f94)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1e5ac26
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1e5ac26
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1e5ac26

Branch: refs/heads/branch-2.2
Commit: c1e5ac267fcf73b96c28bb08797de98624df15dc
Parents: a8d981d
Author: Felix Cheung 
Authored: Thu May 11 23:10:04 2017 -0700
Committer: Felix Cheung 
Committed: Thu May 11 23:10:19 2017 -0700

--
 R/pkg/inst/tests/testthat/jarTest.R |  2 +-
 R/pkg/inst/tests/testthat/packageInAJarTest.R   |  2 +-
 R/pkg/inst/tests/testthat/test_Serde.R  |  2 +-
 R/pkg/inst/tests/testthat/test_binaryFile.R |  2 +-
 R/pkg/inst/tests/testthat/test_binary_function.R|  2 +-
 R/pkg/inst/tests/testthat/test_broadcast.R  |  2 +-
 R/pkg/inst/tests/testthat/test_context.R| 16 
 R/pkg/inst/tests/testthat/test_includePackage.R |  2 +-
 R/pkg/inst/tests/testthat/test_jvm_api.R|  2 +-
 .../inst/tests/testthat/test_mllib_classification.R |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_clustering.R   |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_fpm.R  |  2 +-
 .../inst/tests/testthat/test_mllib_recommendation.R |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_regression.R   |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_stat.R |  2 +-
 R/pkg/inst/tests/testthat/test_mllib_tree.R |  2 +-
 .../inst/tests/testthat/test_parallelize_collect.R  |  2 +-
 R/pkg/inst/tests/testthat/test_rdd.R|  2 +-
 R/pkg/inst/tests/testthat/test_shuffle.R|  2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R   |  2 +-
 R/pkg/inst/tests/testthat/test_streaming.R  |  2 +-
 R/pkg/inst/tests/testthat/test_take.R   |  2 +-
 R/pkg/inst/tests/testthat/test_textFile.R   |  2 +-
 R/pkg/inst/tests/testthat/test_utils.R  |  2 +-
 R/pkg/tests/run-all.R   |  5 +
 R/pkg/vignettes/sparkr-vignettes.Rmd|  3 ++-
 26 files changed, 38 insertions(+), 32 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/jarTest.R
--
diff --git a/R/pkg/inst/tests/testthat/jarTest.R 
b/R/pkg/inst/tests/testthat/jarTest.R
index c9615c8..e2241e0 100644
--- a/R/pkg/inst/tests/testthat/jarTest.R
+++ b/R/pkg/inst/tests/testthat/jarTest.R
@@ -16,7 +16,7 @@
 #
 library(SparkR)
 
-sc <- sparkR.session()
+sc <- sparkR.session(master = "local[1]")
 
 helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass",
   "helloWorld",

http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/packageInAJarTest.R
--
diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R 
b/R/pkg/inst/tests/testthat/packageInAJarTest.R
index 4bc935c..ac70626 100644
--- a/R/pkg/inst/tests/testthat/packageInAJarTest.R
+++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R
@@ -17,7 +17,7 @@
 library(SparkR)
 library(sparkPackageTest)
 
-sparkR.session()
+sparkR.session(master = "local[1]")
 
 run1 <- myfunc(5L)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/test_Serde.R
--
diff --git a/R/pkg/inst/tests/testthat/test_Serde.R 
b/R/pkg/inst/tests/testthat/test_Serde.R
index 518fb7b..6e160fa 100644
--- a/R/pkg/inst/tests/testthat/test_Serde.R
+++ b/R/pkg/inst/tests/testthat/test_Serde.R
@@ -17,7 +17,7 @@
 
 context("SerDe functionality")
 
-sparkSession <- sparkR.session(enableHiveSupport = FALSE)
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = 
FALSE)
 
 test_that("SerDe of primitive types", {
   skip_on_cran()

http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/test_binaryFile.R
--
diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R 
b/R/pkg/inst/tests/testthat/test_binaryFile.R
index 63f54e1..00954fa 100644
--- a/R/pkg/inst/tests/test

spark git commit: [SPARK-20619][ML] StringIndexer supports multiple ways to order label

2017-05-12 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 888b84abe -> af40bb115


[SPARK-20619][ML] StringIndexer supports multiple ways to order label

## What changes were proposed in this pull request?

StringIndexer maps labels to numbers according to the descending order of label frequency. Other types of ordering (e.g., alphabetical) may be needed in feature ETL. For example, the ordering will affect the result in one-hot encoding and RFormula.

This PR proposes to support other ordering methods by adding a parameter `stringOrderType` that supports the following four options:
- 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
- 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
- 'alphabetDesc': descending alphabetical order
- 'alphabetAsc': ascending alphabetical order

The default is still descending order of label frequency, so there should be no impact on existing programs.
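
To make the options concrete, a plain-R illustration (not the Spark API) of the order in which labels would receive indices under each setting, for a toy label column:

```r
labels <- c("b", "a", "a", "c", "a", "b")   # counts: a = 3, b = 2, c = 1
freq <- table(labels)

# Each vector lists the labels in the order they would be assigned 0, 1, 2, ...
frequencyDesc <- names(sort(freq, decreasing = TRUE))   # "a" "b" "c"
frequencyAsc  <- names(sort(freq))                      # "c" "b" "a"
alphabetDesc  <- sort(names(freq), decreasing = TRUE)   # "c" "b" "a"
alphabetAsc   <- sort(names(freq))                      # "a" "b" "c"
```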

## How was this patch tested?
new test

Author: Wayne Zhang 

Closes #17879 from actuaryzhang/stringIndexer.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af40bb11
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af40bb11
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af40bb11

Branch: refs/heads/master
Commit: af40bb1159b1f443bf44594c716d2f2dd3c98640
Parents: 888b84a
Author: Wayne Zhang 
Authored: Fri May 12 00:12:47 2017 -0700
Committer: Felix Cheung 
Committed: Fri May 12 00:12:47 2017 -0700

--
 .../apache/spark/ml/feature/StringIndexer.scala | 55 +---
 .../spark/ml/feature/StringIndexerSuite.scala   | 23 
 2 files changed, 71 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/af40bb11/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index 99321bc..b2dc4fc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -59,6 +59,29 @@ private[feature] trait StringIndexerBase extends Params with 
HasInputCol with Ha
   @Since("1.6.0")
   def getHandleInvalid: String = $(handleInvalid)
 
+  /**
+   * Param for how to order labels of string column. The first label after 
ordering is assigned
+   * an index of 0.
+   * Options are:
+   *   - 'frequencyDesc': descending order by label frequency (most frequent 
label assigned 0)
+   *   - 'frequencyAsc': ascending order by label frequency (least frequent 
label assigned 0)
+   *   - 'alphabetDesc': descending alphabetical order
+   *   - 'alphabetAsc': ascending alphabetical order
+   * Default is 'frequencyDesc'.
+   *
+   * @group param
+   */
+  @Since("2.3.0")
+  final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
+"how to order labels of string column. " +
+"The first label after ordering is assigned an index of 0. " +
+s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", 
")}.",
+ParamValidators.inArray(StringIndexer.supportedStringOrderType))
+
+  /** @group getParam */
+  @Since("2.3.0")
+  def getStringOrderType: String = $(stringOrderType)
+
   /** Validates and transforms the input schema. */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
 val inputColName = $(inputCol)
@@ -79,8 +102,9 @@ private[feature] trait StringIndexerBase extends Params with 
HasInputCol with Ha
 /**
  * A label indexer that maps a string column of labels to an ML column of 
label indices.
  * If the input column is numeric, we cast it to string and index the string 
values.
- * The indices are in [0, numLabels), ordered by label frequencies.
- * So the most frequent label gets index 0.
+ * The indices are in [0, numLabels). By default, this is ordered by label 
frequencies
+ * so the most frequent label gets index 0. The ordering behavior is 
controlled by
+ * setting `stringOrderType`.
  *
  * @see `IndexToString` for the inverse transformation
  */
@@ -97,6 +121,11 @@ class StringIndexer @Since("1.4.0") (
   def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
 
   /** @group setParam */
+  @Since("2.3.0")
+  def setStringOrderType(value: String): this.type = set(stringOrderType, 
value)
+  setDefault(stringOrderType, StringIndexer.frequencyDesc)
+
+  /** @group setParam */
   @Since("1.4.0")
   def setInputCol(value: String): this.type = set(inputCol, value)
 
@@ -107,11 +136,17 @@ class StringIndexer @Since("1.4.0") (
   @Since("2.0.0")
   override def fit(dataset: Dataset[_])

spark git commit: [DOCS][SPARKR] Use verbose names for family annotations in functions.R

2017-05-14 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 1283c3d11 -> aa3df1590


[DOCS][SPARKR] Use verbose names for family annotations in functions.R

## What changes were proposed in this pull request?

- Change current short annotations (same as Scala `group`) to verbose names (same as Scala `groupname`).

Before:


![image](https://cloud.githubusercontent.com/assets/1554276/26033909/9a98b596-38b4-11e7-961e-15fd9ea7440d.png)

After:

![image](https://cloud.githubusercontent.com/assets/1554276/26033903/727a9944-38b4-11e7-8873-b09c553f4ec3.png)

- Add missing `family` annotations.
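
A before/after sketch of the annotation convention, taken from the pattern in the diff below:

```r
# Before: short annotation, mirroring Scala's `group`
#' @family normal_funcs

# After: verbose name, mirroring Scala's `groupname`
#' @family non-aggregate functions
```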

## How was this patch tested?

`check-cran.R` (skipping tests), manual inspection.

Author: zero323 

Closes #17976 from zero323/SPARKR-FUNCTIONS-DOCSTRINGS.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aa3df159
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aa3df159
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aa3df159

Branch: refs/heads/master
Commit: aa3df15904f95bc05c513d6f7c186a45db5ffa88
Parents: 1283c3d
Author: zero323 
Authored: Sun May 14 11:43:28 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 14 11:43:28 2017 -0700

--
 R/pkg/R/functions.R | 318 +++
 1 file changed, 159 insertions(+), 159 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/aa3df159/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 2fd2d36..a6c2dea 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -24,7 +24,7 @@ NULL
 #' If the parameter is a \linkS4class{Column}, it is returned unchanged.
 #'
 #' @param x a literal value or a Column.
-#' @family normal_funcs
+#' @family non-aggregate functions
 #' @rdname lit
 #' @name lit
 #' @export
@@ -52,7 +52,7 @@ setMethod("lit", signature("ANY"),
 #'
 #' @rdname abs
 #' @name abs
-#' @family normal_funcs
+#' @family non-aggregate functions
 #' @export
 #' @examples \dontrun{abs(df$c)}
 #' @aliases abs,Column-method
@@ -73,7 +73,7 @@ setMethod("abs",
 #'
 #' @rdname acos
 #' @name acos
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' @examples \dontrun{acos(df$c)}
 #' @aliases acos,Column-method
@@ -113,7 +113,7 @@ setMethod("approxCountDistinct",
 #'
 #' @rdname ascii
 #' @name ascii
-#' @family string_funcs
+#' @family string functions
 #' @export
 #' @aliases ascii,Column-method
 #' @examples \dontrun{\dontrun{ascii(df$c)}}
@@ -134,7 +134,7 @@ setMethod("ascii",
 #'
 #' @rdname asin
 #' @name asin
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' @aliases asin,Column-method
 #' @examples \dontrun{asin(df$c)}
@@ -154,7 +154,7 @@ setMethod("asin",
 #'
 #' @rdname atan
 #' @name atan
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' @aliases atan,Column-method
 #' @examples \dontrun{atan(df$c)}
@@ -172,7 +172,7 @@ setMethod("atan",
 #'
 #' @rdname avg
 #' @name avg
-#' @family agg_funcs
+#' @family aggregate functions
 #' @export
 #' @aliases avg,Column-method
 #' @examples \dontrun{avg(df$c)}
@@ -193,7 +193,7 @@ setMethod("avg",
 #'
 #' @rdname base64
 #' @name base64
-#' @family string_funcs
+#' @family string functions
 #' @export
 #' @aliases base64,Column-method
 #' @examples \dontrun{base64(df$c)}
@@ -214,7 +214,7 @@ setMethod("base64",
 #'
 #' @rdname bin
 #' @name bin
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' @aliases bin,Column-method
 #' @examples \dontrun{bin(df$c)}
@@ -234,7 +234,7 @@ setMethod("bin",
 #'
 #' @rdname bitwiseNOT
 #' @name bitwiseNOT
-#' @family normal_funcs
+#' @family non-aggregate functions
 #' @export
 #' @aliases bitwiseNOT,Column-method
 #' @examples \dontrun{bitwiseNOT(df$c)}
@@ -254,7 +254,7 @@ setMethod("bitwiseNOT",
 #'
 #' @rdname cbrt
 #' @name cbrt
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' @aliases cbrt,Column-method
 #' @examples \dontrun{cbrt(df$c)}
@@ -274,7 +274,7 @@ setMethod("cbrt",
 #'
 #' @rdname ceil
 #' @name ceil
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' @aliases ceil,Column-method
 #' @examples \dontrun{ceil(df$c)}
@@ -292,7 +292,7 @@ setMethod("ceil",
 #'
 #' @rdname coalesce
 #' @name coalesce
-#' @family normal_funcs
+#' @family non-aggregate functions
 #' @export
 #' @aliases coalesce,Column-method
 #' @examples \dontrun{coalesce(df$c, df$d, df$e)}
@@ -324,7 +324,7 @@ col <- function(x) {
 #'
 #' @rdname column
 #' @name column
-#' @family normal_funcs
+#' @family non-aggregate functions
 #' @export
 #' @aliases column,character-method
 #' @examples \dontrun{column("name")}
@@ -342,7 +342,7 @@ setMethod("column",
 #'
 #' @rdname corr
 #' @name corr
-#' @family math_funcs
+#' @family math functions
 #' @export
 #' 

spark git commit: [SPARK-20726][SPARKR] wrapper for SQL broadcast

2017-05-14 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master aa3df1590 -> 5a799fd8c


[SPARK-20726][SPARKR] wrapper for SQL broadcast

## What changes were proposed in this pull request?

- Adds R wrapper for `o.a.s.sql.functions.broadcast`.
- Renames the RDD-level `broadcast` helper in `context.R` to `broadcastRDD`.
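
A usage sketch, mirroring the roxygen example added in `DataFrame.R` below (assumes an active SparkR session):

```r
library(SparkR)

df <- createDataFrame(mtcars)
avg_mpg <- mean(groupBy(createDataFrame(mtcars), "cyl"), "mpg")

# Mark the small side as broadcastable; equivalent to hint(avg_mpg, "broadcast").
head(join(df, broadcast(avg_mpg), df$cyl == avg_mpg$cyl))
```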

## How was this patch tested?

Unit tests, `check-cran.sh`.

Author: zero323 

Closes #17965 from zero323/SPARK-20726.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a799fd8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a799fd8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a799fd8

Branch: refs/heads/master
Commit: 5a799fd8c3664da1fa9821ead6c0e25f561c6a8d
Parents: aa3df15
Author: zero323 
Authored: Sun May 14 13:22:19 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 14 13:22:19 2017 -0700

--
 R/pkg/NAMESPACE|  1 +
 R/pkg/R/DataFrame.R| 29 +
 R/pkg/R/context.R  |  4 ++--
 R/pkg/R/generics.R |  4 
 R/pkg/inst/tests/testthat/test_broadcast.R |  2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R  |  5 +
 R/pkg/inst/tests/testthat/test_utils.R |  2 +-
 7 files changed, 43 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ba0fe77..5c074d3 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -84,6 +84,7 @@ exportClasses("SparkDataFrame")
 exportMethods("arrange",
   "as.data.frame",
   "attach",
+  "broadcast",
   "cache",
   "checkpoint",
   "coalesce",

http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b56dddc..aab2fc1 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -3769,3 +3769,32 @@ setMethod("alias",
 sdf <- callJMethod(object@sdf, "alias", data)
 dataFrame(sdf)
   })
+
+#' broadcast
+#'
+#' Return a new SparkDataFrame marked as small enough for use in broadcast 
joins.
+#'
+#' Equivalent to \code{hint(x, "broadcast")}.
+#'
+#' @param x a SparkDataFrame.
+#' @return a SparkDataFrame.
+#'
+#' @aliases broadcast,SparkDataFrame-method
+#' @family SparkDataFrame functions
+#' @rdname broadcast
+#' @name broadcast
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(mtcars)
+#' avg_mpg <- mean(groupBy(createDataFrame(mtcars), "cyl"), "mpg")
+#'
+#' head(join(df, broadcast(avg_mpg), df$cyl == avg_mpg$cyl))
+#' }
+#' @note broadcast since 2.3.0
+setMethod("broadcast",
+  signature(x = "SparkDataFrame"),
+  function(x) {
+sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", 
x@sdf)
+dataFrame(sdf)
+  })

http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/R/context.R
--
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 50856e3..8349b57 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -258,7 +258,7 @@ includePackage <- function(sc, pkg) {
 #'
 #' # Large Matrix object that we want to broadcast
 #' randomMat <- matrix(nrow=100, ncol=10, data=rnorm(1000))
-#' randomMatBr <- broadcast(sc, randomMat)
+#' randomMatBr <- broadcastRDD(sc, randomMat)
 #'
 #' # Use the broadcast variable inside the function
 #' useBroadcast <- function(x) {
@@ -266,7 +266,7 @@ includePackage <- function(sc, pkg) {
 #' }
 #' sumRDD <- lapply(rdd, useBroadcast)
 #'}
-broadcast <- function(sc, object) {
+broadcastRDD <- function(sc, object) {
   objName <- as.character(substitute(object))
   serializedObj <- serialize(object, connection = NULL)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 3c84bf8..514ca99 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -799,6 +799,10 @@ setGeneric("write.df", function(df, path = NULL, ...) { 
standardGeneric("write.d
 #' @export
 setGeneric("randomSplit", function(x, weights, seed) { 
standardGeneric("randomSplit") })
 
+#' @rdname broadcast
+#' @export
+setGeneric("broadcast", function(x) { standardGeneric("broadcast") })
+
 ## Column Methods ##
 
 #' @rdname columnfunctions

http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/inst/tests/testthat/test_broadcast.R
---

spark git commit: [SPARKR][DOCS][MINOR] Use consistent names in rollup and cube examples

2017-05-19 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master ea3b1e352 -> 2d90c04f2


[SPARKR][DOCS][MINOR] Use consistent names in rollup and cube examples

## What changes were proposed in this pull request?

Rename `carsDF` to `df` in SparkR `rollup` and `cube` examples.

## How was this patch tested?

Manual tests.

Author: zero323 

Closes #17988 from zero323/cube-docs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d90c04f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d90c04f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d90c04f

Branch: refs/heads/master
Commit: 2d90c04f2343b0ce6cb4d76320bf583934cb9993
Parents: ea3b1e3
Author: zero323 
Authored: Fri May 19 11:04:38 2017 -0700
Committer: Felix Cheung 
Committed: Fri May 19 11:04:38 2017 -0700

--
 R/pkg/R/DataFrame.R | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2d90c04f/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index aab2fc1..2b5888f 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -3667,8 +3667,8 @@ setMethod("checkpoint",
 #' mean(cube(df, "cyl", "gear", "am"), "mpg")
 #'
 #' # Following calls are equivalent
-#' agg(cube(carsDF), mean(carsDF$mpg))
-#' agg(carsDF, mean(carsDF$mpg))
+#' agg(cube(df), mean(df$mpg))
+#' agg(df, mean(df$mpg))
 #' }
 #' @note cube since 2.3.0
 #' @seealso \link{agg}, \link{groupBy}, \link{rollup}
@@ -3702,8 +3702,8 @@ setMethod("cube",
 #' mean(rollup(df, "cyl", "gear", "am"), "mpg")
 #'
 #' # Following calls are equivalent
-#' agg(rollup(carsDF), mean(carsDF$mpg))
-#' agg(carsDF, mean(carsDF$mpg))
+#' agg(rollup(df), mean(df$mpg))
+#' agg(df, mean(df$mpg))
 #' }
 #' @note rollup since 2.3.0
 #' @seealso \link{agg}, \link{cube}, \link{groupBy}





spark git commit: [SPARKR] Fix bad examples in DataFrame methods and style issues

2017-05-19 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 2d90c04f2 -> 7f203a248


[SPARKR] Fix bad examples in DataFrame methods and style issues

## What changes were proposed in this pull request?
Some examples in the DataFrame methods are syntactically wrong, even though they are pseudo code. Fix these and some style issues.

Author: Wayne Zhang 

Closes #18003 from actuaryzhang/sparkRDoc3.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f203a24
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f203a24
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f203a24

Branch: refs/heads/master
Commit: 7f203a248f94df6183a4bc4642a3d873171fef29
Parents: 2d90c04
Author: Wayne Zhang 
Authored: Fri May 19 11:18:20 2017 -0700
Committer: Felix Cheung 
Committed: Fri May 19 11:18:20 2017 -0700

--
 R/pkg/R/DataFrame.R  | 14 +++--
 R/pkg/R/WindowSpec.R |  3 ++-
 R/pkg/R/column.R |  6 --
 R/pkg/R/functions.R  | 51 +++
 4 files changed, 48 insertions(+), 26 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 2b5888f..166b398 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -549,7 +549,7 @@ setMethod("registerTempTable",
 #' sparkR.session()
 #' df <- read.df(path, "parquet")
 #' df2 <- read.df(path2, "parquet")
-#' createOrReplaceTempView(df, "table1")
+#' saveAsTable(df, "table1")
 #' insertInto(df2, "table1", overwrite = TRUE)
 #'}
 #' @note insertInto since 1.4.0
@@ -1125,7 +1125,8 @@ setMethod("dim",
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
 #' collected <- collect(df)
-#' firstName <- collected[[1]]$name
+#' class(collected)
+#' firstName <- names(collected)[1]
 #' }
 #' @note collect since 1.4.0
 setMethod("collect",
@@ -2814,7 +2815,7 @@ setMethod("except",
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
 #' write.df(df, "myfile", "parquet", "overwrite")
-#' saveDF(df, parquetPath2, "parquet", mode = saveMode, mergeSchema = 
mergeSchema)
+#' saveDF(df, parquetPath2, "parquet", mode = "append", mergeSchema = TRUE)
 #' }
 #' @note write.df since 1.4.0
 setMethod("write.df",
@@ -3097,8 +3098,8 @@ setMethod("fillna",
 #' @family SparkDataFrame functions
 #' @aliases as.data.frame,SparkDataFrame-method
 #' @rdname as.data.frame
-#' @examples \dontrun{
-#'
+#' @examples
+#' \dontrun{
 #' irisDF <- createDataFrame(iris)
 #' df <- as.data.frame(irisDF[irisDF$Species == "setosa", ])
 #' }
@@ -3175,7 +3176,8 @@ setMethod("with",
 #' @aliases str,SparkDataFrame-method
 #' @family SparkDataFrame functions
 #' @param object a SparkDataFrame
-#' @examples \dontrun{
+#' @examples
+#' \dontrun{
 #' # Create a SparkDataFrame from the Iris dataset
 #' irisDF <- createDataFrame(iris)
 #'

http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/WindowSpec.R
--
diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R
index 4ac83c2..81beac9 100644
--- a/R/pkg/R/WindowSpec.R
+++ b/R/pkg/R/WindowSpec.R
@@ -203,7 +203,8 @@ setMethod("rangeBetween",
 #' @aliases over,Column,WindowSpec-method
 #' @family colum_func
 #' @export
-#' @examples \dontrun{
+#' @examples
+#' \dontrun{
 #'   df <- createDataFrame(mtcars)
 #'
 #'   # Partition by am (transmission) and order by hp (horsepower)

http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/column.R
--
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 5740780..a5c2ea8 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -135,7 +135,8 @@ createMethods()
 #' @aliases alias,Column-method
 #' @family colum_func
 #' @export
-#' @examples \dontrun{
+#' @examples
+#' \dontrun{
 #' df <- createDataFrame(iris)
 #'
 #' head(select(
@@ -244,7 +245,8 @@ setMethod("between", signature(x = "Column"),
 #' @family colum_func
 #' @aliases cast,Column-method
 #'
-#' @examples \dontrun{
+#' @examples
+#' \dontrun{
 #'   cast(df$age, "string")
 #' }
 #' @note cast since 1.4.0

http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index a6c2dea..06a9019 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3257,7 +3257,8 @@ setMethod("when", signature(condition = "Column", value = 
"ANY"),
 #' @aliases ifelse,Column-method
 #' @seealso \link{when}
 #' @export
-#' @examples \dontrun{
+#' @examples
+#' \dontrun{
 #' ifelse(df$a > 1 & df$b > 2, 0, 1)
 #' ifelse(df$a > 1, df$a, 1)
 #' }
@@ -3292,7 +3293,8 @

spark git commit: [SPARK-20736][PYTHON] PySpark StringIndexer supports StringOrderType

2017-05-21 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 9d6661c82 -> 0f2f56c37


[SPARK-20736][PYTHON] PySpark StringIndexer supports StringOrderType

## What changes were proposed in this pull request?
PySpark StringIndexer supports StringOrderType added in #17879.

Author: Wayne Zhang 

Closes #17978 from actuaryzhang/PythonStringIndexer.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f2f56c3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f2f56c3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f2f56c3

Branch: refs/heads/master
Commit: 0f2f56c37b8d09eec2722a5ffba3015d7f3b626f
Parents: 9d6661c
Author: Wayne Zhang 
Authored: Sun May 21 16:51:55 2017 -0700
Committer: Felix Cheung 
Committed: Sun May 21 16:51:55 2017 -0700

--
 python/pyspark/ml/feature.py | 51 +--
 1 file changed, 43 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0f2f56c3/python/pyspark/ml/feature.py
--
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8d25f5b..955bc97 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2082,10 +2082,12 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
 """
 A label indexer that maps a string column of labels to an ML column of label indices.
 If the input column is numeric, we cast it to string and index the string values.
-The indices are in [0, numLabels), ordered by label frequencies.
-So the most frequent label gets index 0.
+The indices are in [0, numLabels). By default, this is ordered by label frequencies
+so the most frequent label gets index 0. The ordering behavior is controlled by
+setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'.
 
->>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error')
+>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error",
+... stringOrderType="frequencyDesc")
 >>> model = stringIndexer.fit(stringIndDf)
 >>> td = model.transform(stringIndDf)
 >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
@@ -2111,26 +2113,45 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid,
 >>> loadedInverter = IndexToString.load(indexToStringPath)
 >>> loadedInverter.getLabels() == inverter.getLabels()
 True
+>>> stringIndexer.getStringOrderType()
+'frequencyDesc'
+>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error",
+... stringOrderType="alphabetDesc")
+>>> model = stringIndexer.fit(stringIndDf)
+>>> td = model.transform(stringIndDf)
+>>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
+... key=lambda x: x[0])
+[(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)]
 
 .. versionadded:: 1.4.0
 """
 
+stringOrderType = Param(Params._dummy(), "stringOrderType",
+"How to order labels of string column. The first 
label after " +
+"ordering is assigned an index of 0. Supported 
options: " +
+"frequencyDesc, frequencyAsc, alphabetDesc, 
alphabetAsc.",
+typeConverter=TypeConverters.toString)
+
 @keyword_only
-def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"):
+def __init__(self, inputCol=None, outputCol=None, handleInvalid="error",
+ stringOrderType="frequencyDesc"):
 """
-__init__(self, inputCol=None, outputCol=None, handleInvalid="error")
+__init__(self, inputCol=None, outputCol=None, handleInvalid="error", \
+ stringOrderType="frequencyDesc")
 """
 super(StringIndexer, self).__init__()
 self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
-self._setDefault(handleInvalid="error")
+self._setDefault(handleInvalid="error", 
stringOrderType="frequencyDesc")
 kwargs = self._input_kwargs
 self.setParams(**kwargs)
 
 @keyword_only
 @since("1.4.0")
-def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"):
+def setParams(self, inputCol=None, outputCol=None, handleInvalid="error",
+  stringOrderType="frequencyDesc"):
 """
-setParams(self, inputCol=None, outputCol=None, handleInvalid="error")
+setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \
+  stringOrderType="frequencyDesc")
   

spark git commit: [SPARK-15767][ML][SPARKR] Decision Tree wrapper in SparkR

2017-05-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 363091100 -> 4be337583


[SPARK-15767][ML][SPARKR] Decision Tree wrapper in SparkR

## What changes were proposed in this pull request?
support decision tree in R
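
A minimal usage sketch of the new wrapper (the call pattern mirrors the example added in the follow-up doc change, SPARK-20849, further down in this digest; the dataset and parameter values here are purely illustrative):

```r
library(SparkR)
# Assumes an active SparkR session, e.g. sparkR.session().

t <- as.data.frame(Titanic)   # illustrative data, not part of this patch
df <- createDataFrame(t)

# Fit a classification tree; type = "regression" is also supported.
dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2)
summary(dtModel)                          # fitted-model summary
predictions <- predict(dtModel, df)       # SparkDataFrame with a prediction column
head(select(predictions, "prediction"))

# Like the other tree wrappers, the fitted model can be saved and reloaded.
modelPath <- tempfile(pattern = "spark-decisiontree", fileext = ".tmp")
write.ml(dtModel, modelPath)
dtModel2 <- read.ml(modelPath)
unlink(modelPath)
```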

## How was this patch tested?
added tests

Author: Zheng RuiFeng 

Closes #17981 from zhengruifeng/dt_r.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4be33758
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4be33758
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4be33758

Branch: refs/heads/master
Commit: 4be33758354e1f95fd1d82a5482f3f00218e8c91
Parents: 3630911
Author: Zheng RuiFeng 
Authored: Mon May 22 10:40:49 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 22 10:40:49 2017 -0700

--
 R/pkg/NAMESPACE |   5 +
 R/pkg/R/generics.R  |   5 +
 R/pkg/R/mllib_tree.R| 240 +++
 R/pkg/R/mllib_utils.R   |  14 +-
 R/pkg/inst/tests/testthat/test_mllib_tree.R |  86 +++
 .../r/DecisionTreeClassificationWrapper.scala   | 152 
 .../ml/r/DecisionTreeRegressionWrapper.scala| 137 +++
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   4 +
 8 files changed, 639 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4be33758/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 5c074d3..4e3fe00 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -63,6 +63,7 @@ exportMethods("glm",
   "spark.als",
   "spark.kstest",
   "spark.logit",
+  "spark.decisionTree",
   "spark.randomForest",
   "spark.gbt",
   "spark.bisectingKmeans",
@@ -414,6 +415,8 @@ export("as.DataFrame",
"print.summary.GeneralizedLinearRegressionModel",
"read.ml",
"print.summary.KSTest",
+   "print.summary.DecisionTreeRegressionModel",
+   "print.summary.DecisionTreeClassificationModel",
"print.summary.RandomForestRegressionModel",
"print.summary.RandomForestClassificationModel",
"print.summary.GBTRegressionModel",
@@ -452,6 +455,8 @@ S3method(print, structField)
 S3method(print, structType)
 S3method(print, summary.GeneralizedLinearRegressionModel)
 S3method(print, summary.KSTest)
+S3method(print, summary.DecisionTreeRegressionModel)
+S3method(print, summary.DecisionTreeClassificationModel)
 S3method(print, summary.RandomForestRegressionModel)
 S3method(print, summary.RandomForestClassificationModel)
 S3method(print, summary.GBTRegressionModel)

http://git-wip-us.apache.org/repos/asf/spark/blob/4be33758/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 514ca99..5630d0c 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1506,6 +1506,11 @@ setGeneric("spark.mlp", function(data, formula, ...) { 
standardGeneric("spark.ml
 #' @export
 setGeneric("spark.naiveBayes", function(data, formula, ...) { 
standardGeneric("spark.naiveBayes") })
 
+#' @rdname spark.decisionTree
+#' @export
+setGeneric("spark.decisionTree",
+   function(data, formula, ...) { standardGeneric("spark.decisionTree") })
+
 #' @rdname spark.randomForest
 #' @export
 setGeneric("spark.randomForest",

http://git-wip-us.apache.org/repos/asf/spark/blob/4be33758/R/pkg/R/mllib_tree.R
--
diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R
index 82279be..2f1220a 100644
--- a/R/pkg/R/mllib_tree.R
+++ b/R/pkg/R/mllib_tree.R
@@ -45,6 +45,20 @@ setClass("RandomForestRegressionModel", representation(jobj 
= "jobj"))
 #' @note RandomForestClassificationModel since 2.1.0
 setClass("RandomForestClassificationModel", representation(jobj = "jobj"))
 
+#' S4 class that represents a DecisionTreeRegressionModel
+#'
+#' @param jobj a Java object reference to the backing Scala DecisionTreeRegressionModel
+#' @export
+#' @note DecisionTreeRegressionModel since 2.3.0
+setClass("DecisionTreeRegressionModel", representation(jobj = "jobj"))
+
+#' S4 class that represents a DecisionTreeClassificationModel
+#'
+#' @param jobj a Java object reference to the backing Scala DecisionTreeClassificationModel
+#' @export
+#' @note DecisionTreeClassificationModel since 2.3.0
+setClass("DecisionTreeClassificationModel", representation(jobj = "jobj"))
+
 # Create the summary of a tree ensemble model (eg. Random Forest, GBT)
 summary.treeEnsemble <- function(model) {
   jobj <- model@jobj
@@ -81,6 +95,36 @@ print.summary.treeEnsemble <- function(x) {
   invisible(x)
 }
 
+# Create the summary 

spark git commit: [SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR

2017-05-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master a2460be9c -> 4dbb63f08


[SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR

## What changes were proposed in this pull request?

- Add a null check to RPackageUtils#checkManifestForR so that jars w/o 
manifests don't NPE.

## How was this patch tested?

- Unit tests and manual tests.

Author: James Shuster 

Closes #18040 from jrshust/feature/r-package-utils.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4dbb63f0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4dbb63f0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4dbb63f0

Branch: refs/heads/master
Commit: 4dbb63f0857a9cfb018cf49e3d1103cacc862ba2
Parents: a2460be
Author: James Shuster 
Authored: Mon May 22 21:41:11 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 22 21:41:11 2017 -0700

--
 .../scala/org/apache/spark/deploy/RPackageUtils.scala |  3 +++
 .../scala/org/apache/spark/deploy/IvyTestUtils.scala  | 14 ++
 .../org/apache/spark/deploy/RPackageUtilsSuite.scala  | 10 ++
 3 files changed, 23 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4dbb63f0/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala 
b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
index 050778a..7d356e8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
@@ -92,6 +92,9 @@ private[deploy] object RPackageUtils extends Logging {
* Exposed for testing.
*/
   private[deploy] def checkManifestForR(jar: JarFile): Boolean = {
+if (jar.getManifest == null) {
+  return false
+}
 val manifest = jar.getManifest.getMainAttributes
 manifest.getValue(hasRPackage) != null && manifest.getValue(hasRPackage).trim == "true"
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/4dbb63f0/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
--
diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala 
b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
index f50cb38..42b8cde 100644
--- a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
@@ -243,16 +243,22 @@ private[deploy] object IvyTestUtils {
   withManifest: Option[Manifest] = None): File = {
 val jarFile = new File(dir, artifactName(artifact, useIvyLayout))
 val jarFileStream = new FileOutputStream(jarFile)
-val manifest = withManifest.getOrElse {
-  val mani = new Manifest()
+val manifest: Manifest = withManifest.getOrElse {
   if (withR) {
+val mani = new Manifest()
 val attr = mani.getMainAttributes
 attr.put(Name.MANIFEST_VERSION, "1.0")
 attr.put(new Name("Spark-HasRPackage"), "true")
+mani
+  } else {
+null
   }
-  mani
 }
-val jarStream = new JarOutputStream(jarFileStream, manifest)
+val jarStream = if (manifest != null) {
+  new JarOutputStream(jarFileStream, manifest)
+} else {
+  new JarOutputStream(jarFileStream)
+}
 
 for (file <- files) {
   val jarEntry = new JarEntry(file._1)

http://git-wip-us.apache.org/repos/asf/spark/blob/4dbb63f0/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala 
b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
index 0055870..5e0bf6d 100644
--- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
@@ -133,6 +133,16 @@ class RPackageUtilsSuite
 }
   }
 
+  test("jars without manifest return false") {
+IvyTestUtils.withRepository(main, None, None) { repo =>
+  val jar = IvyTestUtils.packJar(new File(new URI(repo)), dep1, Nil,
+useIvyLayout = false, withR = false, None)
+  val jarFile = new JarFile(jar)
+  assert(jarFile.getManifest == null, "jar file should have null manifest")
+  assert(!RPackageUtils.checkManifestForR(jarFile), "null manifest should return false")
+}
+  }
+
   test("SparkR zipping works properly") {
 val tempDir = Files.createTempDir()
 Utils.tryWithSafeFinally {



spark git commit: [SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR

2017-05-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 d8328d8d1 -> ddc199eef


[SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR

## What changes were proposed in this pull request?

- Add a null check to RPackageUtils#checkManifestForR so that jars w/o 
manifests don't NPE.

## How was this patch tested?

- Unit tests and manual tests.

Author: James Shuster 

Closes #18040 from jrshust/feature/r-package-utils.

(cherry picked from commit 4dbb63f0857a9cfb018cf49e3d1103cacc862ba2)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ddc199ee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ddc199ee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ddc199ee

Branch: refs/heads/branch-2.2
Commit: ddc199eefbf68223f817a4c756b243362c1a95ca
Parents: d8328d8
Author: James Shuster 
Authored: Mon May 22 21:41:11 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 22 21:41:23 2017 -0700

--
 .../scala/org/apache/spark/deploy/RPackageUtils.scala |  3 +++
 .../scala/org/apache/spark/deploy/IvyTestUtils.scala  | 14 ++
 .../org/apache/spark/deploy/RPackageUtilsSuite.scala  | 10 ++
 3 files changed, 23 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ddc199ee/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
--
diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala 
b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
index 050778a..7d356e8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala
@@ -92,6 +92,9 @@ private[deploy] object RPackageUtils extends Logging {
* Exposed for testing.
*/
   private[deploy] def checkManifestForR(jar: JarFile): Boolean = {
+if (jar.getManifest == null) {
+  return false
+}
 val manifest = jar.getManifest.getMainAttributes
 manifest.getValue(hasRPackage) != null && manifest.getValue(hasRPackage).trim == "true"
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/ddc199ee/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
--
diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala 
b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
index f50cb38..42b8cde 100644
--- a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala
@@ -243,16 +243,22 @@ private[deploy] object IvyTestUtils {
   withManifest: Option[Manifest] = None): File = {
 val jarFile = new File(dir, artifactName(artifact, useIvyLayout))
 val jarFileStream = new FileOutputStream(jarFile)
-val manifest = withManifest.getOrElse {
-  val mani = new Manifest()
+val manifest: Manifest = withManifest.getOrElse {
   if (withR) {
+val mani = new Manifest()
 val attr = mani.getMainAttributes
 attr.put(Name.MANIFEST_VERSION, "1.0")
 attr.put(new Name("Spark-HasRPackage"), "true")
+mani
+  } else {
+null
   }
-  mani
 }
-val jarStream = new JarOutputStream(jarFileStream, manifest)
+val jarStream = if (manifest != null) {
+  new JarOutputStream(jarFileStream, manifest)
+} else {
+  new JarOutputStream(jarFileStream)
+}
 
 for (file <- files) {
   val jarEntry = new JarEntry(file._1)

http://git-wip-us.apache.org/repos/asf/spark/blob/ddc199ee/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
--
diff --git 
a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala 
b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
index 0055870..5e0bf6d 100644
--- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala
@@ -133,6 +133,16 @@ class RPackageUtilsSuite
 }
   }
 
+  test("jars without manifest return false") {
+IvyTestUtils.withRepository(main, None, None) { repo =>
+  val jar = IvyTestUtils.packJar(new File(new URI(repo)), dep1, Nil,
+useIvyLayout = false, withR = false, None)
+  val jarFile = new JarFile(jar)
+  assert(jarFile.getManifest == null, "jar file should have null manifest")
+  assert(!RPackageUtils.checkManifestForR(jarFile), "null manifest should return false")
+}
+  }
+
   test("SparkR zipping works properly") {
 val tempDir = Files.createTempDir()
 Utils.tryWithSafeFinally {



spark git commit: [SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows

2017-05-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 4dbb63f08 -> d06610f99


[SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows

## What changes were proposed in this pull request?

This change skips tests that use the Hadoop libraries while running
on CRAN check with Windows as the operating system. This is to handle
cases where the Hadoop winutils binaries are missing on the target
system. The skipped tests (guarded as sketched after this list) consist of
1. Tests that save, load a model in MLlib
2. Tests that save, load CSV, JSON and Parquet files in SQL
3. Hive tests
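
A minimal sketch of how the skipping works. The helper functions are copied from the `R/pkg/R/utils.R` diff below; the final `if`/`else` is only an illustration of how a Hadoop-dependent test body gets wrapped:

```r
# Helpers added to R/pkg/R/utils.R by this change.
is_cran <- function() {
  !identical(Sys.getenv("NOT_CRAN"), "true")
}

is_windows <- function() {
  .Platform$OS.type == "windows"
}

hadoop_home_set <- function() {
  !identical(Sys.getenv("HADOOP_HOME"), "")
}

not_cran_or_windows_with_hadoop <- function() {
  !is_cran() && (!is_windows() || hadoop_home_set())
}

# Illustrative guard around a Hadoop-dependent test body (e.g. saving a
# model or writing Parquet): it runs everywhere except a CRAN check on
# Windows where HADOOP_HOME (and thus winutils) is not set.
if (not_cran_or_windows_with_hadoop()) {
  message("run the Hadoop-dependent assertions here")
} else {
  message("skipped: CRAN check on Windows without winutils")
}
```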

## How was this patch tested?

Tested by running on a local windows VM with HADOOP_HOME unset. Also testing 
with https://win-builder.r-project.org

Author: Shivaram Venkataraman 

Closes #17966 from shivaram/sparkr-windows-cran.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d06610f9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d06610f9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d06610f9

Branch: refs/heads/master
Commit: d06610f992ccf199928c0a71699fbf4c01705c31
Parents: 4dbb63f
Author: Shivaram Venkataraman 
Authored: Mon May 22 23:04:22 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 22 23:04:22 2017 -0700

--
 R/pkg/R/utils.R |  16 +
 .../tests/testthat/test_mllib_classification.R  |  90 +++--
 .../inst/tests/testthat/test_mllib_clustering.R | 112 +++---
 R/pkg/inst/tests/testthat/test_mllib_fpm.R  |  16 +-
 .../tests/testthat/test_mllib_recommendation.R  |  42 +-
 .../inst/tests/testthat/test_mllib_regression.R |  42 +-
 R/pkg/inst/tests/testthat/test_mllib_tree.R | 112 +++---
 R/pkg/inst/tests/testthat/test_sparkSQL.R   | 396 ++-
 8 files changed, 445 insertions(+), 381 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d06610f9/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index d29af00..ea45e39 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -907,3 +907,19 @@ basenameSansExtFromUrl <- function(url) {
 isAtomicLengthOne <- function(x) {
   is.atomic(x) && length(x) == 1
 }
+
+is_cran <- function() {
+  !identical(Sys.getenv("NOT_CRAN"), "true")
+}
+
+is_windows <- function() {
+  .Platform$OS.type == "windows"
+}
+
+hadoop_home_set <- function() {
+  !identical(Sys.getenv("HADOOP_HOME"), "")
+}
+
+not_cran_or_windows_with_hadoop <- function() {
+  !is_cran() && (!is_windows() || hadoop_home_set())
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/d06610f9/R/pkg/inst/tests/testthat/test_mllib_classification.R
--
diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R 
b/R/pkg/inst/tests/testthat/test_mllib_classification.R
index f3eaeb3..abf8bb2 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_classification.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R
@@ -50,15 +50,17 @@ test_that("spark.svmLinear", {
   expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
 
   # Test model save and load
-  modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  coefs <- summary(model)$coefficients
-  coefs2 <- summary(model2)$coefficients
-  expect_equal(coefs, coefs2)
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
+write.ml(model, modelPath)
+expect_error(write.ml(model, modelPath))
+write.ml(model, modelPath, overwrite = TRUE)
+model2 <- read.ml(modelPath)
+coefs <- summary(model)$coefficients
+coefs2 <- summary(model2)$coefficients
+expect_equal(coefs, coefs2)
+unlink(modelPath)
+  }
 
   # Test prediction with numeric label
   label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
@@ -128,15 +130,17 @@ test_that("spark.logit", {
   expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
 
   # Test model save and load
-  modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  coefs <- summary(model)$coefficients
-  coefs2 <- summary(model2)$coefficients
-  expect_equal(coefs, coefs2)
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+write.ml(model, modelPath)
+expect_error(write.ml(model, modelPath))
+write.ml(model, modelPath, overwrite = TRUE)
+model2 <- read.ml(modelPath)
+coefs <- summa

spark git commit: [SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows

2017-05-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 ddc199eef -> 5e9541a4d


[SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows

## What changes were proposed in this pull request?

This change skips tests that use the Hadoop libraries while running
on CRAN check with Windows as the operating system. This is to handle
cases where the Hadoop winutils binaries are missing on the target
system. The skipped tests consist of
1. Tests that save, load a model in MLlib
2. Tests that save, load CSV, JSON and Parquet files in SQL
3. Hive tests

## How was this patch tested?

Tested by running on a local windows VM with HADOOP_HOME unset. Also testing 
with https://win-builder.r-project.org

Author: Shivaram Venkataraman 

Closes #17966 from shivaram/sparkr-windows-cran.

(cherry picked from commit d06610f992ccf199928c0a71699fbf4c01705c31)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e9541a4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e9541a4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e9541a4

Branch: refs/heads/branch-2.2
Commit: 5e9541a4d4896f7a84755265fa1955e256cda449
Parents: ddc199e
Author: Shivaram Venkataraman 
Authored: Mon May 22 23:04:22 2017 -0700
Committer: Felix Cheung 
Committed: Mon May 22 23:04:34 2017 -0700

--
 R/pkg/R/utils.R |  16 +
 .../tests/testthat/test_mllib_classification.R  |  90 +++--
 .../inst/tests/testthat/test_mllib_clustering.R | 112 +++---
 R/pkg/inst/tests/testthat/test_mllib_fpm.R  |  16 +-
 .../tests/testthat/test_mllib_recommendation.R  |  42 +-
 .../inst/tests/testthat/test_mllib_regression.R |  42 +-
 R/pkg/inst/tests/testthat/test_mllib_tree.R | 112 +++---
 R/pkg/inst/tests/testthat/test_sparkSQL.R   | 396 ++-
 8 files changed, 445 insertions(+), 381 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5e9541a4/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index fbc89e9..b19556a 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -899,3 +899,19 @@ basenameSansExtFromUrl <- function(url) {
 isAtomicLengthOne <- function(x) {
   is.atomic(x) && length(x) == 1
 }
+
+is_cran <- function() {
+  !identical(Sys.getenv("NOT_CRAN"), "true")
+}
+
+is_windows <- function() {
+  .Platform$OS.type == "windows"
+}
+
+hadoop_home_set <- function() {
+  !identical(Sys.getenv("HADOOP_HOME"), "")
+}
+
+not_cran_or_windows_with_hadoop <- function() {
+  !is_cran() && (!is_windows() || hadoop_home_set())
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/5e9541a4/R/pkg/inst/tests/testthat/test_mllib_classification.R
--
diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R 
b/R/pkg/inst/tests/testthat/test_mllib_classification.R
index f3eaeb3..abf8bb2 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_classification.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R
@@ -50,15 +50,17 @@ test_that("spark.svmLinear", {
   expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
 
   # Test model save and load
-  modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  coefs <- summary(model)$coefficients
-  coefs2 <- summary(model2)$coefficients
-  expect_equal(coefs, coefs2)
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
+write.ml(model, modelPath)
+expect_error(write.ml(model, modelPath))
+write.ml(model, modelPath, overwrite = TRUE)
+model2 <- read.ml(modelPath)
+coefs <- summary(model)$coefficients
+coefs2 <- summary(model2)$coefficients
+expect_equal(coefs, coefs2)
+unlink(modelPath)
+  }
 
   # Test prediction with numeric label
   label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
@@ -128,15 +130,17 @@ test_that("spark.logit", {
   expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
 
   # Test model save and load
-  modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
-  write.ml(model, modelPath)
-  expect_error(write.ml(model, modelPath))
-  write.ml(model, modelPath, overwrite = TRUE)
-  model2 <- read.ml(modelPath)
-  coefs <- summary(model)$coefficients
-  coefs2 <- summary(model2)$coefficients
-  expect_equal(coefs, coefs2)
-  unlink(modelPath)
+  if (not_cran_or_windows_with_hadoop()) {
+modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+write.ml(model, modelPath)
+expect_error(write.ml(model, modelPat

spark git commit: [SPARK-20849][DOC][SPARKR] Document R DecisionTree

2017-05-25 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 8ce0d8ffb -> a97c49704


[SPARK-20849][DOC][SPARKR] Document R DecisionTree

## What changes were proposed in this pull request?
1. Add an example for SparkR `decisionTree`.
2. Document it in the user guide.

## How was this patch tested?
local submit

Author: Zheng RuiFeng 

Closes #18067 from zhengruifeng/dt_example.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a97c4970
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a97c4970
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a97c4970

Branch: refs/heads/master
Commit: a97c497045e9102b8eefcd0a0567ee08e61c838c
Parents: 8ce0d8f
Author: Zheng RuiFeng 
Authored: Thu May 25 23:00:50 2017 -0700
Committer: Felix Cheung 
Committed: Thu May 25 23:00:50 2017 -0700

--
 R/pkg/vignettes/sparkr-vignettes.Rmd  | 50 ---
 docs/ml-classification-regression.md  |  7 
 docs/sparkr.md|  1 +
 examples/src/main/r/ml/decisionTree.R | 65 ++
 4 files changed, 108 insertions(+), 15 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a97c4970/R/pkg/vignettes/sparkr-vignettes.Rmd
--
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 13a3991..2301a64 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -503,6 +503,8 @@ SparkR supports the following machine learning models and algorithms.
 
  Tree - Classification and Regression
 
+* Decision Tree
+
 * Gradient-Boosted Trees (GBT)
 
 * Random Forest
@@ -776,16 +778,32 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2)))
 head(predict(isoregModel, newDF))
 ```
 
+ Decision Tree
+
+`spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+We use the `Titanic` dataset to train a decision tree and make predictions:
+
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2)
+summary(dtModel)
+predictions <- predict(dtModel, df)
+```
+
  Gradient-Boosted Trees
 
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-We use the `longley` dataset to train a gradient-boosted tree and make predictions:
+We use the `Titanic` dataset to train a gradient-boosted tree and make predictions:
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2)
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
 ```
@@ -795,11 +813,12 @@ predictions <- predict(gbtModel, df)
 `spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-In the following example, we use the `longley` dataset to train a random forest and make predictions:
+In the following example, we use the `Titanic` dataset to train a random forest and make predictions:
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2)
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
 ```
@@ -965,17 +984,18 @@ Given a `SparkDataFrame`, the test compares continuous data in a given column `t
 specified by parameter `nullHypothesis`.
 Users can call `summary` to get a summary of the test results.
 
-In the following example, we test whether the `longley` dataset's `Armed_Forces` column
+In the following example, we test whether the `Titanic` dataset's `Freq` column
 follows a normal distribution.  We set the parameters of the normal distribution using
 the mean and standard deviation of the sample.
 
-```{r, warning=FALSE}
-df <- createDat

[1/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 5301a19a0 -> dc4c35183


http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_streaming.R
--
diff --git a/R/pkg/tests/fulltests/test_streaming.R 
b/R/pkg/tests/fulltests/test_streaming.R
new file mode 100644
index 000..b20b431
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_streaming.R
@@ -0,0 +1,167 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("Structured Streaming")
+
+# Tests for Structured Streaming functions in SparkR
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+jsonSubDir <- file.path("sparkr-test", "json", "")
+if (.Platform$OS.type == "windows") {
+  # file.path removes the empty separator on Windows, adds it back
+  jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
+}
+jsonDir <- file.path(tempdir(), jsonSubDir)
+dir.create(jsonDir, recursive = TRUE)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+   "{\"name\":\"Andy\", \"age\":30}",
+   "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+writeLines(mockLines, jsonPath)
+
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}")
+jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+
+schema <- structType(structField("name", "string"),
+ structField("age", "integer"),
+ structField("count", "double"))
+
+test_that("read.stream, write.stream, awaitTermination, stopQuery", {
+  skip_on_cran()
+
+  df <- read.stream("json", path = jsonDir, schema = schema, 
maxFilesPerTrigger = 1)
+  expect_true(isStreaming(df))
+  counts <- count(group_by(df, "name"))
+  q <- write.stream(counts, "memory", queryName = "people", outputMode = 
"complete")
+
+  expect_false(awaitTermination(q, 5 * 1000))
+  callJMethod(q@ssq, "processAllAvailable")
+  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
+
+  writeLines(mockLinesNa, jsonPathNa)
+  awaitTermination(q, 5 * 1000)
+  callJMethod(q@ssq, "processAllAvailable")
+  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
+
+  stopQuery(q)
+  expect_true(awaitTermination(q, 1))
+  expect_error(awaitTermination(q), NA)
+})
+
+test_that("print from explain, lastProgress, status, isActive", {
+  skip_on_cran()
+
+  df <- read.stream("json", path = jsonDir, schema = schema)
+  expect_true(isStreaming(df))
+  counts <- count(group_by(df, "name"))
+  q <- write.stream(counts, "memory", queryName = "people2", outputMode = 
"complete")
+
+  awaitTermination(q, 5 * 1000)
+  callJMethod(q@ssq, "processAllAvailable")
+
+  expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
+  expect_true(any(grepl("\"description\" : \"MemorySink\"", 
capture.output(lastProgress(q)
+  expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)
+
+  expect_equal(queryName(q), "people2")
+  expect_true(isActive(q))
+
+  stopQuery(q)
+})
+
+test_that("Stream other format", {
+  skip_on_cran()
+
+  parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+  df <- read.df(jsonPath, "json", schema)
+  write.df(df, parquetPath, "parquet", "overwrite")
+
+  df <- read.stream(path = parquetPath, schema = schema)
+  expect_true(isStreaming(df))
+  counts <- count(group_by(df, "name"))
+  q <- write.stream(counts, "memory", queryName = "people3", outputMode = 
"complete")
+
+  expect_false(awaitTermination(q, 5 * 1000))
+  callJMethod(q@ssq, "processAllAvailable")
+  expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
+
+  expect_equal(queryName(q), "people3")
+  expect_true(any(grepl("\"description\" : 
\"FileStreamSource[[:print:]]+parquet",
+  capture.output(lastProgress(q)
+  expect_true(isActive(q))
+
+  stopQuery(q)
+  expect_true(awaitTermination(q, 1))
+  expect_false(isActive(q))
+
+  unlink(parquetPath)
+})
+
+test_that("Non-streaming DataFrame", {
+  skip_on_cran()
+
+  c <- as.DataFrame(cars)
+  exp

[2/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_sparkSQL.R
--
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R 
b/R/pkg/tests/fulltests/test_sparkSQL.R
new file mode 100644
index 000..c790d02
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -0,0 +1,3474 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("SparkSQL functions")
+
+# Utility function for easily checking the values of a StructField
+checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
+  expect_equal(class(actual), "structField")
+  expect_equal(actual$name(), expectedName)
+  expect_equal(actual$dataType.toString(), expectedType)
+  expect_equal(actual$nullable(), expectedNullable)
+}
+
+markUtf8 <- function(s) {
+  Encoding(s) <- "UTF-8"
+  s
+}
+
+setHiveContext <- function(sc) {
+  if (exists(".testHiveSession", envir = .sparkREnv)) {
+hiveSession <- get(".testHiveSession", envir = .sparkREnv)
+  } else {
+# initialize once and reuse
+ssc <- callJMethod(sc, "sc")
+hiveCtx <- tryCatch({
+  newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
+},
+error = function(err) {
+  skip("Hive is not build with SparkSQL, skipped")
+})
+hiveSession <- callJMethod(hiveCtx, "sparkSession")
+  }
+  previousSession <- get(".sparkRsession", envir = .sparkREnv)
+  assign(".sparkRsession", hiveSession, envir = .sparkREnv)
+  assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
+  hiveSession
+}
+
+unsetHiveContext <- function() {
+  previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
+  assign(".sparkRsession", previousSession, envir = .sparkREnv)
+  remove(".prevSparkRsession", envir = .sparkREnv)
+}
+
+# Tests for SparkSQL functions in SparkR
+
+filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+sparkSession <- if (not_cran_or_windows_with_hadoop()) {
+sparkR.session(master = sparkRTestMaster)
+  } else {
+sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+  }
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", 
"getJavaSparkContext", sparkSession)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+   "{\"name\":\"Andy\", \"age\":30}",
+   "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
+writeLines(mockLines, jsonPath)
+
+# For test nafunctions, like dropna(), fillna(),...
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}",
+ "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
+ "{\"name\":null,\"age\":null,\"height\":null}")
+jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesNa, jsonPathNa)
+
+# For test complex types in DataFrame
+mockLinesComplexType <-
+  c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
+"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
+"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
+complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesComplexType, complexTypeJsonPath)
+
+# For test map type and struct type in DataFrame
+mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
+  "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
+  "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
+mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesMapType, mapTypeJsonPath)
+
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
+
+test_that("calling sparkRSQL.init returns existing SQL context", {
+  skip_on_cran()
+
+  sqlContext <- suppressWarnings(sparkRSQL.init(sc))
+  expect_equal(suppressWarni

[3/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_mllib_fpm.R
--
diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R 
b/R/pkg/tests/fulltests/test_mllib_fpm.R
new file mode 100644
index 000..4e10ca1
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib frequent pattern mining")
+
+# Tests for MLlib frequent pattern mining algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.fpGrowth", {
+  data <- selectExpr(createDataFrame(data.frame(items = c(
+"1,2",
+"1,2",
+"1,2,3",
+"1,3"
+  ))), "split(items, ',') as items")
+
+  model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1)
+
+  itemsets <- collect(spark.freqItemsets(model))
+
+  expected_itemsets <- data.frame(
+items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), 
list("1"))),
+freq = c(2, 2, 3, 3, 4)
+  )
+
+  expect_equivalent(expected_itemsets, itemsets)
+
+  expected_association_rules <- data.frame(
+antecedent = I(list(list("2"), list("3"))),
+consequent = I(list(list("1"), list("1"))),
+confidence = c(1, 1)
+  )
+
+  expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
+
+  new_data <- selectExpr(createDataFrame(data.frame(items = c(
+"1,2",
+"1,3",
+"2,3"
+  ))), "split(items, ',') as items")
+
+  expected_predictions <- data.frame(
+items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))),
+prediction = I(list(list(), list(), list("1")))
+  )
+
+  expect_equivalent(expected_predictions, collect(predict(model, new_data)))
+
+  if (not_cran_or_windows_with_hadoop()) {
+modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
+write.ml(model, modelPath, overwrite = TRUE)
+loaded_model <- read.ml(modelPath)
+
+expect_equivalent(
+  itemsets,
+  collect(spark.freqItemsets(loaded_model)))
+
+unlink(modelPath)
+  }
+
+  model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
+  expect_equal(
+count(spark.freqItemsets(model_without_numpartitions)),
+count(spark.freqItemsets(model))
+  )
+
+})
+
+sparkR.session.stop()

http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_mllib_recommendation.R
--
diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R 
b/R/pkg/tests/fulltests/test_mllib_recommendation.R
new file mode 100644
index 000..cc8064f
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R
@@ -0,0 +1,67 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib recommendation algorithms")
+
+# Tests for MLlib recommendation algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.als", {
+  data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
+   list(2, 1, 1.0), list(2, 2, 5.0))
+  df <- createDataFrame(data, c("user", "item", "score"))
+  model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
+ rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
+  stats <-

[3/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_fpm.R
--
diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R 
b/R/pkg/tests/fulltests/test_mllib_fpm.R
new file mode 100644
index 000..4e10ca1
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib frequent pattern mining")
+
+# Tests for MLlib frequent pattern mining algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.fpGrowth", {
+  data <- selectExpr(createDataFrame(data.frame(items = c(
+"1,2",
+"1,2",
+"1,2,3",
+"1,3"
+  ))), "split(items, ',') as items")
+
+  model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1)
+
+  itemsets <- collect(spark.freqItemsets(model))
+
+  expected_itemsets <- data.frame(
+items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), 
list("1"))),
+freq = c(2, 2, 3, 3, 4)
+  )
+
+  expect_equivalent(expected_itemsets, itemsets)
+
+  expected_association_rules <- data.frame(
+antecedent = I(list(list("2"), list("3"))),
+consequent = I(list(list("1"), list("1"))),
+confidence = c(1, 1)
+  )
+
+  expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
+
+  new_data <- selectExpr(createDataFrame(data.frame(items = c(
+"1,2",
+"1,3",
+"2,3"
+  ))), "split(items, ',') as items")
+
+  expected_predictions <- data.frame(
+items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))),
+prediction = I(list(list(), list(), list("1")))
+  )
+
+  expect_equivalent(expected_predictions, collect(predict(model, new_data)))
+
+  if (not_cran_or_windows_with_hadoop()) {
+modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
+write.ml(model, modelPath, overwrite = TRUE)
+loaded_model <- read.ml(modelPath)
+
+expect_equivalent(
+  itemsets,
+  collect(spark.freqItemsets(loaded_model)))
+
+unlink(modelPath)
+  }
+
+  model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
+  expect_equal(
+count(spark.freqItemsets(model_without_numpartitions)),
+count(spark.freqItemsets(model))
+  )
+
+})
+
+sparkR.session.stop()

http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_recommendation.R
--
diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R 
b/R/pkg/tests/fulltests/test_mllib_recommendation.R
new file mode 100644
index 000..cc8064f
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R
@@ -0,0 +1,67 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib recommendation algorithms")
+
+# Tests for MLlib recommendation algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.als", {
+  data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
+   list(2, 1, 1.0), list(2, 2, 5.0))
+  df <- createDataFrame(data, c("user", "item", "score"))
+  model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
+ rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
+  stats <-

[6/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/inst/tests/testthat/test_mllib_regression.R
--
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R 
b/R/pkg/inst/tests/testthat/test_mllib_regression.R
deleted file mode 100644
index b05fdd3..000
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ /dev/null
@@ -1,480 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib regression algorithms, except for tree-based algorithms")
-
-# Tests for MLlib regression algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("formula of spark.glm", {
-  skip_on_cran()
-
-  training <- suppressWarnings(createDataFrame(iris))
-  # directly calling the spark API
-  # dot minus and intercept vs native glm
-  model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # feature interaction vs native glm
-  model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # glm should work with long formula
-  training <- suppressWarnings(createDataFrame(iris))
-  training$LongLongLongLongLongName <- training$Sepal_Width
-  training$VeryLongLongLongLonLongName <- training$Sepal_Length
-  training$AnotherLongLongLongLongName <- training$Species
-  model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
-AnotherLongLongLongLongName)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("spark.glm and predict", {
-  training <- suppressWarnings(createDataFrame(iris))
-  # gaussian family
-  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # poisson family
-  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = poisson(link = identity))
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-  rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
-data = iris, family = poisson(link = identity)), iris))
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # Gamma family
-  x <- runif(100, -1, 1)
-  y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
-  df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
-  model <- glm(y ~ x, family = Gamma, df)
-  out <- capture.output(print(summary(model)))
-  expect_true(any(grepl("Dispersion parameter for gamma family", out)))
-
-  # tweedie family
-  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = "tweedie", var.power = 1.2, link.power = 0.0)
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-
-  # manual calculation of the R predicted values to avoid dependence on statmod
-  #' library(statmod)
-  #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
-  #' family = tweedie(var.power = 1.2, link.power = 0.0))
-  #' print(coef(rModel))
-
-  rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
-  rVals <- exp(as.numeric(model.

[4/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/inst/tests/testthat/test_streaming.R
--
diff --git a/R/pkg/inst/tests/testthat/test_streaming.R 
b/R/pkg/inst/tests/testthat/test_streaming.R
deleted file mode 100644
index b20b431..000
--- a/R/pkg/inst/tests/testthat/test_streaming.R
+++ /dev/null
@@ -1,167 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("Structured Streaming")
-
-# Tests for Structured Streaming functions in SparkR
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-jsonSubDir <- file.path("sparkr-test", "json", "")
-if (.Platform$OS.type == "windows") {
-  # file.path removes the empty separator on Windows, adds it back
-  jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
-}
-jsonDir <- file.path(tempdir(), jsonSubDir)
-dir.create(jsonDir, recursive = TRUE)
-
-mockLines <- c("{\"name\":\"Michael\"}",
-   "{\"name\":\"Andy\", \"age\":30}",
-   "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-writeLines(mockLines, jsonPath)
-
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}")
-jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-
-schema <- structType(structField("name", "string"),
- structField("age", "integer"),
- structField("count", "double"))
-
-test_that("read.stream, write.stream, awaitTermination, stopQuery", {
-  skip_on_cran()
-
-  df <- read.stream("json", path = jsonDir, schema = schema, 
maxFilesPerTrigger = 1)
-  expect_true(isStreaming(df))
-  counts <- count(group_by(df, "name"))
-  q <- write.stream(counts, "memory", queryName = "people", outputMode = 
"complete")
-
-  expect_false(awaitTermination(q, 5 * 1000))
-  callJMethod(q@ssq, "processAllAvailable")
-  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
-
-  writeLines(mockLinesNa, jsonPathNa)
-  awaitTermination(q, 5 * 1000)
-  callJMethod(q@ssq, "processAllAvailable")
-  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
-
-  stopQuery(q)
-  expect_true(awaitTermination(q, 1))
-  expect_error(awaitTermination(q), NA)
-})
-
-test_that("print from explain, lastProgress, status, isActive", {
-  skip_on_cran()
-
-  df <- read.stream("json", path = jsonDir, schema = schema)
-  expect_true(isStreaming(df))
-  counts <- count(group_by(df, "name"))
-  q <- write.stream(counts, "memory", queryName = "people2", outputMode = 
"complete")
-
-  awaitTermination(q, 5 * 1000)
-  callJMethod(q@ssq, "processAllAvailable")
-
-  expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
-  expect_true(any(grepl("\"description\" : \"MemorySink\"", 
capture.output(lastProgress(q)
-  expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)
-
-  expect_equal(queryName(q), "people2")
-  expect_true(isActive(q))
-
-  stopQuery(q)
-})
-
-test_that("Stream other format", {
-  skip_on_cran()
-
-  parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
-  df <- read.df(jsonPath, "json", schema)
-  write.df(df, parquetPath, "parquet", "overwrite")
-
-  df <- read.stream(path = parquetPath, schema = schema)
-  expect_true(isStreaming(df))
-  counts <- count(group_by(df, "name"))
-  q <- write.stream(counts, "memory", queryName = "people3", outputMode = 
"complete")
-
-  expect_false(awaitTermination(q, 5 * 1000))
-  callJMethod(q@ssq, "processAllAvailable")
-  expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
-
-  expect_equal(queryName(q), "people3")
-  expect_true(any(grepl("\"description\" : 
\"FileStreamSource[[:print:]]+parquet",
-  capture.output(lastProgress(q)
-  expect_true(isActive(q))
-
-  stopQuery(q)
-  expect_true(awaitTermination(q, 1))
-  expect_false(isActive(q))
-
-  unlink(parquetPath)
-})
-
-test_that("Non-streaming DataFrame", {
-  skip_on_cran()
-
-  c <- as.DataFrame(cars)
-  expect_false(isStreaming(c))
-
-  expect_error(write.stream(c, "
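For context, the read.stream/write.stream round trip these tests exercise looks roughly like the following minimal sketch; it assumes a local SparkR session, and the directory and query name are illustrative:

library(SparkR)
sparkR.session(master = "local[2]", enableHiveSupport = FALSE)

inputDir <- tempfile("sparkr-stream-")                 # illustrative streaming source directory
dir.create(inputDir)
writeLines('{"name":"Michael"}', file.path(inputDir, "part-0.json"))

peopleSchema <- structType(structField("name", "string"), structField("age", "integer"))
people <- read.stream("json", path = inputDir, schema = peopleSchema)
stopifnot(isStreaming(people))

counts <- count(group_by(people, "name"))
query <- write.stream(counts, "memory", queryName = "people_counts", outputMode = "complete")
awaitTermination(query, 5 * 1000)                      # give the query a few seconds to make progress
head(sql("SELECT * FROM people_counts"))               # the memory sink is queryable under its queryName
stopQuery(query)
sparkR.session.stop()

The memory sink plus queryName is what lets the assertions above run plain SQL against the streaming aggregation.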

[4/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_streaming.R
--
diff --git a/R/pkg/inst/tests/testthat/test_streaming.R b/R/pkg/inst/tests/testthat/test_streaming.R
deleted file mode 100644
index b20b431..000
--- a/R/pkg/inst/tests/testthat/test_streaming.R
+++ /dev/null
@@ -1,167 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("Structured Streaming")
-
-# Tests for Structured Streaming functions in SparkR
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-jsonSubDir <- file.path("sparkr-test", "json", "")
-if (.Platform$OS.type == "windows") {
-  # file.path removes the empty separator on Windows, adds it back
-  jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
-}
-jsonDir <- file.path(tempdir(), jsonSubDir)
-dir.create(jsonDir, recursive = TRUE)
-
-mockLines <- c("{\"name\":\"Michael\"}",
-   "{\"name\":\"Andy\", \"age\":30}",
-   "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-writeLines(mockLines, jsonPath)
-
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}")
-jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-
-schema <- structType(structField("name", "string"),
- structField("age", "integer"),
- structField("count", "double"))
-
-test_that("read.stream, write.stream, awaitTermination, stopQuery", {
-  skip_on_cran()
-
-  df <- read.stream("json", path = jsonDir, schema = schema, 
maxFilesPerTrigger = 1)
-  expect_true(isStreaming(df))
-  counts <- count(group_by(df, "name"))
-  q <- write.stream(counts, "memory", queryName = "people", outputMode = 
"complete")
-
-  expect_false(awaitTermination(q, 5 * 1000))
-  callJMethod(q@ssq, "processAllAvailable")
-  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
-
-  writeLines(mockLinesNa, jsonPathNa)
-  awaitTermination(q, 5 * 1000)
-  callJMethod(q@ssq, "processAllAvailable")
-  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
-
-  stopQuery(q)
-  expect_true(awaitTermination(q, 1))
-  expect_error(awaitTermination(q), NA)
-})
-
-test_that("print from explain, lastProgress, status, isActive", {
-  skip_on_cran()
-
-  df <- read.stream("json", path = jsonDir, schema = schema)
-  expect_true(isStreaming(df))
-  counts <- count(group_by(df, "name"))
-  q <- write.stream(counts, "memory", queryName = "people2", outputMode = 
"complete")
-
-  awaitTermination(q, 5 * 1000)
-  callJMethod(q@ssq, "processAllAvailable")
-
-  expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
-  expect_true(any(grepl("\"description\" : \"MemorySink\"", 
capture.output(lastProgress(q)
-  expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)
-
-  expect_equal(queryName(q), "people2")
-  expect_true(isActive(q))
-
-  stopQuery(q)
-})
-
-test_that("Stream other format", {
-  skip_on_cran()
-
-  parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
-  df <- read.df(jsonPath, "json", schema)
-  write.df(df, parquetPath, "parquet", "overwrite")
-
-  df <- read.stream(path = parquetPath, schema = schema)
-  expect_true(isStreaming(df))
-  counts <- count(group_by(df, "name"))
-  q <- write.stream(counts, "memory", queryName = "people3", outputMode = 
"complete")
-
-  expect_false(awaitTermination(q, 5 * 1000))
-  callJMethod(q@ssq, "processAllAvailable")
-  expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
-
-  expect_equal(queryName(q), "people3")
-  expect_true(any(grepl("\"description\" : 
\"FileStreamSource[[:print:]]+parquet",
-  capture.output(lastProgress(q)
-  expect_true(isActive(q))
-
-  stopQuery(q)
-  expect_true(awaitTermination(q, 1))
-  expect_false(isActive(q))
-
-  unlink(parquetPath)
-})
-
-test_that("Non-streaming DataFrame", {
-  skip_on_cran()
-
-  c <- as.DataFrame(cars)
-  expect_false(isStreaming(c))
-
-  expect_error(write.stream(c, "

[6/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_regression.R
--
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
deleted file mode 100644
index b05fdd3..000
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ /dev/null
@@ -1,480 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib regression algorithms, except for tree-based algorithms")
-
-# Tests for MLlib regression algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("formula of spark.glm", {
-  skip_on_cran()
-
-  training <- suppressWarnings(createDataFrame(iris))
-  # directly calling the spark API
-  # dot minus and intercept vs native glm
-  model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # feature interaction vs native glm
-  model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # glm should work with long formula
-  training <- suppressWarnings(createDataFrame(iris))
-  training$LongLongLongLongLongName <- training$Sepal_Width
-  training$VeryLongLongLongLonLongName <- training$Sepal_Length
-  training$AnotherLongLongLongLongName <- training$Species
-  model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
-AnotherLongLongLongLongName)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("spark.glm and predict", {
-  training <- suppressWarnings(createDataFrame(iris))
-  # gaussian family
-  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # poisson family
-  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = poisson(link = identity))
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-  rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
-data = iris, family = poisson(link = identity)), iris))
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # Gamma family
-  x <- runif(100, -1, 1)
-  y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
-  df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
-  model <- glm(y ~ x, family = Gamma, df)
-  out <- capture.output(print(summary(model)))
-  expect_true(any(grepl("Dispersion parameter for gamma family", out)))
-
-  # tweedie family
-  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = "tweedie", var.power = 1.2, link.power = 0.0)
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-
-  # manual calculation of the R predicted values to avoid dependence on statmod
-  #' library(statmod)
-  #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
-  #' family = tweedie(var.power = 1.2, link.power = 0.0))
-  #' print(coef(rModel))
-
-  rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
-  rVals <- exp(as.numeric(model.

[5/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
deleted file mode 100644
index d2d5191..000
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ /dev/null
@@ -1,3198 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("SparkSQL functions")
-
-# Utility function for easily checking the values of a StructField
-checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
-  expect_equal(class(actual), "structField")
-  expect_equal(actual$name(), expectedName)
-  expect_equal(actual$dataType.toString(), expectedType)
-  expect_equal(actual$nullable(), expectedNullable)
-}
-
-markUtf8 <- function(s) {
-  Encoding(s) <- "UTF-8"
-  s
-}
-
-setHiveContext <- function(sc) {
-  if (exists(".testHiveSession", envir = .sparkREnv)) {
-hiveSession <- get(".testHiveSession", envir = .sparkREnv)
-  } else {
-# initialize once and reuse
-ssc <- callJMethod(sc, "sc")
-hiveCtx <- tryCatch({
-  newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
-},
-error = function(err) {
-  skip("Hive is not build with SparkSQL, skipped")
-})
-hiveSession <- callJMethod(hiveCtx, "sparkSession")
-  }
-  previousSession <- get(".sparkRsession", envir = .sparkREnv)
-  assign(".sparkRsession", hiveSession, envir = .sparkREnv)
-  assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
-  hiveSession
-}
-
-unsetHiveContext <- function() {
-  previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
-  assign(".sparkRsession", previousSession, envir = .sparkREnv)
-  remove(".prevSparkRsession", envir = .sparkREnv)
-}
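# Illustrative only (not part of the deleted file): a Hive-dependent test would
# typically bracket its body with the helpers above so the previous session is
# always restored; `sc` is the JVM SparkContext handle created further down.
withTestHive <- function(sc, body) {
  setHiveContext(sc)
  on.exit(unsetHiveContext(), add = TRUE)
  body()
}
withTestHive(sc, function() {
  sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")  # hypothetical table
  head(sql("SHOW TABLES"))
})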
-
-# Tests for SparkSQL functions in SparkR
-
-filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
-sparkSession <- if (not_cran_or_windows_with_hadoop()) {
-sparkR.session(master = sparkRTestMaster)
-  } else {
-sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-  }
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockLines <- c("{\"name\":\"Michael\"}",
-   "{\"name\":\"Andy\", \"age\":30}",
-   "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
-orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
-writeLines(mockLines, jsonPath)
-
-# For test nafunctions, like dropna(), fillna(),...
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}",
- "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
- "{\"name\":null,\"age\":null,\"height\":null}")
-jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesNa, jsonPathNa)
-
-# For test complex types in DataFrame
-mockLinesComplexType <-
-  c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
-"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
-"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
-complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesComplexType, complexTypeJsonPath)
-
-# For test map type and struct type in DataFrame
-mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
-  "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
-  "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
-mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesMapType, mapTypeJsonPath)
-
-if (.Platform$OS.type == "windows") {
-  Sys.setenv(TZ = "GMT")
-}
-
-test_that("calling sparkRSQL.init returns existing SQL context", {
-  skip_on_cran()
-
-  sqlContext <- suppressWarnings(sparkRSQL.init(sc))
-  expect
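To make the helpers above concrete, a short sketch of how checkStructField() and an explicit schema are typically exercised; it assumes a running SparkR session, and the file path is illustrative:

jsonFile <- tempfile(fileext = ".json")
writeLines(c('{"name":"Andy","age":30}', '{"name":"Justin","age":19}'), jsonFile)
peopleSchema <- structType(structField("name", "string"), structField("age", "integer"))
people <- read.df(jsonFile, "json", peopleSchema)
checkStructField(schema(people)$fields()[[2]], "age", "IntegerType", TRUE)
head(dropna(people, cols = "age"))   # the nafunctions mentioned above operate on the same frame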

[1/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 815a0820b -> 0b0be47e7


http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_streaming.R
--
diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R
new file mode 100644
index 000..b20b431
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_streaming.R
@@ -0,0 +1,167 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("Structured Streaming")
+
+# Tests for Structured Streaming functions in SparkR
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+jsonSubDir <- file.path("sparkr-test", "json", "")
+if (.Platform$OS.type == "windows") {
+  # file.path removes the empty separator on Windows, adds it back
+  jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
+}
+jsonDir <- file.path(tempdir(), jsonSubDir)
+dir.create(jsonDir, recursive = TRUE)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+   "{\"name\":\"Andy\", \"age\":30}",
+   "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+writeLines(mockLines, jsonPath)
+
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}")
+jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+
+schema <- structType(structField("name", "string"),
+ structField("age", "integer"),
+ structField("count", "double"))
+
+test_that("read.stream, write.stream, awaitTermination, stopQuery", {
+  skip_on_cran()
+
+  df <- read.stream("json", path = jsonDir, schema = schema, 
maxFilesPerTrigger = 1)
+  expect_true(isStreaming(df))
+  counts <- count(group_by(df, "name"))
+  q <- write.stream(counts, "memory", queryName = "people", outputMode = 
"complete")
+
+  expect_false(awaitTermination(q, 5 * 1000))
+  callJMethod(q@ssq, "processAllAvailable")
+  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
+
+  writeLines(mockLinesNa, jsonPathNa)
+  awaitTermination(q, 5 * 1000)
+  callJMethod(q@ssq, "processAllAvailable")
+  expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
+
+  stopQuery(q)
+  expect_true(awaitTermination(q, 1))
+  expect_error(awaitTermination(q), NA)
+})
+
+test_that("print from explain, lastProgress, status, isActive", {
+  skip_on_cran()
+
+  df <- read.stream("json", path = jsonDir, schema = schema)
+  expect_true(isStreaming(df))
+  counts <- count(group_by(df, "name"))
+  q <- write.stream(counts, "memory", queryName = "people2", outputMode = 
"complete")
+
+  awaitTermination(q, 5 * 1000)
+  callJMethod(q@ssq, "processAllAvailable")
+
+  expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
+  expect_true(any(grepl("\"description\" : \"MemorySink\"", 
capture.output(lastProgress(q)
+  expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)
+
+  expect_equal(queryName(q), "people2")
+  expect_true(isActive(q))
+
+  stopQuery(q)
+})
+
+test_that("Stream other format", {
+  skip_on_cran()
+
+  parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+  df <- read.df(jsonPath, "json", schema)
+  write.df(df, parquetPath, "parquet", "overwrite")
+
+  df <- read.stream(path = parquetPath, schema = schema)
+  expect_true(isStreaming(df))
+  counts <- count(group_by(df, "name"))
+  q <- write.stream(counts, "memory", queryName = "people3", outputMode = 
"complete")
+
+  expect_false(awaitTermination(q, 5 * 1000))
+  callJMethod(q@ssq, "processAllAvailable")
+  expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
+
+  expect_equal(queryName(q), "people3")
+  expect_true(any(grepl("\"description\" : 
\"FileStreamSource[[:print:]]+parquet",
+  capture.output(lastProgress(q)
+  expect_true(isActive(q))
+
+  stopQuery(q)
+  expect_true(awaitTermination(q, 1))
+  expect_false(isActive(q))
+
+  unlink(parquetPath)
+})
+
+test_that("Non-streaming DataFrame", {
+  skip_on_cran()
+
+  c <- as.DataFrame(cars)
+ 

[5/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
deleted file mode 100644
index c790d02..000
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ /dev/null
@@ -1,3474 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("SparkSQL functions")
-
-# Utility function for easily checking the values of a StructField
-checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
-  expect_equal(class(actual), "structField")
-  expect_equal(actual$name(), expectedName)
-  expect_equal(actual$dataType.toString(), expectedType)
-  expect_equal(actual$nullable(), expectedNullable)
-}
-
-markUtf8 <- function(s) {
-  Encoding(s) <- "UTF-8"
-  s
-}
-
-setHiveContext <- function(sc) {
-  if (exists(".testHiveSession", envir = .sparkREnv)) {
-hiveSession <- get(".testHiveSession", envir = .sparkREnv)
-  } else {
-# initialize once and reuse
-ssc <- callJMethod(sc, "sc")
-hiveCtx <- tryCatch({
-  newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
-},
-error = function(err) {
-  skip("Hive is not build with SparkSQL, skipped")
-})
-hiveSession <- callJMethod(hiveCtx, "sparkSession")
-  }
-  previousSession <- get(".sparkRsession", envir = .sparkREnv)
-  assign(".sparkRsession", hiveSession, envir = .sparkREnv)
-  assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
-  hiveSession
-}
-
-unsetHiveContext <- function() {
-  previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
-  assign(".sparkRsession", previousSession, envir = .sparkREnv)
-  remove(".prevSparkRsession", envir = .sparkREnv)
-}
-
-# Tests for SparkSQL functions in SparkR
-
-filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
-sparkSession <- if (not_cran_or_windows_with_hadoop()) {
-sparkR.session(master = sparkRTestMaster)
-  } else {
-sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-  }
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockLines <- c("{\"name\":\"Michael\"}",
-   "{\"name\":\"Andy\", \"age\":30}",
-   "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
-orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
-writeLines(mockLines, jsonPath)
-
-# For test nafunctions, like dropna(), fillna(),...
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}",
- "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
- "{\"name\":null,\"age\":null,\"height\":null}")
-jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesNa, jsonPathNa)
-
-# For test complex types in DataFrame
-mockLinesComplexType <-
-  c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
-"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
-"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
-complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesComplexType, complexTypeJsonPath)
-
-# For test map type and struct type in DataFrame
-mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
-  "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
-  "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
-mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesMapType, mapTypeJsonPath)
-
-if (.Platform$OS.type == "windows") {
-  Sys.setenv(TZ = "GMT")
-}
-
-test_that("calling sparkRSQL.init returns existing SQL context", {
-  skip_on_cran()
-
-  sqlContext <- suppressWarnings(sparkRSQL.init(sc))
-  expect

[7/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
[SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

## What changes were proposed in this pull request?

Move all existing tests to a non-installed directory so that they never run when the SparkR package is installed

For a follow-up PR:
- remove all skip_on_cran() calls in tests
- clean up test timer
- improve or change basic tests that do run on CRAN (if anyone has suggestion)

It looks like `R CMD build pkg` will still put pkg\tests (i.e. the full tests) into the source 
package, but `R CMD INSTALL` on such a source package does not install these tests 
(and so `R CMD check` does not run them).
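For reference, the skip_on_cran() calls mentioned above follow the standard testthat pattern: the test body is skipped unless the NOT_CRAN environment variable is set to "true", which the full (non-CRAN) runs do. A minimal sketch:

library(testthat)

test_that("an expensive test that should not run on CRAN", {
  skip_on_cran()          # no-op when NOT_CRAN=true, otherwise the whole test is skipped
  expect_equal(1 + 1, 2)
})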

## How was this patch tested?

- [x] unit tests, Jenkins
- [x] AppVeyor
- [x] make a source package, install it, `R CMD check` it - verify the full 
tests are not installed or run

Author: Felix Cheung 

Closes #18264 from felixcheung/rtestset.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc4c3518
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc4c3518
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc4c3518

Branch: refs/heads/master
Commit: dc4c351837879dab26ad8fb471dc51c06832a9e4
Parents: 5301a19
Author: Felix Cheung 
Authored: Sun Jun 11 00:00:33 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 11 00:00:33 2017 -0700

--
 R/pkg/inst/tests/testthat/jarTest.R |   32 -
 R/pkg/inst/tests/testthat/packageInAJarTest.R   |   30 -
 R/pkg/inst/tests/testthat/test_Serde.R  |   85 -
 R/pkg/inst/tests/testthat/test_Windows.R|   32 -
 R/pkg/inst/tests/testthat/test_basic.R  |   90 +
 R/pkg/inst/tests/testthat/test_binaryFile.R |  100 -
 .../inst/tests/testthat/test_binary_function.R  |  110 -
 R/pkg/inst/tests/testthat/test_broadcast.R  |   55 -
 R/pkg/inst/tests/testthat/test_client.R |   51 -
 R/pkg/inst/tests/testthat/test_context.R|  226 --
 R/pkg/inst/tests/testthat/test_includePackage.R |   64 -
 R/pkg/inst/tests/testthat/test_jvm_api.R|   36 -
 .../tests/testthat/test_mllib_classification.R  |  396 --
 .../inst/tests/testthat/test_mllib_clustering.R |  328 --
 R/pkg/inst/tests/testthat/test_mllib_fpm.R  |   85 -
 .../tests/testthat/test_mllib_recommendation.R  |   67 -
 .../inst/tests/testthat/test_mllib_regression.R |  480 ---
 R/pkg/inst/tests/testthat/test_mllib_stat.R |   53 -
 R/pkg/inst/tests/testthat/test_mllib_tree.R |  320 --
 .../tests/testthat/test_parallelize_collect.R   |  120 -
 R/pkg/inst/tests/testthat/test_rdd.R|  906 -
 R/pkg/inst/tests/testthat/test_shuffle.R|  248 --
 R/pkg/inst/tests/testthat/test_sparkR.R |   48 -
 R/pkg/inst/tests/testthat/test_sparkSQL.R   | 3474 --
 R/pkg/inst/tests/testthat/test_streaming.R  |  167 -
 R/pkg/inst/tests/testthat/test_take.R   |   71 -
 R/pkg/inst/tests/testthat/test_textFile.R   |  182 -
 R/pkg/inst/tests/testthat/test_utils.R  |  248 --
 R/pkg/tests/fulltests/jarTest.R |   32 +
 R/pkg/tests/fulltests/packageInAJarTest.R   |   30 +
 R/pkg/tests/fulltests/test_Serde.R  |   85 +
 R/pkg/tests/fulltests/test_Windows.R|   32 +
 R/pkg/tests/fulltests/test_binaryFile.R |  100 +
 R/pkg/tests/fulltests/test_binary_function.R|  110 +
 R/pkg/tests/fulltests/test_broadcast.R  |   55 +
 R/pkg/tests/fulltests/test_client.R |   51 +
 R/pkg/tests/fulltests/test_context.R|  226 ++
 R/pkg/tests/fulltests/test_includePackage.R |   64 +
 R/pkg/tests/fulltests/test_jvm_api.R|   36 +
 .../tests/fulltests/test_mllib_classification.R |  396 ++
 R/pkg/tests/fulltests/test_mllib_clustering.R   |  328 ++
 R/pkg/tests/fulltests/test_mllib_fpm.R  |   85 +
 .../tests/fulltests/test_mllib_recommendation.R |   67 +
 R/pkg/tests/fulltests/test_mllib_regression.R   |  480 +++
 R/pkg/tests/fulltests/test_mllib_stat.R |   53 +
 R/pkg/tests/fulltests/test_mllib_tree.R |  320 ++
 .../tests/fulltests/test_parallelize_collect.R  |  120 +
 R/pkg/tests/fulltests/test_rdd.R|  906 +
 R/pkg/tests/fulltests/test_shuffle.R|  248 ++
 R/pkg/tests/fulltests/test_sparkR.R |   48 +
 R/pkg/tests/fulltests/test_sparkSQL.R   | 3474 ++
 R/pkg/tests/fulltests/test_streaming.R  |  167 +
 R/pkg/tests/fulltests/test_take.R   |   71 +
 R/pkg/tests/fulltests/test_textFile.R   |  182 +
 R/pkg/tests/fulltests/test_utils.R  |  248 ++
 R/pkg/tests/run-all.R   |8 +
 56 files changed, 8112 insertions(+), 8014 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/jarTest.R

[2/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_sparkSQL.R
--
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
new file mode 100644
index 000..d2d5191
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -0,0 +1,3198 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("SparkSQL functions")
+
+# Utility function for easily checking the values of a StructField
+checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
+  expect_equal(class(actual), "structField")
+  expect_equal(actual$name(), expectedName)
+  expect_equal(actual$dataType.toString(), expectedType)
+  expect_equal(actual$nullable(), expectedNullable)
+}
+
+markUtf8 <- function(s) {
+  Encoding(s) <- "UTF-8"
+  s
+}
+
+setHiveContext <- function(sc) {
+  if (exists(".testHiveSession", envir = .sparkREnv)) {
+hiveSession <- get(".testHiveSession", envir = .sparkREnv)
+  } else {
+# initialize once and reuse
+ssc <- callJMethod(sc, "sc")
+hiveCtx <- tryCatch({
+  newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
+},
+error = function(err) {
+  skip("Hive is not build with SparkSQL, skipped")
+})
+hiveSession <- callJMethod(hiveCtx, "sparkSession")
+  }
+  previousSession <- get(".sparkRsession", envir = .sparkREnv)
+  assign(".sparkRsession", hiveSession, envir = .sparkREnv)
+  assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
+  hiveSession
+}
+
+unsetHiveContext <- function() {
+  previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
+  assign(".sparkRsession", previousSession, envir = .sparkREnv)
+  remove(".prevSparkRsession", envir = .sparkREnv)
+}
+
+# Tests for SparkSQL functions in SparkR
+
+filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+sparkSession <- if (not_cran_or_windows_with_hadoop()) {
+sparkR.session(master = sparkRTestMaster)
+  } else {
+sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+  }
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+   "{\"name\":\"Andy\", \"age\":30}",
+   "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
+writeLines(mockLines, jsonPath)
+
+# For test nafunctions, like dropna(), fillna(),...
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}",
+ "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
+ "{\"name\":null,\"age\":null,\"height\":null}")
+jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesNa, jsonPathNa)
+
+# For test complex types in DataFrame
+mockLinesComplexType <-
+  c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
+"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
+"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
+complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesComplexType, complexTypeJsonPath)
+
+# For test map type and struct type in DataFrame
+mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
+  "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
+  "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
+mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesMapType, mapTypeJsonPath)
+
+if (.Platform$OS.type == "windows") {
+  Sys.setenv(TZ = "GMT")
+}
+
+test_that("calling sparkRSQL.init returns existing SQL context", {
+  skip_on_cran()
+
+  sqlContext <- suppressWarnings(sparkRSQL.init(sc))
+  expect_equal(suppressWarni

[7/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

2017-06-11 Thread felixcheung
[SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN

## What changes were proposed in this pull request?

Move all existing tests to a non-installed directory so that they never run when the SparkR package is installed

For a follow-up PR:
- remove all skip_on_cran() calls in tests
- clean up test timer
- improve or change basic tests that do run on CRAN (if anyone has suggestion)

It looks like `R CMD build pkg` will still put pkg\tests (i.e. the full tests) into the source 
package, but `R CMD INSTALL` on such a source package does not install these tests 
(and so `R CMD check` does not run them).

## How was this patch tested?

- [x] unit tests, Jenkins
- [x] AppVeyor
- [x] make a source package, install it, `R CMD check` it - verify the full 
tests are not installed or run

Author: Felix Cheung 

Closes #18264 from felixcheung/rtestset.

(cherry picked from commit dc4c351837879dab26ad8fb471dc51c06832a9e4)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b0be47e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b0be47e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b0be47e

Branch: refs/heads/branch-2.2
Commit: 0b0be47e7b742d96810c60b19a9aa920242e5224
Parents: 815a082
Author: Felix Cheung 
Authored: Sun Jun 11 00:00:33 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 11 00:00:45 2017 -0700

--
 R/pkg/inst/tests/testthat/jarTest.R |   32 -
 R/pkg/inst/tests/testthat/packageInAJarTest.R   |   30 -
 R/pkg/inst/tests/testthat/test_Serde.R  |   85 -
 R/pkg/inst/tests/testthat/test_Windows.R|   32 -
 R/pkg/inst/tests/testthat/test_basic.R  |   90 +
 R/pkg/inst/tests/testthat/test_binaryFile.R |  100 -
 .../inst/tests/testthat/test_binary_function.R  |  110 -
 R/pkg/inst/tests/testthat/test_broadcast.R  |   55 -
 R/pkg/inst/tests/testthat/test_client.R |   51 -
 R/pkg/inst/tests/testthat/test_context.R|  226 --
 R/pkg/inst/tests/testthat/test_includePackage.R |   64 -
 R/pkg/inst/tests/testthat/test_jvm_api.R|   36 -
 .../tests/testthat/test_mllib_classification.R  |  396 ---
 .../inst/tests/testthat/test_mllib_clustering.R |  328 --
 R/pkg/inst/tests/testthat/test_mllib_fpm.R  |   85 -
 .../tests/testthat/test_mllib_recommendation.R  |   67 -
 .../inst/tests/testthat/test_mllib_regression.R |  480 ---
 R/pkg/inst/tests/testthat/test_mllib_stat.R |   53 -
 R/pkg/inst/tests/testthat/test_mllib_tree.R |  226 --
 .../tests/testthat/test_parallelize_collect.R   |  120 -
 R/pkg/inst/tests/testthat/test_rdd.R|  906 -
 R/pkg/inst/tests/testthat/test_shuffle.R|  248 --
 R/pkg/inst/tests/testthat/test_sparkR.R |   48 -
 R/pkg/inst/tests/testthat/test_sparkSQL.R   | 3198 --
 R/pkg/inst/tests/testthat/test_streaming.R  |  167 -
 R/pkg/inst/tests/testthat/test_take.R   |   71 -
 R/pkg/inst/tests/testthat/test_textFile.R   |  182 -
 R/pkg/inst/tests/testthat/test_utils.R  |  247 --
 R/pkg/tests/fulltests/jarTest.R |   32 +
 R/pkg/tests/fulltests/packageInAJarTest.R   |   30 +
 R/pkg/tests/fulltests/test_Serde.R  |   85 +
 R/pkg/tests/fulltests/test_Windows.R|   32 +
 R/pkg/tests/fulltests/test_binaryFile.R |  100 +
 R/pkg/tests/fulltests/test_binary_function.R|  110 +
 R/pkg/tests/fulltests/test_broadcast.R  |   55 +
 R/pkg/tests/fulltests/test_client.R |   51 +
 R/pkg/tests/fulltests/test_context.R|  226 ++
 R/pkg/tests/fulltests/test_includePackage.R |   64 +
 R/pkg/tests/fulltests/test_jvm_api.R|   36 +
 .../tests/fulltests/test_mllib_classification.R |  396 +++
 R/pkg/tests/fulltests/test_mllib_clustering.R   |  328 ++
 R/pkg/tests/fulltests/test_mllib_fpm.R  |   85 +
 .../tests/fulltests/test_mllib_recommendation.R |   67 +
 R/pkg/tests/fulltests/test_mllib_regression.R   |  480 +++
 R/pkg/tests/fulltests/test_mllib_stat.R |   53 +
 R/pkg/tests/fulltests/test_mllib_tree.R |  226 ++
 .../tests/fulltests/test_parallelize_collect.R  |  120 +
 R/pkg/tests/fulltests/test_rdd.R|  906 +
 R/pkg/tests/fulltests/test_shuffle.R|  248 ++
 R/pkg/tests/fulltests/test_sparkR.R |   48 +
 R/pkg/tests/fulltests/test_sparkSQL.R   | 3198 ++
 R/pkg/tests/fulltests/test_streaming.R  |  167 +
 R/pkg/tests/fulltests/test_take.R   |   71 +
 R/pkg/tests/fulltests/test_textFile.R   |  182 +
 R/pkg/tests/fulltests/test_utils.R  |  247 ++
 R/pkg/tests/run-all.R   |8 +
 56 files changed, 7741 insertions(+), 7643 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark

spark git commit: [SPARK-20877][SPARKR][FOLLOWUP] clean up after test move

2017-06-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 823f1eef5 -> 9f4ff9552


[SPARK-20877][SPARKR][FOLLOWUP] clean up after test move

## What changes were proposed in this pull request?

clean up after big test move

## How was this patch tested?

unit tests, jenkins

Author: Felix Cheung 

Closes #18267 from felixcheung/rtestset2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f4ff955
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f4ff955
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f4ff955

Branch: refs/heads/master
Commit: 9f4ff9552470fb97ca38bb56bbf43be49a9a316c
Parents: 823f1ee
Author: Felix Cheung 
Authored: Sun Jun 11 03:00:44 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 11 03:00:44 2017 -0700

--
 R/pkg/.Rbuildignore |   1 +
 R/pkg/R/install.R   |   2 +-
 R/pkg/R/utils.R |   8 +-
 R/pkg/tests/fulltests/test_Serde.R  |   6 --
 R/pkg/tests/fulltests/test_Windows.R|   7 +-
 R/pkg/tests/fulltests/test_binaryFile.R |   8 --
 R/pkg/tests/fulltests/test_binary_function.R|   6 --
 R/pkg/tests/fulltests/test_broadcast.R  |   4 -
 R/pkg/tests/fulltests/test_client.R |   8 --
 R/pkg/tests/fulltests/test_context.R|  16 ---
 R/pkg/tests/fulltests/test_includePackage.R |   4 -
 .../tests/fulltests/test_mllib_classification.R |  12 +--
 R/pkg/tests/fulltests/test_mllib_clustering.R   |  14 +--
 R/pkg/tests/fulltests/test_mllib_fpm.R  |   2 +-
 .../tests/fulltests/test_mllib_recommendation.R |   2 +-
 R/pkg/tests/fulltests/test_mllib_regression.R   |  16 +--
 R/pkg/tests/fulltests/test_mllib_tree.R |  22 ++--
 .../tests/fulltests/test_parallelize_collect.R  |   8 --
 R/pkg/tests/fulltests/test_rdd.R| 102 ---
 R/pkg/tests/fulltests/test_shuffle.R|  24 -
 R/pkg/tests/fulltests/test_sparkR.R |   2 -
 R/pkg/tests/fulltests/test_sparkSQL.R   |  92 ++---
 R/pkg/tests/fulltests/test_streaming.R  |  14 +--
 R/pkg/tests/fulltests/test_take.R   |   2 -
 R/pkg/tests/fulltests/test_textFile.R   |  18 
 R/pkg/tests/fulltests/test_utils.R  |   9 --
 R/pkg/tests/run-all.R   |   2 -
 27 files changed, 35 insertions(+), 376 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/.Rbuildignore
--
diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore
index f12f8c2..18b2db6 100644
--- a/R/pkg/.Rbuildignore
+++ b/R/pkg/.Rbuildignore
@@ -6,3 +6,4 @@
 ^README\.Rmd$
 ^src-native$
 ^html$
+^tests/fulltests/*

http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index 4ca7aa6..ec931be 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -267,7 +267,7 @@ hadoopVersionName <- function(hadoopVersion) {
 # The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and
 # adapt to Spark context
 sparkCachePath <- function() {
-  if (.Platform$OS.type == "windows") {
+  if (is_windows()) {
 winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
 if (is.na(winAppPath)) {
   stop(paste("%LOCALAPPDATA% not found.",

http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index ea45e39..91483a4 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -908,10 +908,6 @@ isAtomicLengthOne <- function(x) {
   is.atomic(x) && length(x) == 1
 }
 
-is_cran <- function() {
-  !identical(Sys.getenv("NOT_CRAN"), "true")
-}
-
 is_windows <- function() {
   .Platform$OS.type == "windows"
 }
@@ -920,6 +916,6 @@ hadoop_home_set <- function() {
   !identical(Sys.getenv("HADOOP_HOME"), "")
 }
 
-not_cran_or_windows_with_hadoop <- function() {
-  !is_cran() && (!is_windows() || hadoop_home_set())
+windows_with_hadoop <- function() {
+  !is_windows() || hadoop_home_set()
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/tests/fulltests/test_Serde.R
--
diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R
index 6e160fa..6bbd201 100644
--- a/R/pkg/tests/fulltests/test_Serde.R
+++ b/R/pkg/tests/fulltests/test_Serde.R
@@ -20,8 +20,6 @@ context("SerDe func

spark git commit: [SPARK-20877][SPARKR][FOLLOWUP] clean up after test move

2017-06-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 0b0be47e7 -> 26003de55


[SPARK-20877][SPARKR][FOLLOWUP] clean up after test move

clean up after big test move

unit tests, jenkins

Author: Felix Cheung 

Closes #18267 from felixcheung/rtestset2.

(cherry picked from commit 9f4ff9552470fb97ca38bb56bbf43be49a9a316c)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26003de5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26003de5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26003de5

Branch: refs/heads/branch-2.2
Commit: 26003de55ba13695649b0d874563a76d71cda88d
Parents: 0b0be47
Author: Felix Cheung 
Authored: Sun Jun 11 03:00:44 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 11 03:13:56 2017 -0700

--
 R/pkg/.Rbuildignore |   1 +
 R/pkg/R/install.R   |   2 +-
 R/pkg/R/utils.R |   8 +-
 R/pkg/tests/fulltests/test_Serde.R  |   6 --
 R/pkg/tests/fulltests/test_Windows.R|   7 +-
 R/pkg/tests/fulltests/test_binaryFile.R |   8 --
 R/pkg/tests/fulltests/test_binary_function.R|   6 --
 R/pkg/tests/fulltests/test_broadcast.R  |   4 -
 R/pkg/tests/fulltests/test_client.R |   8 --
 R/pkg/tests/fulltests/test_context.R|  16 ---
 R/pkg/tests/fulltests/test_includePackage.R |   4 -
 .../tests/fulltests/test_mllib_classification.R |  12 +--
 R/pkg/tests/fulltests/test_mllib_clustering.R   |  14 +--
 R/pkg/tests/fulltests/test_mllib_fpm.R  |   2 +-
 .../tests/fulltests/test_mllib_recommendation.R |   2 +-
 R/pkg/tests/fulltests/test_mllib_regression.R   |  16 +--
 R/pkg/tests/fulltests/test_mllib_tree.R |  14 ++-
 .../tests/fulltests/test_parallelize_collect.R  |   8 --
 R/pkg/tests/fulltests/test_rdd.R| 102 ---
 R/pkg/tests/fulltests/test_shuffle.R|  24 -
 R/pkg/tests/fulltests/test_sparkR.R |   2 -
 R/pkg/tests/fulltests/test_sparkSQL.R   |  92 ++---
 R/pkg/tests/fulltests/test_streaming.R  |  14 +--
 R/pkg/tests/fulltests/test_take.R   |   2 -
 R/pkg/tests/fulltests/test_textFile.R   |  18 
 R/pkg/tests/fulltests/test_utils.R  |   8 --
 R/pkg/tests/run-all.R   |   2 -
 27 files changed, 32 insertions(+), 370 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/.Rbuildignore
--
diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore
index f12f8c2..18b2db6 100644
--- a/R/pkg/.Rbuildignore
+++ b/R/pkg/.Rbuildignore
@@ -6,3 +6,4 @@
 ^README\.Rmd$
 ^src-native$
 ^html$
+^tests/fulltests/*

http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index 4ca7aa6..ec931be 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -267,7 +267,7 @@ hadoopVersionName <- function(hadoopVersion) {
 # The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and
 # adapt to Spark context
 sparkCachePath <- function() {
-  if (.Platform$OS.type == "windows") {
+  if (is_windows()) {
 winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
 if (is.na(winAppPath)) {
   stop(paste("%LOCALAPPDATA% not found.",

http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/R/utils.R
--
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index b19556a..7225da9 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -900,10 +900,6 @@ isAtomicLengthOne <- function(x) {
   is.atomic(x) && length(x) == 1
 }
 
-is_cran <- function() {
-  !identical(Sys.getenv("NOT_CRAN"), "true")
-}
-
 is_windows <- function() {
   .Platform$OS.type == "windows"
 }
@@ -912,6 +908,6 @@ hadoop_home_set <- function() {
   !identical(Sys.getenv("HADOOP_HOME"), "")
 }
 
-not_cran_or_windows_with_hadoop <- function() {
-  !is_cran() && (!is_windows() || hadoop_home_set())
+windows_with_hadoop <- function() {
+  !is_windows() || hadoop_home_set()
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/tests/fulltests/test_Serde.R
--
diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R
index 6e160fa..6bbd201 100644
--- a/R/pkg/tests/fulltests/test_Serde.R
+++ b/R/pkg/tests/fulltests/test_Serde.R
@@ -20,8

spark git commit: [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite

2017-06-12 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 2639c3ed0 -> 278ba7a2c


[TEST][SPARKR][CORE] Fix broken SparkSubmitSuite

## What changes were proposed in this pull request?

Fix the test file path. This was broken in #18264 and went undetected because R-only 
changes don't build core, and the subsequent post-commit build with the change passed 
(again because it wasn't building core).

actually appveyor builds everything but it's not running scala suites ...

## How was this patch tested?

jenkins
srowen gatorsmile

Author: Felix Cheung 

Closes #18283 from felixcheung/rsubmitsuite.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/278ba7a2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/278ba7a2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/278ba7a2

Branch: refs/heads/master
Commit: 278ba7a2c62b2cbb7bcfe79ce10d35ab57bb1950
Parents: 2639c3e
Author: Felix Cheung 
Authored: Mon Jun 12 22:08:49 2017 -0700
Committer: Felix Cheung 
Committed: Mon Jun 12 22:08:49 2017 -0700

--
 .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/278ba7a2/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index de71999..b089357 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -505,8 +505,8 @@ class SparkSubmitSuite
 assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.")
 val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
-val rScriptDir =
-  Seq(sparkHome, "R", "pkg", "inst", "tests", 
"packageInAJarTest.R").mkString(File.separator)
+val rScriptDir = Seq(
+  sparkHome, "R", "pkg", "tests", "fulltests", 
"packageInAJarTest.R").mkString(File.separator)
 assert(new File(rScriptDir).exists)
 IvyTestUtils.withRepository(main, None, None, withR = true) { repo =>
   val args = Seq(
@@ -527,7 +527,7 @@ class SparkSubmitSuite
 // Check if the SparkR package is installed
 assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.")
 val rScriptDir =
-  Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", 
"jarTest.R").mkString(File.separator)
+  Seq(sparkHome, "R", "pkg", "tests", "fulltests", 
"jarTest.R").mkString(File.separator)
 assert(new File(rScriptDir).exists)
 
 // compile a small jar containing a class that will be called from R code.


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite

2017-06-12 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 48a843b56 -> dae1a9875


[TEST][SPARKR][CORE] Fix broken SparkSubmitSuite

## What changes were proposed in this pull request?

Fix the test file path. This was broken in #18264 and went undetected because R-only 
changes don't build core, and the subsequent post-commit build with the change passed 
(again because it wasn't building core).

actually appveyor builds everything but it's not running scala suites ...

## How was this patch tested?

jenkins
srowen gatorsmile

Author: Felix Cheung 

Closes #18283 from felixcheung/rsubmitsuite.

(cherry picked from commit 278ba7a2c62b2cbb7bcfe79ce10d35ab57bb1950)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dae1a987
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dae1a987
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dae1a987

Branch: refs/heads/branch-2.2
Commit: dae1a98758d09dde97a8e7863100d2dd52389bf3
Parents: 48a843b
Author: Felix Cheung 
Authored: Mon Jun 12 22:08:49 2017 -0700
Committer: Felix Cheung 
Committed: Mon Jun 12 22:09:05 2017 -0700

--
 .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dae1a987/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 6e9721c..6fa3a09 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -485,8 +485,8 @@ class SparkSubmitSuite
 assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.")
 val main = MavenCoordinate("my.great.lib", "mylib", "0.1")
val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
-val rScriptDir =
-  Seq(sparkHome, "R", "pkg", "inst", "tests", 
"packageInAJarTest.R").mkString(File.separator)
+val rScriptDir = Seq(
+  sparkHome, "R", "pkg", "tests", "fulltests", 
"packageInAJarTest.R").mkString(File.separator)
 assert(new File(rScriptDir).exists)
 IvyTestUtils.withRepository(main, None, None, withR = true) { repo =>
   val args = Seq(
@@ -507,7 +507,7 @@ class SparkSubmitSuite
 // Check if the SparkR package is installed
 assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.")
 val rScriptDir =
-  Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", 
"jarTest.R").mkString(File.separator)
+  Seq(sparkHome, "R", "pkg", "tests", "fulltests", 
"jarTest.R").mkString(File.separator)
 assert(new File(rScriptDir).exists)
 
 // compile a small jar containing a class that will be called from R code.


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-20980][DOCS] update doc to reflect multiLine change

2017-06-14 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 af4f89c98 -> b5504f6d3


[SPARK-20980][DOCS] update doc to reflect multiLine change

## What changes were proposed in this pull request?

doc only change

## How was this patch tested?

manually

Author: Felix Cheung 

Closes #18312 from felixcheung/sqljsonwholefiledoc.

(cherry picked from commit 1bf55e396c7b995a276df61d9a4eb8e60bcee334)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5504f6d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5504f6d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5504f6d

Branch: refs/heads/branch-2.2
Commit: b5504f6d3fc375eecb131460c8b01e0be18f4e9b
Parents: af4f89c
Author: Felix Cheung 
Authored: Wed Jun 14 23:08:05 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 14 23:08:18 2017 -0700

--
 docs/sql-programming-guide.md | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b5504f6d/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 314ff6e..8e722ae 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -998,7 +998,7 @@ Note that the file that is offered as _a json file_ is not 
a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set the `wholeFile` option to `true`.
+For a regular multi-line JSON file, set the `multiLine` option to `true`.
 
 {% include_example json_dataset 
scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
 
@@ -1012,7 +1012,7 @@ Note that the file that is offered as _a json file_ is 
not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set the `wholeFile` option to `true`.
+For a regular multi-line JSON file, set the `multiLine` option to `true`.
 
 {% include_example json_dataset 
java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
 
@@ -1025,7 +1025,7 @@ Note that the file that is offered as _a json file_ is 
not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set the `wholeFile` parameter to `True`.
+For a regular multi-line JSON file, set the `multiLine` parameter to `True`.
 
 {% include_example json_dataset python/sql/datasource.py %}
 
@@ -1039,7 +1039,7 @@ Note that the file that is offered as _a json file_ is 
not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set a named parameter `wholeFile` to 
`TRUE`.
+For a regular multi-line JSON file, set a named parameter `multiLine` to 
`TRUE`.
 
 {% include_example json_dataset r/RSparkSQLExample.R %}
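
For reference, a minimal SparkR sketch of the renamed option (the temp file and its
contents below are made up purely for illustration):

```r
library(SparkR)
sparkR.session()

# A "regular" multi-line JSON file: a single record spread over several lines.
jsonPath <- tempfile(pattern = "people", fileext = ".json")
writeLines('[{"name": "Andy",
  "age": 30}]', jsonPath)

# The option is now called multiLine (previously wholeFile).
df <- read.df(jsonPath, "json", multiLine = TRUE)
head(df)
```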
 





spark git commit: [SPARK-20980][DOCS] update doc to reflect multiLine change

2017-06-14 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master b32b2123d -> 1bf55e396


[SPARK-20980][DOCS] update doc to reflect multiLine change

## What changes were proposed in this pull request?

doc only change

## How was this patch tested?

manually

Author: Felix Cheung 

Closes #18312 from felixcheung/sqljsonwholefiledoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1bf55e39
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1bf55e39
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1bf55e39

Branch: refs/heads/master
Commit: 1bf55e396c7b995a276df61d9a4eb8e60bcee334
Parents: b32b212
Author: Felix Cheung 
Authored: Wed Jun 14 23:08:05 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 14 23:08:05 2017 -0700

--
 docs/sql-programming-guide.md | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1bf55e39/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 314ff6e..8e722ae 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -998,7 +998,7 @@ Note that the file that is offered as _a json file_ is not 
a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set the `wholeFile` option to `true`.
+For a regular multi-line JSON file, set the `multiLine` option to `true`.
 
 {% include_example json_dataset 
scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
 
@@ -1012,7 +1012,7 @@ Note that the file that is offered as _a json file_ is 
not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set the `wholeFile` option to `true`.
+For a regular multi-line JSON file, set the `multiLine` option to `true`.
 
 {% include_example json_dataset 
java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
 
@@ -1025,7 +1025,7 @@ Note that the file that is offered as _a json file_ is 
not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set the `wholeFile` parameter to `True`.
+For a regular multi-line JSON file, set the `multiLine` parameter to `True`.
 
 {% include_example json_dataset python/sql/datasource.py %}
 
@@ -1039,7 +1039,7 @@ Note that the file that is offered as _a json file_ is 
not a typical JSON file.
 line must contain a separate, self-contained valid JSON object. For more 
information, please see
 [JSON Lines text format, also called newline-delimited 
JSON](http://jsonlines.org/).
 
-For a regular multi-line JSON file, set a named parameter `wholeFile` to 
`TRUE`.
+For a regular multi-line JSON file, set a named parameter `multiLine` to 
`TRUE`.
 
 {% include_example json_dataset r/RSparkSQLExample.R %}
 





spark git commit: [SPARK-21128][R] Remove both "spark-warehouse" and "metastore_db" before listing files in R tests

2017-06-18 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 75a6d0585 -> 05f83c532


[SPARK-21128][R] Remove both "spark-warehouse" and "metastore_db" before 
listing files in R tests

## What changes were proposed in this pull request?

This PR proposes to list the files in the test _after_ removing both
"spark-warehouse" and "metastore_db", so that the next run of the R tests passes.
The leftovers currently make a second run fail, which is sometimes a bit annoying.

## How was this patch tested?

Manually running the R tests multiple times via `./R/run-tests.sh`.

**Before**

Second run:

```
SparkSQL functions: Spark package found in SPARK_HOME: .../spark
...
...
...
...
...
1234...

Failed -
1. Failure: No extra files are created in SPARK_HOME by starting session and 
making calls (test_sparkSQL.R#3384)
length(list1) not equal to length(list2).
1/1 mismatches
[1] 25 - 23 == 2

2. Failure: No extra files are created in SPARK_HOME by starting session and 
making calls (test_sparkSQL.R#3384)
sort(list1, na.last = TRUE) not equal to sort(list2, na.last = TRUE).
10/25 mismatches
x[16]: "metastore_db"
y[16]: "pkg"

x[17]: "pkg"
y[17]: "R"

x[18]: "R"
y[18]: "README.md"

x[19]: "README.md"
y[19]: "run-tests.sh"

x[20]: "run-tests.sh"
y[20]: "SparkR_2.2.0.tar.gz"

x[21]: "metastore_db"
y[21]: "pkg"

x[22]: "pkg"
y[22]: "R"

x[23]: "R"
y[23]: "README.md"

x[24]: "README.md"
y[24]: "run-tests.sh"

x[25]: "run-tests.sh"
y[25]: "SparkR_2.2.0.tar.gz"

3. Failure: No extra files are created in SPARK_HOME by starting session and 
making calls (test_sparkSQL.R#3388)
length(list1) not equal to length(list2).
1/1 mismatches
[1] 25 - 23 == 2

4. Failure: No extra files are created in SPARK_HOME by starting session and 
making calls (test_sparkSQL.R#3388)
sort(list1, na.last = TRUE) not equal to sort(list2, na.last = TRUE).
10/25 mismatches
x[16]: "metastore_db"
y[16]: "pkg"

x[17]: "pkg"
y[17]: "R"

x[18]: "R"
y[18]: "README.md"

x[19]: "README.md"
y[19]: "run-tests.sh"

x[20]: "run-tests.sh"
y[20]: "SparkR_2.2.0.tar.gz"

x[21]: "metastore_db"
y[21]: "pkg"

x[22]: "pkg"
y[22]: "R"

x[23]: "R"
y[23]: "README.md"

x[24]: "README.md"
y[24]: "run-tests.sh"

x[25]: "run-tests.sh"
y[25]: "SparkR_2.2.0.tar.gz"

DONE ===
```

**After**

Second run:

```
SparkSQL functions: Spark package found in SPARK_HOME: .../spark
...
...
...
...
...
...
```

Author: hyukjinkwon 

Closes #18335 from HyukjinKwon/SPARK-21128.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/05f83c53
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/05f83c53
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/05f83c53

Branch: refs/heads/master
Commit: 05f83c532a96ead8dec1c046f985164b7f7205c0
Parents: 75a6d05
Author: hyukjinkwon 
Authored: Sun Jun 18 11:26:27 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 18 11:26:27 2017 -0700

--
 R/pkg/tests/run-all.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--
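
A minimal sketch of the idea (not the exact patch; the directory names come from the
failure output above, and `sparkRDir` is an assumed stand-in for the SPARK_HOME checkout):

```r
# Directories that the SQL tests may leave behind in SPARK_HOME.
sparkRDir <- Sys.getenv("SPARK_HOME")
sqlLeftovers <- c("spark-warehouse", "metastore_db")

# Delete any leftovers from a previous run first ...
invisible(lapply(sqlLeftovers, function(d) {
  unlink(file.path(sparkRDir, d), recursive = TRUE, force = TRUE)
}))

# ... and only then take the baseline snapshot used by the
# "No extra files are created in SPARK_HOME" assertion.
filesBefore <- list.files(sparkRDir, all.files = TRUE)
```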

spark git commit: [SPARK-20892][SPARKR] Add SQL trunc function to SparkR

2017-06-18 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 05f83c532 -> 110ce1f27


[SPARK-20892][SPARKR] Add SQL trunc function to SparkR

## What changes were proposed in this pull request?

Add SQL trunc function

## How was this patch tested?
standard test

Author: actuaryzhang 

Closes #18291 from actuaryzhang/sparkRTrunc2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/110ce1f2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/110ce1f2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/110ce1f2

Branch: refs/heads/master
Commit: 110ce1f27b66905afada6b5fd63c34fbf7602739
Parents: 05f83c5
Author: actuaryzhang 
Authored: Sun Jun 18 18:00:27 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 18 18:00:27 2017 -0700

--
 R/pkg/NAMESPACE   |  1 +
 R/pkg/R/functions.R   | 29 +
 R/pkg/tests/fulltests/test_sparkSQL.R |  2 ++
 3 files changed, 32 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/110ce1f2/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 4e3fe00..229de4a 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -357,6 +357,7 @@ exportMethods("%<=>%",
   "to_utc_timestamp",
   "translate",
   "trim",
+  "trunc",
   "unbase64",
   "unhex",
   "unix_timestamp",

http://git-wip-us.apache.org/repos/asf/spark/blob/110ce1f2/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 06a9019..7128c3b 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -4015,3 +4015,32 @@ setMethod("input_file_name", signature("missing"),
 jc <- callJStatic("org.apache.spark.sql.functions", 
"input_file_name")
 column(jc)
   })
+
+#' trunc
+#'
+#' Returns date truncated to the unit specified by the format.
+#'
+#' @param x Column to compute on.
+#' @param format string used for specify the truncation method. For example, 
"year", "yyyy",
+#' "yy" for truncate by year, or "month", "mon", "mm" for truncate by month.
+#'
+#' @rdname trunc
+#' @name trunc
+#' @family date time functions
+#' @aliases trunc,Column-method
+#' @export
+#' @examples
+#' \dontrun{
+#' trunc(df$c, "year")
+#' trunc(df$c, "yy")
+#' trunc(df$c, "month")
+#' trunc(df$c, "mon")
+#' }
+#' @note trunc since 2.3.0
+setMethod("trunc",
+  signature(x = "Column"),
+  function(x, format) {
+jc <- callJStatic("org.apache.spark.sql.functions", "trunc",
+  x@jc, as.character(format))
+column(jc)
+  })

http://git-wip-us.apache.org/repos/asf/spark/blob/110ce1f2/R/pkg/tests/fulltests/test_sparkSQL.R
--
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R 
b/R/pkg/tests/fulltests/test_sparkSQL.R
index af52906..911b73b 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1382,6 +1382,8 @@ test_that("column functions", {
   c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
   c21 <- posexplode_outer(c) + explode_outer(c)
   c22 <- not(c)
+  c23 <- trunc(c, "year") + trunc(c, "yyyy") + trunc(c, "yy") +
+trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm")
 
   # Test if base::is.nan() is exposed
   expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
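
A short usage sketch for the new column method (assumes an active SparkR session on a
build that includes this change; the toy dates are illustrative):

```r
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(d = as.Date(c("2017-06-18", "2017-01-02"))))

# Truncate each date to the first day of its year / month.
head(select(df, trunc(df$d, "year"), trunc(df$d, "month")))
```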





spark git commit: [SPARK-20889][SPARKR] Grouped documentation for AGGREGATE column methods

2017-06-19 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 9b57cd8d5 -> 8965fe764


[SPARK-20889][SPARKR] Grouped documentation for AGGREGATE column methods

## What changes were proposed in this pull request?
Grouped documentation for the aggregate functions for Column.

Author: actuaryzhang 

Closes #18025 from actuaryzhang/sparkRDoc4.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8965fe76
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8965fe76
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8965fe76

Branch: refs/heads/master
Commit: 8965fe764a4218d944938aa4828072f1ad9dbda7
Parents: 9b57cd8
Author: actuaryzhang 
Authored: Mon Jun 19 19:41:24 2017 -0700
Committer: Felix Cheung 
Committed: Mon Jun 19 19:41:24 2017 -0700

--
 R/pkg/R/functions.R | 427 +++
 R/pkg/R/generics.R  |  56 ---
 R/pkg/R/stats.R |  22 +--
 3 files changed, 219 insertions(+), 286 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8965fe76/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 7128c3b..01ca8b8 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -18,6 +18,22 @@
 #' @include generics.R column.R
 NULL
 
+#' Aggregate functions for Column operations
+#'
+#' Aggregate functions defined for \code{Column}.
+#'
+#' @param x Column to compute on.
+#' @param y,na.rm,use currently not used.
+#' @param ... additional argument(s). For example, it could be used to pass 
additional Columns.
+#' @name column_aggregate_functions
+#' @rdname column_aggregate_functions
+#' @family aggregate functions
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}
+NULL
+
 #' lit
 #'
 #' A new \linkS4class{Column} is created to represent the literal value.
@@ -85,17 +101,20 @@ setMethod("acos",
 column(jc)
   })
 
-#' Returns the approximate number of distinct items in a group
+#' @details
+#' \code{approxCountDistinct}: Returns the approximate number of distinct 
items in a group.
 #'
-#' Returns the approximate number of distinct items in a group. This is a 
column
-#' aggregate function.
-#'
-#' @rdname approxCountDistinct
-#' @name approxCountDistinct
-#' @return the approximate number of distinct items in a group.
+#' @rdname column_aggregate_functions
 #' @export
-#' @aliases approxCountDistinct,Column-method
-#' @examples \dontrun{approxCountDistinct(df$c)}
+#' @aliases approxCountDistinct approxCountDistinct,Column-method
+#' @examples
+#'
+#' \dontrun{
+#' head(select(df, approxCountDistinct(df$gear)))
+#' head(select(df, approxCountDistinct(df$gear, 0.02)))
+#' head(select(df, countDistinct(df$gear, df$cyl)))
+#' head(select(df, n_distinct(df$gear)))
+#' head(distinct(select(df, "gear")))}
 #' @note approxCountDistinct(Column) since 1.4.0
 setMethod("approxCountDistinct",
   signature(x = "Column"),
@@ -342,10 +361,13 @@ setMethod("column",
 #'
 #' @rdname corr
 #' @name corr
-#' @family math functions
+#' @family aggregate functions
 #' @export
 #' @aliases corr,Column-method
-#' @examples \dontrun{corr(df$c, df$d)}
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
+#' head(select(df, corr(df$mpg, df$hp)))}
 #' @note corr since 1.6.0
 setMethod("corr", signature(x = "Column"),
   function(x, col2) {
@@ -356,20 +378,22 @@ setMethod("corr", signature(x = "Column"),
 
 #' cov
 #'
-#' Compute the sample covariance between two expressions.
+#' Compute the covariance between two expressions.
+#'
+#' @details
+#' \code{cov}: Compute the sample covariance between two expressions.
 #'
 #' @rdname cov
 #' @name cov
-#' @family math functions
+#' @family aggregate functions
 #' @export
 #' @aliases cov,characterOrColumn-method
 #' @examples
 #' \dontrun{
-#' cov(df$c, df$d)
-#' cov("c", "d")
-#' covar_samp(df$c, df$d)
-#' covar_samp("c", "d")
-#' }
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
+#' head(select(df, cov(df$mpg, df$hp), cov("mpg", "hp"),
+#' covar_samp(df$mpg, df$hp), covar_samp("mpg", "hp"),
+#' covar_pop(df$mpg, df$hp), covar_pop("mpg", "hp")))}
 #' @note cov since 1.6.0
 setMethod("cov", signature(x = "characterOrColumn"),
   function(x, col2) {
@@ -377,6 +401,9 @@ setMethod("cov", signature(x = "characterOrColumn"),
 covar_samp(x, col2)
   })
 
+#' @details
+#' \code{covar_sample}: Alias for \code{cov}.
+#'
 #' @rdname cov
 #'
 #' @param col1 the first Column.
@@ -395,23 +422,13 @@ setMethod("covar_samp", signature(col1 = 
"characterOrColumn", col2 = "characterO
 column(jc)
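
A brief sketch showing a few of the newly grouped aggregate methods together, using the
same mtcars data frame the shared documentation assumes (an active SparkR session is
also assumed):

```r
df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))

# Aggregates over groups ...
head(agg(groupBy(df, df$cyl), avg(df$mpg), countDistinct(df$gear)))

# ... and whole-column aggregates such as correlation and sample covariance.
head(select(df, corr(df$mpg, df$hp), covar_samp(df$mpg, df$hp)))
```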
   

spark git commit: [SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R

2017-06-21 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master cad88f17e -> ad459cfb1


[SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R

## What changes were proposed in this pull request?

Add `stringIndexerOrderType` to `spark.glm` and `spark.survreg` to support 
string encoding that is consistent with default R.

## How was this patch tested?
new tests

Author: actuaryzhang 

Closes #18140 from actuaryzhang/sparkRFormula.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad459cfb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad459cfb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad459cfb

Branch: refs/heads/master
Commit: ad459cfb1d169d8dd7b9e039ca135ba5cafcab83
Parents: cad88f1
Author: actuaryzhang 
Authored: Wed Jun 21 10:35:16 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 21 10:35:16 2017 -0700

--
 R/pkg/R/mllib_regression.R  | 52 +---
 R/pkg/tests/fulltests/test_mllib_regression.R   | 62 
 .../ml/r/AFTSurvivalRegressionWrapper.scala |  4 +-
 .../r/GeneralizedLinearRegressionWrapper.scala  |  6 +-
 4 files changed, 115 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ad459cfb/R/pkg/R/mllib_regression.R
--
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index d59c890..9ecd887 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -70,6 +70,12 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #'  the relationship between the variance and mean of the 
distribution. Only
 #'  applicable to the Tweedie family.
 #' @param link.power the index in the power link function. Only applicable to 
the Tweedie family.
+#' @param stringIndexerOrderType how to order categories of a string feature 
column. This is used to
+#'   decide the base level of a string feature as 
the last category after
+#'   ordering is dropped when encoding strings. 
Supported options are
+#'   "frequencyDesc", "frequencyAsc", 
"alphabetDesc", and "alphabetAsc".
+#'   The default value is "frequencyDesc". When 
the ordering is set to
+#'   "alphabetDesc", this drops the same category 
as R when encoding strings.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -79,7 +85,7 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' t <- as.data.frame(Titanic)
+#' t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
 #' df <- createDataFrame(t)
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
 #' summary(model)
@@ -96,6 +102,15 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' savedModel <- read.ml(path)
 #' summary(savedModel)
 #'
+#' # note that the default string encoding is different from R's glm
+#' model2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = t)
+#' summary(model2)
+#' # use stringIndexerOrderType = "alphabetDesc" to force string encoding
+#' # to be consistent with R
+#' model3 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
+#'stringIndexerOrderType = "alphabetDesc")
+#' summary(model3)
+#'
 #' # fit tweedie model
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
 #'var.power = 1.2, link.power = 0)
@@ -110,8 +125,11 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
   function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, 
weightCol = NULL,
-   regParam = 0.0, var.power = 0.0, link.power = 1.0 - 
var.power) {
+   regParam = 0.0, var.power = 0.0, link.power = 1.0 - 
var.power,
+   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+  "alphabetDesc", "alphabetAsc")) {
 
+stringIndexerOrderType <- match.arg(stringIndexerOrderType)
 if (is.character(family)) {
   # Handle when family = "tweedie"
   if (tolower(family) == "tweedie") {
@@ -145,7 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
 "fit", f

spark git commit: [SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR

2017-06-21 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 215281d88 -> 53543374c


[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR

## What changes were proposed in this pull request?

PR https://github.com/apache/spark/pull/17715 added constrained logistic
regression to ML. This PR adds the corresponding support to SparkR.

## How was this patch tested?

Add new unit tests.

Author: wangmiao1981 

Closes #18128 from wangmiao1981/test.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53543374
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53543374
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53543374

Branch: refs/heads/master
Commit: 53543374ce0cf0cec26de2382fbc85b7d5c7e9d6
Parents: 215281d
Author: wangmiao1981 
Authored: Wed Jun 21 20:42:45 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 21 20:42:45 2017 -0700

--
 R/pkg/R/mllib_classification.R  | 61 +++-
 .../tests/fulltests/test_mllib_classification.R | 40 +
 .../ml/classification/LogisticRegression.scala  |  8 +--
 .../spark/ml/r/LogisticRegressionWrapper.scala  | 34 ++-
 4 files changed, 135 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/53543374/R/pkg/R/mllib_classification.R
--
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
index bdcc081..82d2428 100644
--- a/R/pkg/R/mllib_classification.R
+++ b/R/pkg/R/mllib_classification.R
@@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal 
to 2). If the dimensions of features
 #' or the number of partitions are large, this param 
could be adjusted to a larger size.
 #' This is an expert parameter. Default value should 
be good for most cases.
+#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if 
fitting under bound constrained optimization.
+#'  The bound matrix must be compatible with 
the shape (1, number of features) for binomial
+#'  regression, or (number of classes, number 
of features) for multinomial regression.
+#'  It is a R matrix.
+#' @param upperBoundsOnCoefficients The upper bounds on coefficients if 
fitting under bound constrained optimization.
+#'  The bound matrix must be compatible with 
the shape (1, number of features) for binomial
+#'  regression, or (number of classes, number 
of features) for multinomial regression.
+#'  It is a R matrix.
+#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting 
under bound constrained optimization.
+#'The bounds vector size must be equal to 1 
for binomial regression, or the number
+#'of classes for multinomial regression.
+#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting 
under bound constrained optimization.
+#'The bound vector size must be equal to 1 for 
binomial regression, or the number
+#'of classes for multinomial regression.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit
@@ -241,8 +255,12 @@ function(object, path, overwrite = FALSE) {
 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = 
"formula"),
   function(data, formula, regParam = 0.0, elasticNetParam = 0.0, 
maxIter = 100,
tol = 1E-6, family = "auto", standardization = TRUE,
-   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
+   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
+   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients 
= NULL,
+   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = 
NULL) {
 formula <- paste(deparse(formula), collapse = "")
+row <- 0
+col <- 0
 
 if (!is.null(weightCol) && weightCol == "") {
   weightCol <- NULL
@@ -250,12 +268,51 @@ setMethod("spark.logit", signature(data = 
"SparkDataFrame", formula = "formula")
   weightCol <- as.character(weightCol)
 }
 
+if (!is.null(lowerBoundsOnIntercepts)) {
+lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
+}
+
+if (!is.null(upperBoundsOnIntercepts)) {
+upperBoundsOnIntercep
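
A hedged sketch of a bound-constrained binomial fit (the iris subset and the bound
values are illustrative only, not taken from this patch; an active SparkR session is
assumed):

```r
df <- createDataFrame(iris)
# Keep two species so the problem is binomial; iris has 4 numeric features,
# so the coefficient bound matrices have shape (1, 4).
training <- df[df$Species %in% c("versicolor", "virginica"), ]

lower <- matrix(c(-1, -1, -1, -1), nrow = 1, ncol = 4)
upper <- matrix(c(1, 1, 1, 1), nrow = 1, ncol = 4)

model <- spark.logit(training, Species ~ ., maxIter = 10,
                     lowerBoundsOnCoefficients = lower,
                     upperBoundsOnCoefficients = upper,
                     lowerBoundsOnIntercepts = c(-1),
                     upperBoundsOnIntercepts = c(1))
summary(model)$coefficients
```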

spark git commit: [SPARK-20889][SPARKR] Grouped documentation for DATETIME column methods

2017-06-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 2dadea95c -> 19331b8e4


[SPARK-20889][SPARKR] Grouped documentation for DATETIME column methods

## What changes were proposed in this pull request?
Grouped documentation for datetime column methods.

Author: actuaryzhang 

Closes #18114 from actuaryzhang/sparkRDocDate.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/19331b8e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/19331b8e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/19331b8e

Branch: refs/heads/master
Commit: 19331b8e44ad910550f810b80e2a0caf0ef62cb3
Parents: 2dadea9
Author: actuaryzhang 
Authored: Thu Jun 22 10:16:51 2017 -0700
Committer: Felix Cheung 
Committed: Thu Jun 22 10:16:51 2017 -0700

--
 R/pkg/R/functions.R | 532 ---
 R/pkg/R/generics.R  |  69 --
 2 files changed, 273 insertions(+), 328 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/19331b8e/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 01ca8b8..3102858 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -34,6 +34,58 @@ NULL
 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}
 NULL
 
+#' Date time functions for Column operations
+#'
+#' Date time functions defined for \code{Column}.
+#'
+#' @param x Column to compute on.
+#' @param format For \code{to_date} and \code{to_timestamp}, it is the string 
to use to parse
+#'   x Column to DateType or TimestampType. For \code{trunc}, it 
is the string used
+#'   for specifying the truncation method. For example, "year", 
"", "yy" for
+#'   truncate by year, or "month", "mon", "mm" for truncate by 
month.
+#' @param ... additional argument(s).
+#' @name column_datetime_functions
+#' @rdname column_datetime_functions
+#' @family data time functions
+#' @examples
+#' \dontrun{
+#' dts <- c("2005-01-02 18:47:22",
+#' "2005-12-24 16:30:58",
+#' "2005-10-28 07:30:05",
+#' "2005-12-28 07:01:05",
+#' "2006-01-24 00:01:10")
+#' y <- c(2.0, 2.2, 3.4, 2.5, 1.8)
+#' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))}
+NULL
+
+#' Date time arithmetic functions for Column operations
+#'
+#' Date time arithmetic functions defined for \code{Column}.
+#'
+#' @param y Column to compute on.
+#' @param x For class \code{Column}, it is the column used to perform 
arithmetic operations
+#'  with column \code{y}. For class \code{numeric}, it is the number 
of months or
+#'  days to be added to or subtracted from \code{y}. For class 
\code{character}, it is
+#'  \itemize{
+#'  \item \code{date_format}: date format specification.
+#'  \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: time 
zone to use.
+#'  \item \code{next_day}: day of the week string.
+#'  }
+#'
+#' @name column_datetime_diff_functions
+#' @rdname column_datetime_diff_functions
+#' @family data time functions
+#' @examples
+#' \dontrun{
+#' dts <- c("2005-01-02 18:47:22",
+#' "2005-12-24 16:30:58",
+#' "2005-10-28 07:30:05",
+#' "2005-12-28 07:01:05",
+#' "2006-01-24 00:01:10")
+#' y <- c(2.0, 2.2, 3.4, 2.5, 1.8)
+#' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))}
+NULL
+
 #' lit
 #'
 #' A new \linkS4class{Column} is created to represent the literal value.
@@ -546,18 +598,20 @@ setMethod("hash",
 column(jc)
   })
 
-#' dayofmonth
-#'
-#' Extracts the day of the month as an integer from a given 
date/timestamp/string.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{dayofmonth}: Extracts the day of the month as an integer from a
+#' given date/timestamp/string.
 #'
-#' @rdname dayofmonth
-#' @name dayofmonth
-#' @family date time functions
-#' @aliases dayofmonth,Column-method
+#' @rdname column_datetime_functions
+#' @aliases dayofmonth dayofmonth,Column-method
 #' @export
-#' @examples \dontrun{dayofmonth(df$c)}
+#' @examples
+#'
+#' \dontrun{
+#' head(select(df, df$time, year(df$time), quarter(df$time), month(df$time),
+#'dayofmonth(df$time), dayofyear(df$time), weekofyear(df$time)))
+#' head(agg(groupBy(df, year(df$time)), count(df$y), avg(df$y)))
+#' head(agg(groupBy(df, month(df$time)), avg(df$y)))}
 #' @note dayofmonth since 1.5.0
 setMethod("dayofmonth",
   signature(x = "Column"),
@@ -566,18 +620,13 @@ setMethod("dayofmonth",
 column(jc)
   })
 
-#' dayofyear
-#'
-#' Extracts the day of the year as an integer from a given 
date/timestamp/string.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{dayofyear}: Extracts the day of the year
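
A small sketch exercising both new documentation groups (the timestamps are made up for
illustration; an active SparkR session is assumed):

```r
dts <- as.POSIXct(c("2005-01-02 18:47:22", "2005-12-24 16:30:58"))
df <- createDataFrame(data.frame(time = dts))

# Extraction helpers from column_datetime_functions ...
head(select(df, year(df$time), month(df$time), dayofmonth(df$time)))

# ... and arithmetic helpers from column_datetime_diff_functions.
head(select(df, date_add(df$time, 7), next_day(df$time, "Sun"),
            date_format(df$time, "yyyy-MM-dd")))
```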

spark git commit: [SPARK-21149][R] Add job description API for R

2017-06-23 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master f3dea6079 -> 07479b3cf


[SPARK-21149][R] Add job description API for R

## What changes were proposed in this pull request?

Extend `setJobDescription` to the SparkR API.

## How was this patch tested?

It looks difficult to add an automated test for this, so it was tested manually as below:

```r
df <- createDataFrame(iris)
count(df)
setJobDescription("This is an example job.")
count(df)
```

prints ...

![2017-06-22 12 05 
49](https://user-images.githubusercontent.com/6477701/27415670-2a649936-5743-11e7-8e95-312f1cd103af.png)

Author: hyukjinkwon 

Closes #18382 from HyukjinKwon/SPARK-21149.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/07479b3c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/07479b3c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/07479b3c

Branch: refs/heads/master
Commit: 07479b3cfb7a617a18feca14e9e31c208c80630e
Parents: f3dea60
Author: hyukjinkwon 
Authored: Fri Jun 23 09:59:24 2017 -0700
Committer: Felix Cheung 
Committed: Fri Jun 23 09:59:24 2017 -0700

--
 R/pkg/NAMESPACE  |  3 ++-
 R/pkg/R/sparkR.R | 17 +
 R/pkg/tests/fulltests/test_context.R |  1 +
 3 files changed, 20 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/07479b3c/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 229de4a..b7fdae5 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -75,7 +75,8 @@ exportMethods("glm",
 # Job group lifecycle management methods
 export("setJobGroup",
"clearJobGroup",
-   "cancelJobGroup")
+   "cancelJobGroup",
+   "setJobDescription")
 
 # Export Utility methods
 export("setLogLevel")

http://git-wip-us.apache.org/repos/asf/spark/blob/07479b3c/R/pkg/R/sparkR.R
--
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index d0a12b7..f2d2620 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -535,6 +535,23 @@ cancelJobGroup <- function(sc, groupId) {
   }
 }
 
+#' Set a human readable description of the current job.
+#'
+#' Set a description that is shown as a job description in UI.
+#'
+#' @param value The job description of the current job.
+#' @rdname setJobDescription
+#' @name setJobDescription
+#' @examples
+#'\dontrun{
+#' setJobDescription("This is an example job.")
+#'}
+#' @note setJobDescription since 2.3.0
+setJobDescription <- function(value) {
+  sc <- getSparkContext()
+  invisible(callJMethod(sc, "setJobDescription", value))
+}
+
 sparkConfToSubmitOps <- new.env()
 sparkConfToSubmitOps[["spark.driver.memory"]]   <- "--driver-memory"
 sparkConfToSubmitOps[["spark.driver.extraClassPath"]]   <- 
"--driver-class-path"

http://git-wip-us.apache.org/repos/asf/spark/blob/07479b3c/R/pkg/tests/fulltests/test_context.R
--
diff --git a/R/pkg/tests/fulltests/test_context.R 
b/R/pkg/tests/fulltests/test_context.R
index 710485d..77635c5 100644
--- a/R/pkg/tests/fulltests/test_context.R
+++ b/R/pkg/tests/fulltests/test_context.R
@@ -100,6 +100,7 @@ test_that("job group functions can be called", {
   setJobGroup("groupId", "job description", TRUE)
   cancelJobGroup("groupId")
   clearJobGroup()
+  setJobDescription("job description")
 
   suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE))
   suppressWarnings(cancelJobGroup(sc, "groupId"))





spark git commit: [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak

2017-06-25 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 884347e1f -> 6b3d02285


[SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to 
prevent a leak

## What changes were proposed in this pull request?

`mcfork` in R appears to open a pipe ahead of time, but the existing logic does not
properly close it when it is executed repeatedly. This eventually makes further
forking fail because the limit on the number of open files is reached.

This hot path is hit in particular by `gapply`/`gapplyCollect`. For unknown
reasons, this happens more easily on CentOS and could be reproduced on Mac too.

All the details are described in 
https://issues.apache.org/jira/browse/SPARK-21093

This PR proposes simply to terminate R's worker processes in the parent of R's 
daemon to prevent a leak.

## How was this patch tested?

I ran the codes below on both CentOS and Mac with that configuration 
disabled/enabled.

```r
df <- createDataFrame(list(list(1L, 1, "1", 0.1)), c("a", "b", "c", "d"))
collect(gapply(df, "a", function(key, x) { x }, schema(df)))
collect(gapply(df, "a", function(key, x) { x }, schema(df)))
...  # 30 times
```

Also, now it passes R tests on CentOS as below:

```
SparkSQL functions: Spark package found in SPARK_HOME: .../spark
..
..
..
..
..

```

Author: hyukjinkwon 

Closes #18320 from HyukjinKwon/SPARK-21093.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b3d0228
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b3d0228
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b3d0228

Branch: refs/heads/master
Commit: 6b3d02285ee0debc73cbcab01b10398a498fbeb8
Parents: 884347e
Author: hyukjinkwon 
Authored: Sun Jun 25 11:05:57 2017 -0700
Committer: Felix Cheung 
Committed: Sun Jun 25 11:05:57 2017 -0700

--
 R/pkg/inst/worker/daemon.R | 59 ++---
 1 file changed, 55 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6b3d0228/R/pkg/inst/worker/daemon.R
--
diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R
index 3a318b7..6e385b2 100644
--- a/R/pkg/inst/worker/daemon.R
+++ b/R/pkg/inst/worker/daemon.R
@@ -30,8 +30,55 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT"))
 inputCon <- socketConnection(
 port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout)
 
+# Waits indefinitely for a socket connecion by default.
+selectTimeout <- NULL
+
+# Exit code that children send to the parent to indicate they exited.
+exitCode <- 1
+
 while (TRUE) {
-  ready <- socketSelect(list(inputCon))
+  ready <- socketSelect(list(inputCon), timeout = selectTimeout)
+
+  # Note that the children should be terminated in the parent. If each child 
terminates
+  # itself, it appears that the resource is not released properly, that causes 
an unexpected
+  # termination of this daemon due to, for example, running out of file 
descriptors
+  # (see SPARK-21093). Therefore, the current implementation tries to retrieve 
children
+  # that are exited (but not terminated) and then sends a kill signal to 
terminate them properly
+  # in the parent.
+  #
+  # There are two paths that it attempts to send a signal to terminate the 
children in the parent.
+  #
+  #   1. Every second if any socket connection is not available and if there 
are child workers
+  # running.
+  #   2. Right after a socket connection is available.
+  #
+  # In other words, the parent attempts to send the signal to the children 
every second if
+  # any worker is running or right before launching other worker children from 
the following
+  # new socket connection.
+
+  # Only the process IDs of children sent data to the parent are returned 
below. The children
+  # send a custom exit code to the parent after being exited and the parent 
tries
+  # to terminate them only if they s

spark git commit: [SPARK-20889][SPARKR] Grouped documentation for MATH column methods

2017-06-27 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 2d686a19e -> e793bf248


[SPARK-20889][SPARKR] Grouped documentation for MATH column methods

## What changes were proposed in this pull request?

Grouped documentation for math column methods.

Author: actuaryzhang 
Author: Wayne Zhang 

Closes #18371 from actuaryzhang/sparkRDocMath.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e793bf24
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e793bf24
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e793bf24

Branch: refs/heads/master
Commit: e793bf248bc3c71b9664f26377bce06b0ffa97a7
Parents: 2d686a1
Author: actuaryzhang 
Authored: Tue Jun 27 23:15:45 2017 -0700
Committer: Felix Cheung 
Committed: Tue Jun 27 23:15:45 2017 -0700

--
 R/pkg/R/functions.R | 619 ---
 R/pkg/R/generics.R  |  48 ++--
 2 files changed, 241 insertions(+), 426 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e793bf24/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 3102858..23ccdf9 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -86,6 +86,31 @@ NULL
 #' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))}
 NULL
 
+#' Math functions for Column operations
+#'
+#' Math functions defined for \code{Column}.
+#'
+#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and 
\code{shiftRightUnsigned},
+#'  this is the number of bits to shift.
+#' @param y Column to compute on.
+#' @param ... additional argument(s).
+#' @name column_math_functions
+#' @rdname column_math_functions
+#' @family math functions
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
+#' tmp <- mutate(df, v1 = log(df$mpg), v2 = cbrt(df$disp),
+#'   v3 = bround(df$wt, 1), v4 = bin(df$cyl),
+#'   v5 = hex(df$wt), v6 = toDegrees(df$gear),
+#'   v7 = atan2(df$cyl, df$am), v8 = hypot(df$cyl, df$am),
+#'   v9 = pmod(df$hp, df$cyl), v10 = shiftLeft(df$disp, 1),
+#'   v11 = conv(df$hp, 10, 16), v12 = sign(df$vs - 0.5),
+#'   v13 = sqrt(df$disp), v14 = ceil(df$wt))
+#' head(tmp)}
+NULL
+
 #' lit
 #'
 #' A new \linkS4class{Column} is created to represent the literal value.
@@ -112,18 +137,12 @@ setMethod("lit", signature("ANY"),
 column(jc)
   })
 
-#' abs
-#'
-#' Computes the absolute value.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{abs}: Computes the absolute value.
 #'
-#' @rdname abs
-#' @name abs
-#' @family non-aggregate functions
+#' @rdname column_math_functions
 #' @export
-#' @examples \dontrun{abs(df$c)}
-#' @aliases abs,Column-method
+#' @aliases abs abs,Column-method
 #' @note abs since 1.5.0
 setMethod("abs",
   signature(x = "Column"),
@@ -132,19 +151,13 @@ setMethod("abs",
 column(jc)
   })
 
-#' acos
-#'
-#' Computes the cosine inverse of the given value; the returned angle is in 
the range
-#' 0.0 through pi.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{acos}: Computes the cosine inverse of the given value; the returned 
angle is in
+#' the range 0.0 through pi.
 #'
-#' @rdname acos
-#' @name acos
-#' @family math functions
+#' @rdname column_math_functions
 #' @export
-#' @examples \dontrun{acos(df$c)}
-#' @aliases acos,Column-method
+#' @aliases acos acos,Column-method
 #' @note acos since 1.5.0
 setMethod("acos",
   signature(x = "Column"),
@@ -196,19 +209,13 @@ setMethod("ascii",
 column(jc)
   })
 
-#' asin
-#'
-#' Computes the sine inverse of the given value; the returned angle is in the 
range
-#' -pi/2 through pi/2.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{asin}: Computes the sine inverse of the given value; the returned 
angle is in
+#' the range -pi/2 through pi/2.
 #'
-#' @rdname asin
-#' @name asin
-#' @family math functions
+#' @rdname column_math_functions
 #' @export
-#' @aliases asin,Column-method
-#' @examples \dontrun{asin(df$c)}
+#' @aliases asin asin,Column-method
 #' @note asin since 1.5.0
 setMethod("asin",
   signature(x = "Column"),
@@ -217,18 +224,12 @@ setMethod("asin",
 column(jc)
   })
 
-#' atan
-#'
-#' Computes the tangent inverse of the given value.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{atan}: Computes the tangent inverse of the given value.
 #'
-#' @rdname atan
-#' @name atan
-#' @family math functions
+#' @rdname column_math_functions
 #' @export
-#' @aliases atan,Column-method
-#' @examples \dontrun{atan(df$c)}
+#' @aliases atan atan,

spark git commit: [SPARK-20889][SPARKR] Grouped documentation for STRING column methods

2017-06-28 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master b72b8521d -> 376d90d55


[SPARK-20889][SPARKR] Grouped documentation for STRING column methods

## What changes were proposed in this pull request?

Grouped documentation for string column methods.

Author: actuaryzhang 
Author: Wayne Zhang 

Closes #18366 from actuaryzhang/sparkRDocString.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/376d90d5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/376d90d5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/376d90d5

Branch: refs/heads/master
Commit: 376d90d556fcd4fd84f70ee42a1323e1f48f829d
Parents: b72b852
Author: actuaryzhang 
Authored: Wed Jun 28 19:31:54 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 28 19:31:54 2017 -0700

--
 R/pkg/R/functions.R | 573 ---
 R/pkg/R/generics.R  |  84 ---
 2 files changed, 300 insertions(+), 357 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/376d90d5/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 23ccdf9..70ea620 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -111,6 +111,27 @@ NULL
 #' head(tmp)}
 NULL
 
+#' String functions for Column operations
+#'
+#' String functions defined for \code{Column}.
+#'
+#' @param x Column to compute on except in the following methods:
+#'  \itemize{
+#'  \item \code{instr}: \code{character}, the substring to check. See 
'Details'.
+#'  \item \code{format_number}: \code{numeric}, the number of decimal 
place to
+#'   format to. See 'Details'.
+#'  }
+#' @param y Column to compute on.
+#' @param ... additional columns.
+#' @name column_string_functions
+#' @rdname column_string_functions
+#' @family string functions
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))}
+NULL
+
 #' lit
 #'
 #' A new \linkS4class{Column} is created to represent the literal value.
@@ -188,19 +209,17 @@ setMethod("approxCountDistinct",
 column(jc)
   })
 
-#' ascii
-#'
-#' Computes the numeric value of the first character of the string column, and 
returns the
-#' result as a int column.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{ascii}: Computes the numeric value of the first character of the 
string column,
+#' and returns the result as an int column.
 #'
-#' @rdname ascii
-#' @name ascii
-#' @family string functions
+#' @rdname column_string_functions
 #' @export
-#' @aliases ascii,Column-method
-#' @examples \dontrun{\dontrun{ascii(df$c)}}
+#' @aliases ascii ascii,Column-method
+#' @examples
+#'
+#' \dontrun{
+#' head(select(df, ascii(df$Class), ascii(df$Sex)))}
 #' @note ascii since 1.5.0
 setMethod("ascii",
   signature(x = "Column"),
@@ -256,19 +275,22 @@ setMethod("avg",
 column(jc)
   })
 
-#' base64
-#'
-#' Computes the BASE64 encoding of a binary column and returns it as a string 
column.
-#' This is the reverse of unbase64.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{base64}: Computes the BASE64 encoding of a binary column and returns 
it as
+#' a string column. This is the reverse of unbase64.
 #'
-#' @rdname base64
-#' @name base64
-#' @family string functions
+#' @rdname column_string_functions
 #' @export
-#' @aliases base64,Column-method
-#' @examples \dontrun{base64(df$c)}
+#' @aliases base64 base64,Column-method
+#' @examples
+#'
+#' \dontrun{
+#' tmp <- mutate(df, s1 = encode(df$Class, "UTF-8"))
+#' str(tmp)
+#' tmp2 <- mutate(tmp, s2 = base64(tmp$s1), s3 = decode(tmp$s1, "UTF-8"),
+#' s4 = soundex(tmp$Sex))
+#' head(tmp2)
+#' head(select(tmp2, unbase64(tmp2$s2)))}
 #' @note base64 since 1.5.0
 setMethod("base64",
   signature(x = "Column"),
@@ -620,20 +642,16 @@ setMethod("dayofyear",
 column(jc)
   })
 
-#' decode
-#'
-#' Computes the first argument into a string from a binary using the provided 
character set
-#' (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 
'UTF-16').
+#' @details
+#' \code{decode}: Computes the first argument into a string from a binary 
using the provided
+#' character set.
 #'
-#' @param x Column to compute on.
-#' @param charset Character set to use
+#' @param charset Character set to use (one of "US-ASCII", "ISO-8859-1", 
"UTF-8", "UTF-16BE",
+#'"UTF-16LE", "UTF-16").
 #'
-#' @rdname decode
-#' @name decode
-#' @family string functions
-#' @aliases decode,Column,character-method
+#' @rdname column_string_functions
+#' @aliases decode decode,Column,character-method
 #' @export
-#' @examples \dontrun{decode(df$c, "UTF-8")}
 

spark git commit: [SPARK-21224][R] Specify a schema by using a DDL-formatted string when reading in R

2017-06-28 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 0c8444cf6 -> db44f5f3e


[SPARK-21224][R] Specify a schema by using a DDL-formatted string when reading 
in R

## What changes were proposed in this pull request?

This PR proposes to support a DDL-formatted string as the schema, as below:

```r
mockLines <- c("{\"name\":\"Michael\"}",
   "{\"name\":\"Andy\", \"age\":30}",
   "{\"name\":\"Justin\", \"age\":19}")
jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(mockLines, jsonPath)
df <- read.df(jsonPath, "json", "name STRING, age DOUBLE")
collect(df)
```

## How was this patch tested?

Tests added in `test_streaming.R` and `test_sparkSQL.R` and manual tests.

Author: hyukjinkwon 

Closes #18431 from HyukjinKwon/r-ddl-schema.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db44f5f3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db44f5f3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db44f5f3

Branch: refs/heads/master
Commit: db44f5f3e8b5bc28c33b154319539d51c05a089c
Parents: 0c8444c
Author: hyukjinkwon 
Authored: Wed Jun 28 19:36:00 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 28 19:36:00 2017 -0700

--
 R/pkg/R/SQLContext.R| 38 +---
 R/pkg/tests/fulltests/test_sparkSQL.R   | 20 +--
 R/pkg/tests/fulltests/test_streaming.R  | 23 
 .../org/apache/spark/sql/api/r/SQLUtils.scala   | 15 
 4 files changed, 67 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/db44f5f3/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index e3528bc..3b7f71b 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -584,7 +584,7 @@ tableToDF <- function(tableName) {
 #'
 #' @param path The path of files to load
 #' @param source The name of external data source
-#' @param schema The data schema defined in structType
+#' @param schema The data schema defined in structType or a DDL-formatted 
string.
 #' @param na.strings Default string value for NA when source is "csv"
 #' @param ... additional external data source specific named properties.
 #' @return SparkDataFrame
@@ -600,6 +600,8 @@ tableToDF <- function(tableName) {
 #'  structField("info", "map"))
 #' df2 <- read.df(mapTypeJsonPath, "json", schema, multiLine = TRUE)
 #' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true")
+#' stringSchema <- "name STRING, info MAP"
+#' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE)
 #' }
 #' @name read.df
 #' @method read.df default
@@ -623,14 +625,19 @@ read.df.default <- function(path = NULL, source = NULL, 
schema = NULL, na.string
   if (source == "csv" && is.null(options[["nullValue"]])) {
 options[["nullValue"]] <- na.strings
   }
+  read <- callJMethod(sparkSession, "read")
+  read <- callJMethod(read, "format", source)
   if (!is.null(schema)) {
-stopifnot(class(schema) == "structType")
-sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", 
sparkSession,
-  source, schema$jobj, options)
-  } else {
-sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", 
sparkSession,
-  source, options)
+if (class(schema) == "structType") {
+  read <- callJMethod(read, "schema", schema$jobj)
+} else if (is.character(schema)) {
+  read <- callJMethod(read, "schema", schema)
+} else {
+  stop("schema should be structType or character.")
+}
   }
+  read <- callJMethod(read, "options", options)
+  sdf <- handledCallJMethod(read, "load")
   dataFrame(sdf)
 }
 
@@ -717,8 +724,8 @@ read.jdbc <- function(url, tableName,
 #' "spark.sql.sources.default" will be used.
 #'
 #' @param source The name of external data source
-#' @param schema The data schema defined in structType, this is required for 
file-based streaming
-#'   data source
+#' @param schema The data schema defined in structType or a DDL-formatted 
string, this is
+#'   required for file-based streaming data source
 #' @param ... additional external data source specific named options, for 
instance \code{path} for
 #'file-based streaming data source
 #' @return SparkDataFrame
@@ -733,6 +740,8 @@ read.jdbc <- function(url, tableName,
 #' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = 
"/home/user/cp")
 #'
 #' df <- read.stream("json", path = jsonDir, schema = schema, 
maxFilesPerTrigger = 1)
+#' stringSchema <- "name STRING, info MAP"
+#' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, 
maxFilesPerTrigger = 1)
 #' }
 #'

spark git commit: Revert "[SPARK-21094][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak"

2017-06-28 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master db44f5f3e -> fc92d25f2


Revert "[SPARK-21094][R] Terminate R's worker processes in the parent of R's 
daemon to prevent a leak"

This reverts commit 6b3d02285ee0debc73cbcab01b10398a498fbeb8.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc92d25f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc92d25f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc92d25f

Branch: refs/heads/master
Commit: fc92d25f2a27e81ef2d5031dcf856af1cc1d8c31
Parents: db44f5f
Author: Felix Cheung 
Authored: Wed Jun 28 20:06:29 2017 -0700
Committer: Felix Cheung 
Committed: Wed Jun 28 20:06:29 2017 -0700

--
 R/pkg/inst/worker/daemon.R | 59 +++--
 1 file changed, 4 insertions(+), 55 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fc92d25f/R/pkg/inst/worker/daemon.R
--
diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R
index 6e385b2..3a318b7 100644
--- a/R/pkg/inst/worker/daemon.R
+++ b/R/pkg/inst/worker/daemon.R
@@ -30,55 +30,8 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT"))
 inputCon <- socketConnection(
 port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout)
 
-# Waits indefinitely for a socket connecion by default.
-selectTimeout <- NULL
-
-# Exit code that children send to the parent to indicate they exited.
-exitCode <- 1
-
 while (TRUE) {
-  ready <- socketSelect(list(inputCon), timeout = selectTimeout)
-
-  # Note that the children should be terminated in the parent. If each child 
terminates
-  # itself, it appears that the resource is not released properly, that causes 
an unexpected
-  # termination of this daemon due to, for example, running out of file 
descriptors
-  # (see SPARK-21093). Therefore, the current implementation tries to retrieve 
children
-  # that are exited (but not terminated) and then sends a kill signal to 
terminate them properly
-  # in the parent.
-  #
-  # There are two paths that it attempts to send a signal to terminate the 
children in the parent.
-  #
-  #   1. Every second if any socket connection is not available and if there 
are child workers
-  # running.
-  #   2. Right after a socket connection is available.
-  #
-  # In other words, the parent attempts to send the signal to the children 
every second if
-  # any worker is running or right before launching other worker children from 
the following
-  # new socket connection.
-
-  # Only the process IDs of children sent data to the parent are returned 
below. The children
-  # send a custom exit code to the parent after being exited and the parent 
tries
-  # to terminate them only if they sent the exit code.
-  children <- parallel:::selectChildren(timeout = 0)
-
-  if (is.integer(children)) {
-lapply(children, function(child) {
-  # This data should be raw bytes if any data was sent from this child.
-  # Otherwise, this returns the PID.
-  data <- parallel:::readChild(child)
-  if (is.raw(data)) {
-# This checks if the data from this child is the exit code that 
indicates an exited child.
-if (unserialize(data) == exitCode) {
-  # If so, we terminate this child.
-  tools::pskill(child, tools::SIGUSR1)
-}
-  }
-})
-  } else if (is.null(children)) {
-# If it is NULL, there are no children. Waits indefinitely for a socket 
connecion.
-selectTimeout <- NULL
-  }
-
+  ready <- socketSelect(list(inputCon))
   if (ready) {
 port <- SparkR:::readInt(inputCon)
 # There is a small chance that it could be interrupted by signal, retry 
one time
@@ -91,16 +44,12 @@ while (TRUE) {
 }
 p <- parallel:::mcfork()
 if (inherits(p, "masterProcess")) {
-  # Reach here because this is a child process.
   close(inputCon)
   Sys.setenv(SPARKR_WORKER_PORT = port)
   try(source(script))
-  # Note that this mcexit does not fully terminate this child. So, this 
writes back
-  # a custom exit code so that the parent can read and terminate this 
child.
-  parallel:::mcexit(0L, send = exitCode)
-} else {
-  # Forking succeeded and we need to check if they finished their jobs 
every second.
-  selectTimeout <- 1
+  # Set SIGUSR1 so that child can exit
+  tools::pskill(Sys.getpid(), tools::SIGUSR1)
+  parallel:::mcexit(0L)
 }
   }
 }





spark git commit: [SPARK-20889][SPARKR] Grouped documentation for NONAGGREGATE column methods

2017-06-29 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 9f6b3e65c -> a2d562354


[SPARK-20889][SPARKR] Grouped documentation for NONAGGREGATE column methods

## What changes were proposed in this pull request?

Grouped documentation for nonaggregate column methods.

Author: actuaryzhang 
Author: Wayne Zhang 

Closes #18422 from actuaryzhang/sparkRDocNonAgg.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2d56235
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2d56235
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2d56235

Branch: refs/heads/master
Commit: a2d5623548194f15989e7b68118d744673e33819
Parents: 9f6b3e6
Author: actuaryzhang 
Authored: Thu Jun 29 01:23:13 2017 -0700
Committer: Felix Cheung 
Committed: Thu Jun 29 01:23:13 2017 -0700

--
 R/pkg/R/functions.R | 360 +++
 R/pkg/R/generics.R  |  55 +---
 2 files changed, 182 insertions(+), 233 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a2d56235/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 70ea620..cb09e84 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -132,23 +132,39 @@ NULL
 #' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))}
 NULL
 
-#' lit
+#' Non-aggregate functions for Column operations
 #'
-#' A new \linkS4class{Column} is created to represent the literal value.
-#' If the parameter is a \linkS4class{Column}, it is returned unchanged.
+#' Non-aggregate functions defined for \code{Column}.
 #'
-#' @param x a literal value or a Column.
+#' @param x Column to compute on. In \code{lit}, it is a literal value or a 
Column.
+#'  In \code{expr}, it contains an expression character object to be 
parsed.
+#' @param y Column to compute on.
+#' @param ... additional Columns.
+#' @name column_nonaggregate_functions
+#' @rdname column_nonaggregate_functions
+#' @seealso coalesce,SparkDataFrame-method
 #' @family non-aggregate functions
-#' @rdname lit
-#' @name lit
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}
+NULL
+
+#' @details
+#' \code{lit}: A new Column is created to represent the literal value.
+#' If the parameter is a Column, it is returned unchanged.
+#'
+#' @rdname column_nonaggregate_functions
 #' @export
-#' @aliases lit,ANY-method
+#' @aliases lit lit,ANY-method
 #' @examples
+#'
 #' \dontrun{
-#' lit(df$name)
-#' select(df, lit("x"))
-#' select(df, lit("2015-01-01"))
-#'}
+#' tmp <- mutate(df, v1 = lit(df$mpg), v2 = lit("x"), v3 = lit("2015-01-01"),
+#'   v4 = negate(df$mpg), v5 = expr('length(model)'),
+#'   v6 = greatest(df$vs, df$am), v7 = least(df$vs, df$am),
+#'   v8 = column("mpg"))
+#' head(tmp)}
 #' @note lit since 1.5.0
 setMethod("lit", signature("ANY"),
   function(x) {
@@ -314,18 +330,16 @@ setMethod("bin",
 column(jc)
   })
 
-#' bitwiseNOT
-#'
-#' Computes bitwise NOT.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{bitwiseNOT}: Computes bitwise NOT.
 #'
-#' @rdname bitwiseNOT
-#' @name bitwiseNOT
-#' @family non-aggregate functions
+#' @rdname column_nonaggregate_functions
 #' @export
-#' @aliases bitwiseNOT,Column-method
-#' @examples \dontrun{bitwiseNOT(df$c)}
+#' @aliases bitwiseNOT bitwiseNOT,Column-method
+#' @examples
+#'
+#' \dontrun{
+#' head(select(df, bitwiseNOT(cast(df$vs, "int"))))}
 #' @note bitwiseNOT since 1.5.0
 setMethod("bitwiseNOT",
   signature(x = "Column"),
@@ -375,16 +389,12 @@ setMethod("ceiling",
 ceil(x)
   })
 
-#' Returns the first column that is not NA
-#'
-#' Returns the first column that is not NA, or NA if all inputs are.
+#' @details
+#' \code{coalesce}: Returns the first column that is not NA, or NA if all 
inputs are.
 #'
-#' @rdname coalesce
-#' @name coalesce
-#' @family non-aggregate functions
+#' @rdname column_nonaggregate_functions
 #' @export
 #' @aliases coalesce,Column-method
-#' @examples \dontrun{coalesce(df$c, df$d, df$e)}
 #' @note coalesce(Column) since 2.1.1
 setMethod("coalesce",
   signature(x = "Column"),
@@ -824,22 +834,24 @@ setMethod("initcap",
 column(jc)
   })
 
-#' is.nan
-#'
-#' Return true if the column is NaN, alias for \link{isnan}
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{isnan}: Returns true if the column is NaN.
+#' @rdname column_nonaggregate_functions
+#' @aliases isnan isnan,Column-method
+#' @note isnan since 2.0.0
+setMethod("isnan",
+  signature(x = "Column"),
+  function(x) {
+jc <- callJStatic("org.apache.spark.sql.functions", "isna
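
As a quick illustration of the non-aggregate column functions whose documentation is grouped above, here is a minimal SparkR sketch. It is not part of the commit; the session name and derived column names are made up, and it assumes a running Spark installation.

```r
library(SparkR)
sparkR.session(appName = "nonaggregate-doc-example")

# Same mtcars-based frame used in the grouped roxygen examples.
df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))

tmp <- mutate(df,
              tag     = lit("x"),                  # literal value as a Column
              neg_mpg = negate(df$mpg),            # arithmetic negation
              best    = greatest(df$vs, df$am),    # row-wise maximum
              mpg2    = coalesce(df$mpg, lit(0)))  # first non-NA value
head(select(tmp, "model", "tag", "neg_mpg", "best", "mpg2"))

sparkR.session.stop()
```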

spark git commit: [SPARK-20889][SPARKR] Grouped documentation for MISC column methods

2017-06-29 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master e2f32ee45 -> fddb63f46


[SPARK-20889][SPARKR] Grouped documentation for MISC column methods

## What changes were proposed in this pull request?
Grouped documentation for column misc methods.

Author: actuaryzhang 
Author: Wayne Zhang 

Closes #18448 from actuaryzhang/sparkRDocMisc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fddb63f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fddb63f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fddb63f4

Branch: refs/heads/master
Commit: fddb63f46345be36c40d9a7f3660920af6502bbd
Parents: e2f32ee
Author: actuaryzhang 
Authored: Thu Jun 29 21:35:01 2017 -0700
Committer: Felix Cheung 
Committed: Thu Jun 29 21:35:01 2017 -0700

--
 R/pkg/R/functions.R | 98 ++--
 R/pkg/R/generics.R  | 15 +---
 2 files changed, 55 insertions(+), 58 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fddb63f4/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index cb09e84..67cb7a7 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -150,6 +150,27 @@ NULL
 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}
 NULL
 
+#' Miscellaneous functions for Column operations
+#'
+#' Miscellaneous functions defined for \code{Column}.
+#'
+#' @param x Column to compute on. In \code{sha2}, it is one of 224, 256, 384, 
or 512.
+#' @param y Column to compute on.
+#' @param ... additional Columns.
+#' @name column_misc_functions
+#' @rdname column_misc_functions
+#' @family misc functions
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)[, 1:2])
+#' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model),
+#'   v3 = hash(df$model, df$mpg), v4 = md5(df$model),
+#'   v5 = sha1(df$model), v6 = sha2(df$model, 256))
+#' head(tmp)
+#' }
+NULL
+
 #' @details
 #' \code{lit}: A new Column is created to represent the literal value.
 #' If the parameter is a Column, it is returned unchanged.
@@ -569,19 +590,13 @@ setMethod("count",
 column(jc)
   })
 
-#' crc32
-#'
-#' Calculates the cyclic redundancy check value  (CRC32) of a binary column and
-#' returns the value as a bigint.
-#'
-#' @param x Column to compute on.
+#' @details
+#' \code{crc32}: Calculates the cyclic redundancy check value  (CRC32) of a 
binary column
+#' and returns the value as a bigint.
 #'
-#' @rdname crc32
-#' @name crc32
-#' @family misc functions
-#' @aliases crc32,Column-method
+#' @rdname column_misc_functions
+#' @aliases crc32 crc32,Column-method
 #' @export
-#' @examples \dontrun{crc32(df$c)}
 #' @note crc32 since 1.5.0
 setMethod("crc32",
   signature(x = "Column"),
@@ -590,19 +605,13 @@ setMethod("crc32",
 column(jc)
   })
 
-#' hash
-#'
-#' Calculates the hash code of given columns, and returns the result as a int 
column.
-#'
-#' @param x Column to compute on.
-#' @param ... additional Column(s) to be included.
+#' @details
+#' \code{hash}: Calculates the hash code of given columns, and returns the 
result
+#' as an int column.
 #'
-#' @rdname hash
-#' @name hash
-#' @family misc functions
-#' @aliases hash,Column-method
+#' @rdname column_misc_functions
+#' @aliases hash hash,Column-method
 #' @export
-#' @examples \dontrun{hash(df$c)}
 #' @note hash since 2.0.0
 setMethod("hash",
   signature(x = "Column"),
@@ -1055,19 +1064,13 @@ setMethod("max",
 column(jc)
   })
 
-#' md5
-#'
-#' Calculates the MD5 digest of a binary column and returns the value
+#' @details
+#' \code{md5}: Calculates the MD5 digest of a binary column and returns the 
value
 #' as a 32 character hex string.
 #'
-#' @param x Column to compute on.
-#'
-#' @rdname md5
-#' @name md5
-#' @family misc functions
-#' @aliases md5,Column-method
+#' @rdname column_misc_functions
+#' @aliases md5 md5,Column-method
 #' @export
-#' @examples \dontrun{md5(df$c)}
 #' @note md5 since 1.5.0
 setMethod("md5",
   signature(x = "Column"),
@@ -1307,19 +1310,13 @@ setMethod("second",
 column(jc)
   })
 
-#' sha1
-#'
-#' Calculates the SHA-1 digest of a binary column and returns the value
+#' @details
+#' \code{sha1}: Calculates the SHA-1 digest of a binary column and returns the 
value
 #' as a 40 character hex string.
 #'
-#' @param x Column to compute on.
-#'
-#' @rdname sha1
-#' @name sha1
-#' @family misc functions
-#' @aliases sha1,Column-method
+#' @rdname column_misc_functions
+#' @aliases sha1 sha1,Column-method
 #' @export
-#' @examples \dontrun{sha1(df$c)}
 #' @note sha1 since 1.5.0
 setMe
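
As a usage illustration of the misc functions grouped above, here is a small SparkR sketch. It is not part of the commit; it follows the mtcars-based roxygen example, and the session and variable names are made up.

```r
library(SparkR)
sparkR.session(appName = "misc-doc-example")

# Only the model name and mpg columns are needed here.
df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)[, 1:2])

tmp <- mutate(df,
              crc = crc32(df$model),          # CRC32 checksum as a bigint
              h   = hash(df$model, df$mpg),   # hash code over several columns
              m   = md5(df$model),            # 32-character hex digest
              s1  = sha1(df$model),           # 40-character hex digest
              s2  = sha2(df$model, 256))      # SHA-2 family with 256-bit output
head(tmp)

sparkR.session.stop()
```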

spark git commit: [SPARK-20889][SPARKR] Grouped documentation for COLLECTION column methods

2017-06-29 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master fddb63f46 -> 52981715b


[SPARK-20889][SPARKR] Grouped documentation for COLLECTION column methods

## What changes were proposed in this pull request?

Grouped documentation for column collection methods.

Author: actuaryzhang 
Author: Wayne Zhang 

Closes #18458 from actuaryzhang/sparkRDocCollection.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52981715
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52981715
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52981715

Branch: refs/heads/master
Commit: 52981715bb8d653a1141f55b36da804412eb783a
Parents: fddb63f
Author: actuaryzhang 
Authored: Thu Jun 29 23:00:50 2017 -0700
Committer: Felix Cheung 
Committed: Thu Jun 29 23:00:50 2017 -0700

--
 R/pkg/R/functions.R | 204 +--
 R/pkg/R/generics.R  |  27 ---
 2 files changed, 108 insertions(+), 123 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/52981715/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 67cb7a7..a1f5c4f 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -171,6 +171,35 @@ NULL
 #' }
 NULL
 
+#' Collection functions for Column operations
+#'
+#' Collection functions defined for \code{Column}.
+#'
+#' @param x Column to compute on. Note the difference in the following methods:
+#'  \itemize{
+#'  \item \code{to_json}: it is the column containing the struct or 
array of the structs.
+#'  \item \code{from_json}: it is the column containing the JSON 
string.
+#'  }
+#' @param ... additional argument(s). In \code{to_json} and \code{from_json}, 
this contains
+#'additional named properties to control how it is converted, 
accepts the same
+#'options as the JSON data source.
+#' @name column_collection_functions
+#' @rdname column_collection_functions
+#' @family collection functions
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
+#' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
+#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
+#' tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
+#' head(tmp2)
+#' head(select(tmp, posexplode(tmp$v1)))
+#' head(select(tmp, sort_array(tmp$v1)))
+#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))}
+NULL
+
 #' @details
 #' \code{lit}: A new Column is created to represent the literal value.
 #' If the parameter is a Column, it is returned unchanged.
@@ -1642,30 +1671,23 @@ setMethod("to_date",
 column(jc)
   })
 
-#' to_json
-#'
-#' Converts a column containing a \code{structType} or array of 
\code{structType} into a Column
-#' of JSON string. Resolving the Column can fail if an unsupported type is 
encountered.
-#'
-#' @param x Column containing the struct or array of the structs
-#' @param ... additional named properties to control how it is converted, 
accepts the same options
-#'as the JSON data source.
+#' @details
+#' \code{to_json}: Converts a column containing a \code{structType} or array 
of \code{structType}
+#' into a Column of JSON string. Resolving the Column can fail if an 
unsupported type is encountered.
 #'
-#' @family non-aggregate functions
-#' @rdname to_json
-#' @name to_json
-#' @aliases to_json,Column-method
+#' @rdname column_collection_functions
+#' @aliases to_json to_json,Column-method
 #' @export
 #' @examples
+#'
 #' \dontrun{
 #' # Converts a struct into a JSON object
-#' df <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
-#' select(df, to_json(df$d, dateFormat = 'dd/MM/yyyy'))
+#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
+#' select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy'))
 #'
 #' # Converts an array of structs into a JSON array
-#' df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 
'Alice')) as people")
-#' select(df, to_json(df$people))
-#'}
+#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 
'Alice')) as people")
+#' df2 <- mutate(df2, people_json = to_json(df2$people))}
 #' @note to_json since 2.2.0
 setMethod("to_json", signature(x = "Column"),
   function(x, ...) {
@@ -2120,28 +2142,28 @@ setMethod("date_format", signature(y = "Column", x = 
"character"),
 column(jc)
   })
 
-#' from_json
-#'
-#' Parses a column containing a JSON string into a Column of \code{structType} 
with the specified
-#' \code{schema} or array of \code{structType} if \code{as.json.array} is set 
to \code{TR
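
For reference, a short SparkR sketch exercising the collection functions grouped above. It is not part of the commit; it reuses the array example from the new roxygen doc, and the session name is made up.

```r
library(SparkR)
sparkR.session(appName = "collection-doc-example")

df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))

# Pack three numeric columns into an array column, then inspect it.
tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
head(select(tmp, sort_array(tmp$v1, asc = FALSE)))

# explode() turns each array element into its own row.
tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
head(tmp2)

sparkR.session.stop()
```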

spark git commit: [SPARK-20889][SPARKR] Grouped documentation for WINDOW column methods

2017-07-04 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 4d6d8192c -> cec392150


[SPARK-20889][SPARKR] Grouped documentation for WINDOW column methods

## What changes were proposed in this pull request?

Grouped documentation for column window methods.

Author: actuaryzhang 

Closes #18481 from actuaryzhang/sparkRDocWindow.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cec39215
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cec39215
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cec39215

Branch: refs/heads/master
Commit: cec392150451a64c9c2902b7f8f4b3b38f25cbea
Parents: 4d6d819
Author: actuaryzhang 
Authored: Tue Jul 4 12:18:51 2017 -0700
Committer: Felix Cheung 
Committed: Tue Jul 4 12:18:51 2017 -0700

--
 R/pkg/R/functions.R | 225 +++
 R/pkg/R/generics.R  |  28 +++---
 2 files changed, 88 insertions(+), 165 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/cec39215/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index a1f5c4f..8c12308 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -200,6 +200,34 @@ NULL
 #' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))}
 NULL
 
+#' Window functions for Column operations
+#'
+#' Window functions defined for \code{Column}.
+#'
+#' @param x In \code{lag} and \code{lead}, it is the column as a character 
string or a Column
+#'  to compute on. In \code{ntile}, it is the number of ntile groups.
+#' @param offset In \code{lag}, the number of rows back from the current row 
from which to obtain
+#'   a value. In \code{lead}, the number of rows after the current 
row from which to
+#'   obtain a value. If not specified, the default is 1.
+#' @param defaultValue (optional) default to use when the offset row does not 
exist.
+#' @param ... additional argument(s).
+#' @name column_window_functions
+#' @rdname column_window_functions
+#' @family window functions
+#' @examples
+#' \dontrun{
+#' # Dataframe used throughout this doc
+#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#' tmp <- mutate(df, dist = over(cume_dist(), ws), dense_rank = 
over(dense_rank(), ws),
+#'   lag = over(lag(df$mpg), ws), lead = over(lead(df$mpg, 1), ws),
+#'   percent_rank = over(percent_rank(), ws),
+#'   rank = over(rank(), ws), row_number = over(row_number(), ws))
+#' # Get ntile group id (1-4) for hp
+#' tmp <- mutate(tmp, ntile = over(ntile(4), ws))
+#' head(tmp)}
+NULL
+
 #' @details
 #' \code{lit}: A new Column is created to represent the literal value.
 #' If the parameter is a Column, it is returned unchanged.
@@ -2844,27 +2872,16 @@ setMethod("ifelse",
 
 ## Window functions##
 
-#' cume_dist
-#'
-#' Window function: returns the cumulative distribution of values within a 
window partition,
-#' i.e. the fraction of rows that are below the current row.
-#'
-#'   N = total number of rows in the partition
-#'   cume_dist(x) = number of values before (and including) x / N
-#'
+#' @details
+#' \code{cume_dist}: Returns the cumulative distribution of values within a 
window partition,
+#' i.e. the fraction of rows that are below the current row:
+#' (number of values before and including x) / (total number of rows in the 
partition).
 #' This is equivalent to the \code{CUME_DIST} function in SQL.
+#' The method should be used with no argument.
 #'
-#' @rdname cume_dist
-#' @name cume_dist
-#' @family window functions
-#' @aliases cume_dist,missing-method
+#' @rdname column_window_functions
+#' @aliases cume_dist cume_dist,missing-method
 #' @export
-#' @examples
-#' \dontrun{
-#'   df <- createDataFrame(mtcars)
-#'   ws <- orderBy(windowPartitionBy("am"), "hp")
-#'   out <- select(df, over(cume_dist(), ws), df$hp, df$am)
-#' }
 #' @note cume_dist since 1.6.0
 setMethod("cume_dist",
   signature("missing"),
@@ -2873,28 +2890,19 @@ setMethod("cume_dist",
 column(jc)
   })
 
-#' dense_rank
-#'
-#' Window function: returns the rank of rows within a window partition, 
without any gaps.
+#' @details
+#' \code{dense_rank}: Returns the rank of rows within a window partition, 
without any gaps.
 #' The difference between rank and dense_rank is that dense_rank leaves no 
gaps in ranking
 #' sequence when there are ties. That is, if you were ranking a competition 
using dense_rank
 #' and had three people tie for second place, you would say that all three 
were in second
 #' place and that the next person came in third. Rank would give me sequential 
numbers, making
 #' the person that came in third place (af
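
A minimal SparkR sketch of the window functions grouped above. It is not part of the commit; it follows the mtcars-based roxygen example, and the session and column names are made up.

```r
library(SparkR)
sparkR.session(appName = "window-doc-example")

df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))

# Partition the rows by transmission type (am) and order them by horsepower (hp).
ws <- orderBy(windowPartitionBy("am"), "hp")

ranked <- mutate(df,
                 rnk      = over(rank(), ws),        # rank with gaps on ties
                 drnk     = over(dense_rank(), ws),  # rank without gaps
                 cumedist = over(cume_dist(), ws),   # fraction of rows at or below
                 prev_mpg = over(lag(df$mpg, 1), ws))
head(ranked)

sparkR.session.stop()
```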

spark git commit: [MINOR][SPARKR] ignore Rplots.pdf test output after running R tests

2017-07-04 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master cec392150 -> daabf425e


[MINOR][SPARKR] ignore Rplots.pdf test output after running R tests

## What changes were proposed in this pull request?

After running R tests in a local build, an Rplots.pdf file is produced. This file should be ignored in the git repository.

Author: wangmiao1981 

Closes #18518 from wangmiao1981/ignore.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/daabf425
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/daabf425
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/daabf425

Branch: refs/heads/master
Commit: daabf425ec0272951b11f286e4bec7a48f42cc0d
Parents: cec3921
Author: wangmiao1981 
Authored: Tue Jul 4 12:37:29 2017 -0700
Committer: Felix Cheung 
Committed: Tue Jul 4 12:37:29 2017 -0700

--
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/daabf425/.gitignore
--
diff --git a/.gitignore b/.gitignore
index 1d91b43..cf9780d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ R-unit-tests.log
 R/unit-tests.out
 R/cran-check.out
 R/pkg/vignettes/sparkr-vignettes.html
+R/pkg/tests/fulltests/Rplots.pdf
 build/*.jar
 build/apache-maven*
 build/scala*





spark git commit: [SPARK-20889][SPARKR][FOLLOWUP] Clean up grouped doc for column methods

2017-07-04 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master ce10545d3 -> e9a93f814


[SPARK-20889][SPARKR][FOLLOWUP] Clean up grouped doc for column methods

## What changes were proposed in this pull request?
Add doc for methods that were left out, and fix various style and consistency 
issues.

Author: actuaryzhang 

Closes #18493 from actuaryzhang/sparkRDocCleanup.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9a93f81
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9a93f81
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9a93f81

Branch: refs/heads/master
Commit: e9a93f8140c913b91781b35e0e1b051c30244882
Parents: ce10545
Author: actuaryzhang 
Authored: Tue Jul 4 21:05:05 2017 -0700
Committer: Felix Cheung 
Committed: Tue Jul 4 21:05:05 2017 -0700

--
 R/pkg/R/functions.R | 100 +--
 R/pkg/R/generics.R  |   7 ++--
 2 files changed, 49 insertions(+), 58 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e9a93f81/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 8c12308..c529d83 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -38,10 +38,10 @@ NULL
 #'
 #' Date time functions defined for \code{Column}.
 #'
-#' @param x Column to compute on.
+#' @param x Column to compute on. In \code{window}, it must be a time Column 
of \code{TimestampType}.
 #' @param format For \code{to_date} and \code{to_timestamp}, it is the string 
to use to parse
-#'   x Column to DateType or TimestampType. For \code{trunc}, it 
is the string used
-#'   for specifying the truncation method. For example, "year",
"yyyy", "yy" for
+#'   Column \code{x} to DateType or TimestampType. For 
\code{trunc}, it is the string
+#'   to use to specify the truncation method. For example, "year",
"yyyy", "yy" for
 #'   truncate by year, or "month", "mon", "mm" for truncate by 
month.
 #' @param ... additional argument(s).
 #' @name column_datetime_functions
@@ -122,7 +122,7 @@ NULL
 #'   format to. See 'Details'.
 #'  }
 #' @param y Column to compute on.
-#' @param ... additional columns.
+#' @param ... additional Columns.
 #' @name column_string_functions
 #' @rdname column_string_functions
 #' @family string functions
@@ -167,8 +167,7 @@ NULL
 #' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model),
 #'   v3 = hash(df$model, df$mpg), v4 = md5(df$model),
 #'   v5 = sha1(df$model), v6 = sha2(df$model, 256))
-#' head(tmp)
-#' }
+#' head(tmp)}
 NULL
 
 #' Collection functions for Column operations
@@ -190,7 +189,6 @@ NULL
 #' \dontrun{
 #' # Dataframe used throughout this doc
 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
-#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
 #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
 #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
 #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
@@ -394,7 +392,7 @@ setMethod("base64",
   })
 
 #' @details
-#' \code{bin}: An expression that returns the string representation of the 
binary value
+#' \code{bin}: Returns the string representation of the binary value
 #' of the given long column. For example, bin("12") returns "1100".
 #'
 #' @rdname column_math_functions
@@ -722,7 +720,7 @@ setMethod("dayofyear",
 #' \code{decode}: Computes the first argument into a string from a binary 
using the provided
 #' character set.
 #'
-#' @param charset Character set to use (one of "US-ASCII", "ISO-8859-1", 
"UTF-8", "UTF-16BE",
+#' @param charset character set to use (one of "US-ASCII", "ISO-8859-1", 
"UTF-8", "UTF-16BE",
 #'"UTF-16LE", "UTF-16").
 #'
 #' @rdname column_string_functions
@@ -855,7 +853,7 @@ setMethod("hex",
   })
 
 #' @details
-#' \code{hour}: Extracts the hours as an integer from a given 
date/timestamp/string.
+#' \code{hour}: Extracts the hour as an integer from a given 
date/timestamp/string.
 #'
 #' @rdname column_datetime_functions
 #' @aliases hour hour,Column-method
@@ -1177,7 +1175,7 @@ setMethod("min",
   })
 
 #' @details
-#' \code{minute}: Extracts the minutes as an integer from a given 
date/timestamp/string.
+#' \code{minute}: Extracts the minute as an integer from a given 
date/timestamp/string.
 #'
 #' @rdname column_datetime_functions
 #' @aliases minute minute,Column-method
@@ -1354,7 +1352,7 @@ setMethod("sd",
   })
 
 #' @details
-#' \code{second}: Extracts the seconds as an integer from a given 
date/timestamp/string.
+#' \code{second}: Extracts the second as an integer from a given 
date/timestamp/string.
 #'
 #' @rdname column_datetime_fu
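
To make the cleaned-up "format" documentation above concrete, here is a small illustrative SparkR sketch. It is not part of the commit; the data, session name, and column names are made up.

```r
library(SparkR)
sparkR.session(appName = "datetime-format-example")

df <- createDataFrame(data.frame(d = c("04/07/2017", "05/07/2017")))

# Parse a string column into DateType with an explicit format string,
# then truncate the parsed dates to the start of their month.
parsed <- mutate(df, as_date = to_date(df$d, "dd/MM/yyyy"))
head(select(parsed, parsed$as_date, trunc(parsed$as_date, "month")))

sparkR.session.stop()
```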

spark git commit: [SPARK-20307][SPARKR] SparkR: pass on setHandleInvalid to spark.mllib functions that use StringIndexer

2017-07-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master d0bfc6733 -> a7b46c627


[SPARK-20307][SPARKR] SparkR: pass on setHandleInvalid to spark.mllib functions 
that use StringIndexer

## What changes were proposed in this pull request?

For the random forest classifier, if the test data contains unseen labels, it will throw an error. The StringIndexer already has the handleInvalid logic. This patch adds a new argument to set the underlying StringIndexer's handleInvalid behavior.

This should also apply to other classifiers. This PR focuses on the main logic and the random forest classifier; I will do follow-up PRs for the other classifiers.

## How was this patch tested?

Add a new unit test based on the error case in the JIRA.

Author: wangmiao1981 

Closes #18496 from wangmiao1981/handle.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a7b46c62
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a7b46c62
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a7b46c62

Branch: refs/heads/master
Commit: a7b46c627b5d2461257f337139a29f23350e0c77
Parents: d0bfc67
Author: wangmiao1981 
Authored: Fri Jul 7 23:51:32 2017 -0700
Committer: Felix Cheung 
Committed: Fri Jul 7 23:51:32 2017 -0700

--
 R/pkg/R/mllib_tree.R| 11 +++--
 R/pkg/tests/fulltests/test_mllib_tree.R | 17 +
 .../org/apache/spark/ml/feature/RFormula.scala  | 25 
 .../r/RandomForestClassificationWrapper.scala   |  4 +++-
 .../spark/ml/feature/StringIndexerSuite.scala   |  2 +-
 5 files changed, 55 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a7b46c62/R/pkg/R/mllib_tree.R
--
diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R
index 2f1220a..75b1a74 100644
--- a/R/pkg/R/mllib_tree.R
+++ b/R/pkg/R/mllib_tree.R
@@ -374,6 +374,10 @@ setMethod("write.ml", signature(object = 
"GBTClassificationModel", path = "chara
 #' nodes. If TRUE, the algorithm will cache node IDs for 
each instance. Caching
 #' can speed up training of deeper trees. Users can set 
how often should the
 #' cache be checkpointed or disable it by setting 
checkpointInterval.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL 
values) in classification model.
+#'Supported options: "skip" (filter out rows with invalid data),
+#'   "error" (throw an error), "keep" (put invalid 
data in a special additional
+#'   bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.randomForest,SparkDataFrame,formula-method
 #' @return \code{spark.randomForest} returns a fitted Random Forest model.
@@ -409,7 +413,8 @@ setMethod("spark.randomForest", signature(data = 
"SparkDataFrame", formula = "fo
maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL,
featureSubsetStrategy = "auto", seed = NULL, 
subsamplingRate = 1.0,
minInstancesPerNode = 1, minInfoGain = 0.0, 
checkpointInterval = 10,
-   maxMemoryInMB = 256, cacheNodeIds = FALSE) {
+   maxMemoryInMB = 256, cacheNodeIds = FALSE,
+   handleInvalid = c("error", "keep", "skip")) {
 type <- match.arg(type)
 formula <- paste(deparse(formula), collapse = "")
 if (!is.null(seed)) {
@@ -430,6 +435,7 @@ setMethod("spark.randomForest", signature(data = 
"SparkDataFrame", formula = "fo
  new("RandomForestRegressionModel", jobj = jobj)
},
classification = {
+ handleInvalid <- match.arg(handleInvalid)
  if (is.null(impurity)) impurity <- "gini"
  impurity <- match.arg(impurity, c("gini", "entropy"))
  jobj <- 
callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper",
@@ -439,7 +445,8 @@ setMethod("spark.randomForest", signature(data = 
"SparkDataFrame", formula = "fo
  as.numeric(minInfoGain), 
as.integer(checkpointInterval),
  as.character(featureSubsetStrategy), 
seed,
  as.numeric(subsamplingRate),
- as.integer(maxMemoryInMB), 
as.logical(cacheNodeIds))
+ as.integer(maxMemoryInMB), 
as.logical(cacheNodeIds),
+ handleInvalid)
  new("RandomForestClassificationModel", jobj = jobj)
}
 )

http://git-wip-u
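
A minimal, hypothetical SparkR sketch of the new argument. It is not part of the commit; the data and formula are invented, and it assumes the classification signature shown in the diff above, where handleInvalid defaults to "error".

```r
library(SparkR)
sparkR.session(appName = "rf-handleInvalid-example")

df <- createDataFrame(iris)   # column names become Sepal_Length, ..., Species

# "keep" routes unseen labels into an extra bucket instead of throwing an error.
model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width,
                            type = "classification",
                            numTrees = 10, handleInvalid = "keep")
summary(model)
head(predict(model, df))

sparkR.session.stop()
```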

spark git commit: [SPARK-20456][DOCS] Add examples for functions collection for pyspark

2017-07-07 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master a7b46c627 -> f5f02d213


[SPARK-20456][DOCS] Add examples for functions collection for pyspark

## What changes were proposed in this pull request?

This adds documentation to many functions in pyspark.sql.functions.py:
`upper`, `lower`, `reverse`, `unix_timestamp`, `from_unixtime`, `rand`, 
`randn`, `collect_list`, `collect_set`, `lit`
Adds units to the trigonometry functions.
Renames columns in datetime examples to be more informative.
Adds links between some functions.

## How was this patch tested?

`./dev/lint-python`
`python python/pyspark/sql/functions.py`
`./python/run-tests.py --module pyspark-sql`

Author: Michael Patterson 

Closes #17865 from map222/spark-20456.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5f02d21
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5f02d21
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5f02d21

Branch: refs/heads/master
Commit: f5f02d213d3151f58070e113d64fcded4f5d401e
Parents: a7b46c6
Author: Michael Patterson 
Authored: Fri Jul 7 23:59:34 2017 -0700
Committer: Felix Cheung 
Committed: Fri Jul 7 23:59:34 2017 -0700

--
 R/pkg/R/functions.R |  11 +-
 python/pyspark/sql/functions.py | 166 ---
 .../scala/org/apache/spark/sql/functions.scala  |  14 +-
 3 files changed, 119 insertions(+), 72 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f5f02d21/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index c529d83..f28d26a 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -336,7 +336,8 @@ setMethod("asin",
   })
 
 #' @details
-#' \code{atan}: Computes the tangent inverse of the given value.
+#' \code{atan}: Computes the tangent inverse of the given value; the returned 
angle is in the range
+#' -pi/2 through pi/2.
 #'
 #' @rdname column_math_functions
 #' @export
@@ -599,7 +600,7 @@ setMethod("covar_pop", signature(col1 = 
"characterOrColumn", col2 = "characterOr
   })
 
 #' @details
-#' \code{cos}: Computes the cosine of the given value.
+#' \code{cos}: Computes the cosine of the given value. Units in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases cos cos,Column-method
@@ -1407,7 +1408,7 @@ setMethod("sign", signature(x = "Column"),
   })
 
 #' @details
-#' \code{sin}: Computes the sine of the given value.
+#' \code{sin}: Computes the sine of the given value. Units in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases sin sin,Column-method
@@ -1597,7 +1598,7 @@ setMethod("sumDistinct",
   })
 
 #' @details
-#' \code{tan}: Computes the tangent of the given value.
+#' \code{tan}: Computes the tangent of the given value. Units in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases tan tan,Column-method
@@ -1896,7 +1897,7 @@ setMethod("year",
 
 #' @details
 #' \code{atan2}: Returns the angle theta from the conversion of rectangular 
coordinates
-#' (x, y) to polar coordinates (r, theta).
+#' (x, y) to polar coordinates (r, theta). Units in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases atan2 atan2,Column-method

http://git-wip-us.apache.org/repos/asf/spark/blob/f5f02d21/python/pyspark/sql/functions.py
--
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3416c4b..5d8ded8 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -67,9 +67,14 @@ def _create_window_function(name, doc=''):
 _.__doc__ = 'Window function: ' + doc
 return _
 
+_lit_doc = """
+Creates a :class:`Column` of literal value.
 
+>>> df.select(lit(5).alias('height')).withColumn('spark_user', 
lit(True)).take(1)
+[Row(height=5, spark_user=True)]
+"""
 _functions = {
-'lit': 'Creates a :class:`Column` of literal value.',
+'lit': _lit_doc,
 'col': 'Returns a :class:`Column` based on the given column name.',
 'column': 'Returns a :class:`Column` based on the given column name.',
 'asc': 'Returns a sort expression based on the ascending order of the 
given column name.',
@@ -95,10 +100,13 @@ _functions_1_4 = {
 '0.0 through pi.',
 'asin': 'Computes the sine inverse of the given value; the returned angle 
is in the range' +
 '-pi/2 through pi/2.',
-'atan': 'Computes the tangent inverse of the given value.',
+'atan': 'Computes the tangent inverse of the given value; the returned 
angle is in the range' +
+'-pi/2 through pi/2',
 'cbrt': 'Computes the cube-root of the given value.',
 'ceil': 'Computes the ceiling of the given value.',
-'cos': 'Co
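
On the R side, the practical takeaway is that the trigonometric Column functions operate in radians; here is a tiny illustrative sketch (not part of the commit; the data and session name are made up).

```r
library(SparkR)
sparkR.session(appName = "trig-units-example")

df <- createDataFrame(data.frame(x = c(0, pi / 6, pi / 2)))

# sin/cos take radians as input; atan returns an angle in radians.
head(select(df, df$x, sin(df$x), cos(df$x), atan(df$x)))

sparkR.session.stop()
```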

spark git commit: [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak

2017-07-08 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master c3712b77a -> 08e0d033b


[SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to 
prevent a leak

## What changes were proposed in this pull request?

This is a retry for #18320, which was reverted due to unexpected test failures with a -10 error code.

I was unable to reproduce the failures on macOS, CentOS, or Ubuntu; they happened only in Jenkins. So, the tests proceeded to verify this, and the previous attempt was reverted here - 
https://github.com/apache/spark/pull/18456

This new approach was tested in https://github.com/apache/spark/pull/18463.

**Test results**:

- With the part of suspicious change in the past try 
(https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189)

  Tests ran 4 times: 2 passed and 2 failed.

- Without the part of suspicious change in the past try 
(https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189)

  Tests ran 5 times and they all passed.

- With this new approach 
(https://github.com/apache/spark/pull/18463/commits/0a7589c09f53dfc2094497d8d3e59d6407569417)

  Tests ran 5 times and they all passed.

It looks like the cause is as below (see 
https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189):

```diff
+ exitCode <- 1
...
+   data <- parallel:::readChild(child)
+   if (is.raw(data)) {
+ if (unserialize(data) == exitCode) {
  ...
+ }
+   }

...

- parallel:::mcexit(0L)
+ parallel:::mcexit(0L, send = exitCode)
```

I think there are two possibilities:

 - `parallel:::mcexit(.. , send = exitCode)`

   https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/mcfork.html

   > It sends send to the master (unless NULL) and then shuts down the child 
process.

   However, it looks possible that the parent attempts to terminate the child right after getting our custom exit code. So, the child gets terminated between "send" and "shuts down", failing to exit properly.

 - A bug between `parallel:::mcexit(..., send = ...)` and 
`parallel:::readChild`.

**Proposal**:

To resolve this, I simply decided to avoid both possibilities with this new 
approach here 
(https://github.com/apache/spark/pull/18465/commits/9ff89a7859cb9f427fc774f33c3521c7d962b723).
To support this idea, I explain it below with some quotations from the documentation:

https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/mcfork.html

> `readChild` and `readChildren` return a raw vector with a "pid" attribute if 
> data were available, an integer vector of length one with the process ID if a 
> child terminated or `NULL` if the child no longer exists (no children at all 
> for `readChildren`).

`readChild` returns "an integer vector of length one with the process ID if a child terminated", so we can check whether the result is an `integer` equal to the selected process ID. I believe this makes sure that the children have exited.

In case children happen to send any data manually to the parent (which is why the suspicious part of the change (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189) was introduced), that data should be raw bytes and will be discarded; the loop then reads the next result and checks whether it is an `integer`.

## How was this patch tested?

Manual tests and Jenkins tests.

Author: hyukjinkwon 

Closes #18465 from HyukjinKwon/SPARK-21093-retry-1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08e0d033
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08e0d033
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08e0d033

Branch: refs/heads/master
Commit: 08e0d033b40946b4ef5741a7aa1e7ba0bd48c6fb
Parents: c3712b7
Author: hyukjinkwon 
Authored: Sat Jul 8 14:24:37 2017 -0700
Committer: Felix Cheung 
Committed: Sat Jul 8 14:24:37 2017 -0700

--
 R/pkg/inst/worker/daemon.R | 51 ++---
 1 file changed, 48 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/08e0d033/R/pkg/inst/worker/daemon.R
--
diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R
index 3a318b7..2e31dc5 100644
--- a/R/pkg/inst/worker/daemon.R
+++ b/R/pkg/inst/worker/daemon.R
@@ -30,8 +30,50 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT"))
 inputCon <- socketConnection(
 port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout)
 
+# Waits indefinitely for a socket connecion by default.
+selectTimeout <- NULL
+
 while (TRUE) {
-  ready <- socketSelect(list(inputCon))
+  ready <- socketSelect(list(inputCon), timeout = selectTimeout)
+
+  # Note that the children should be terminated in the parent. If each child 
terminates
+  # itself,
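
The readChild behaviour described above can be seen in a stand-alone sketch. This is not the daemon.R code: it is a conceptual, Unix-only example that relies on internal parallel::: APIs, so treat it as illustrative rather than supported usage.

```r
# Fork one child that exits immediately, then observe from the parent how
# parallel:::readChild() distinguishes "child sent data" (a raw vector) from
# "child terminated" (an integer process ID).
p <- parallel:::mcfork()
if (inherits(p, "masterProcess")) {
  # Child process: do nothing and exit.
  parallel:::mcexit(0L)
} else {
  repeat {
    data <- parallel:::readChild(p)
    if (is.null(data)) break             # child no longer exists
    if (is.integer(data)) {              # got the PID back => child terminated
      cat("child", data, "terminated\n")
      break
    }
    # is.raw(data): the child sent data over the pipe; ignore it and keep reading.
  }
}
```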

spark git commit: [MINOR][SPARKR] R API documentation for "coltypes" is confusing

2016-08-10 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 9dc3e602d -> 1203c8415


[MINOR][SPARKR] R API documentation for "coltypes" is confusing

## What changes were proposed in this pull request?

The R API documentation for "coltypes" is confusing; this was found while working on another ticket.

In the current version, http://spark.apache.org/docs/2.0.0/api/R/coltypes.html, the parameter "x" is documented twice (a duplicate), and the example is not very clear.

![current](https://cloud.githubusercontent.com/assets/3925641/17386808/effb98ce-59a2-11e6-9657-d477d258a80c.png)

![screen shot 2016-08-03 at 5 56 00 
pm](https://cloud.githubusercontent.com/assets/3925641/17386884/91831096-59a3-11e6-84af-39890b3d45d8.png)

## How was this patch tested?

Tested manually on a local machine. The screenshots are shown below:

![screen shot 2016-08-07 at 11 29 20 
pm](https://cloud.githubusercontent.com/assets/3925641/17471144/df36633c-5cf6-11e6-8238-4e32ead0e529.png)

![screen shot 2016-08-03 at 5 56 22 
pm](https://cloud.githubusercontent.com/assets/3925641/17386896/9d36cb26-59a3-11e6-9619-6dae29f7ab17.png)

Author: Xin Ren 

Closes #14489 from keypointt/rExample.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1203c841
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1203c841
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1203c841

Branch: refs/heads/master
Commit: 1203c8415cd11540f79a235e66a2f241ca6c71e4
Parents: 9dc3e60
Author: Xin Ren 
Authored: Wed Aug 10 00:49:06 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 10 00:49:06 2016 -0700

--
 R/pkg/R/DataFrame.R | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1203c841/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a473331..0ce4696 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -223,7 +223,7 @@ setMethod("showDF",
 #' sparkR.session()
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
-#' df
+#' show(df)
 #'}
 #' @note show(SparkDataFrame) since 1.4.0
 setMethod("show", "SparkDataFrame",
@@ -368,7 +368,7 @@ setMethod("colnames<-",
 #' @examples
 #'\dontrun{
 #' irisDF <- createDataFrame(iris)
-#' coltypes(irisDF)
+#' coltypes(irisDF) # get column types
 #'}
 #' @note coltypes since 1.6.0
 setMethod("coltypes",
@@ -411,7 +411,6 @@ setMethod("coltypes",
 #'
 #' Set the column types of a SparkDataFrame.
 #'
-#' @param x A SparkDataFrame
 #' @param value A character vector with the target column types for the given
 #'SparkDataFrame. Column types can be one of integer, numeric/double, 
character, logical, or NA
 #'to keep that column as-is.
@@ -424,8 +423,8 @@ setMethod("coltypes",
 #' sparkR.session()
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
-#' coltypes(df) <- c("character", "integer")
-#' coltypes(df) <- c(NA, "numeric")
+#' coltypes(df) <- c("character", "integer") # set column types
+#' coltypes(df) <- c(NA, "numeric") # set column types
 #'}
 #' @note coltypes<- since 1.6.0
 setMethod("coltypes<-",
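
For reference, a short illustrative snippet of the two forms being documented. It is not part of the commit; it uses the iris dataset instead of the JSON file from the roxygen example.

```r
library(SparkR)
sparkR.session(appName = "coltypes-example")

irisDF <- createDataFrame(iris)   # R column names are sanitized, e.g. Sepal_Length

coltypes(irisDF)                  # get column types, e.g. "numeric" ... "character"

# Set column types; NA keeps a column's current type unchanged.
coltypes(irisDF) <- c("character", rep(NA, 3), "character")
coltypes(irisDF)

sparkR.session.stop()
```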





spark git commit: [SPARK-16444][SPARKR] Isotonic Regression wrapper in SparkR

2016-08-17 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 4d0cc84af -> 363793f2b


[SPARK-16444][SPARKR] Isotonic Regression wrapper in SparkR

## What changes were proposed in this pull request?

Add an Isotonic Regression wrapper in SparkR.

Wrappers in R and Scala are added, along with unit tests and documentation.

## How was this patch tested?
Manually tested with sudo ./R/run-tests.sh


Author: wm...@hotmail.com 

Closes #14182 from wangmiao1981/isoR.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/363793f2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/363793f2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/363793f2

Branch: refs/heads/master
Commit: 363793f2bf57205f1d753d4705583aaf441849b5
Parents: 4d0cc84
Author: wm...@hotmail.com 
Authored: Wed Aug 17 06:15:04 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 17 06:15:04 2016 -0700

--
 R/pkg/NAMESPACE |   3 +-
 R/pkg/R/generics.R  |   4 +
 R/pkg/R/mllib.R | 118 ++
 R/pkg/inst/tests/testthat/test_mllib.R  |  32 +
 .../spark/ml/r/IsotonicRegressionWrapper.scala  | 119 +++
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   2 +
 6 files changed, 277 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/363793f2/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index aaab92f..1e23b23 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -24,7 +24,8 @@ exportMethods("glm",
   "spark.kmeans",
   "fitted",
   "spark.naiveBayes",
-  "spark.survreg")
+  "spark.survreg",
+  "spark.isoreg")
 
 # Job group lifecycle management methods
 export("setJobGroup",

http://git-wip-us.apache.org/repos/asf/spark/blob/363793f2/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 52ab730..ebacc11 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1304,6 +1304,10 @@ setGeneric("spark.naiveBayes", function(data, formula, 
...) { standardGeneric("s
 #' @export
 setGeneric("spark.survreg", function(data, formula, ...) { 
standardGeneric("spark.survreg") })
 
+#' @rdname spark.isoreg
+#' @export
+setGeneric("spark.isoreg", function(data, formula, ...) { 
standardGeneric("spark.isoreg") })
+
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml") })

http://git-wip-us.apache.org/repos/asf/spark/blob/363793f2/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 6f6e2fc..0dcc54d 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -53,6 +53,13 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = 
"jobj"))
 #' @note KMeansModel since 2.0.0
 setClass("KMeansModel", representation(jobj = "jobj"))
 
+#' S4 class that represents an IsotonicRegressionModel
+#'
+#' @param jobj a Java object reference to the backing Scala 
IsotonicRegressionModel
+#' @export
+#' @note IsotonicRegressionModel since 2.1.0
+setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
+
 #' Saves the MLlib model to the input path
 #'
 #' Saves the MLlib model to the input path. For more information, see the 
specific
@@ -62,6 +69,7 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #' @export
 #' @seealso \link{spark.glm}, \link{glm}
 #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
+#' @seealso \link{spark.isoreg}
 #' @seealso \link{read.ml}
 NULL
 
@@ -74,6 +82,7 @@ NULL
 #' @export
 #' @seealso \link{spark.glm}, \link{glm}
 #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
+#' @seealso \link{spark.isoreg}
 NULL
 
 #' Generalized Linear Models
@@ -299,6 +308,94 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
 return(list(apriori = apriori, tables = tables))
   })
 
+#' Isotonic Regression Model
+#'
+#' Fits an Isotonic Regression model against a Spark DataFrame, similarly to 
R's isoreg().
+#' Users can print, make predictions on the produced model and save the model 
to the input path.
+#'
+#' @param data SparkDataFrame for training
+#' @param formula A symbolic description of the model to be fitted. Currently 
only a few formula
+#'operators are supported, including '~', '.', ':', '+', and 
'-'.
+#' @param isotonic Whether the output sequence should be isoton
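
A minimal illustrative sketch of the new API (not part of the commit; the toy data and session name are made up):

```r
library(SparkR)
sparkR.session(appName = "isoreg-example")

# Toy data: label is roughly non-decreasing in feature.
df <- createDataFrame(data.frame(label   = c(1.0, 2.0, 2.0, 3.5, 6.0),
                                 feature = c(1.0, 2.0, 3.0, 4.0, 5.0)))

model <- spark.isoreg(df, label ~ feature, isotonic = TRUE)
summary(model)            # boundaries and predictions of the fitted step function
head(predict(model, df))

sparkR.session.stop()
```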

spark git commit: [SPARK-16447][ML][SPARKR] LDA wrapper in SparkR

2016-08-18 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 68f5087d2 -> b72bb62d4


[SPARK-16447][ML][SPARKR] LDA wrapper in SparkR

## What changes were proposed in this pull request?

Add LDA Wrapper in SparkR with the following interfaces:

- spark.lda(data, ...)

- spark.posterior(object, newData, ...)

- spark.perplexity(object, ...)

- summary(object)

- write.ml(object)

- read.ml(path)

## How was this patch tested?

Tested with SparkR unit tests.

Author: Xusen Yin 

Closes #14229 from yinxusen/SPARK-16447.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b72bb62d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b72bb62d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b72bb62d

Branch: refs/heads/master
Commit: b72bb62d421840f82d663c6b8e3922bd14383fbb
Parents: 68f5087
Author: Xusen Yin 
Authored: Thu Aug 18 05:33:52 2016 -0700
Committer: Felix Cheung 
Committed: Thu Aug 18 05:33:52 2016 -0700

--
 R/pkg/NAMESPACE |   3 +
 R/pkg/R/generics.R  |  14 ++
 R/pkg/R/mllib.R | 166 +-
 R/pkg/inst/tests/testthat/test_mllib.R  |  87 
 .../org/apache/spark/ml/clustering/LDA.scala|   4 +
 .../org/apache/spark/ml/r/LDAWrapper.scala  | 216 +++
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   2 +
 7 files changed, 490 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b72bb62d/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index c71eec5..4404cff 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -25,6 +25,9 @@ exportMethods("glm",
   "fitted",
   "spark.naiveBayes",
   "spark.survreg",
+  "spark.lda",
+  "spark.posterior",
+  "spark.perplexity",
   "spark.isoreg",
   "spark.gaussianMixture")
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b72bb62d/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 06bb25d..fe04bcf 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1304,6 +1304,19 @@ setGeneric("spark.naiveBayes", function(data, formula, 
...) { standardGeneric("s
 #' @export
 setGeneric("spark.survreg", function(data, formula, ...) { 
standardGeneric("spark.survreg") })
 
+#' @rdname spark.lda
+#' @param ... Additional parameters to tune LDA.
+#' @export
+setGeneric("spark.lda", function(data, ...) { standardGeneric("spark.lda") })
+
+#' @rdname spark.lda
+#' @export
+setGeneric("spark.posterior", function(object, newData) { 
standardGeneric("spark.posterior") })
+
+#' @rdname spark.lda
+#' @export
+setGeneric("spark.perplexity", function(object, data) { 
standardGeneric("spark.perplexity") })
+
 #' @rdname spark.isoreg
 #' @export
 setGeneric("spark.isoreg", function(data, formula, ...) { 
standardGeneric("spark.isoreg") })
@@ -1315,6 +1328,7 @@ setGeneric("spark.gaussianMixture",
  standardGeneric("spark.gaussianMixture")
})
 
+#' write.ml
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml") })

http://git-wip-us.apache.org/repos/asf/spark/blob/b72bb62d/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index db74046..b952741 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -39,6 +39,13 @@ setClass("GeneralizedLinearRegressionModel", 
representation(jobj = "jobj"))
 #' @note NaiveBayesModel since 2.0.0
 setClass("NaiveBayesModel", representation(jobj = "jobj"))
 
+#' S4 class that represents an LDAModel
+#'
+#' @param jobj a Java object reference to the backing Scala LDAWrapper
+#' @export
+#' @note LDAModel since 2.1.0
+setClass("LDAModel", representation(jobj = "jobj"))
+
 #' S4 class that represents a AFTSurvivalRegressionModel
 #'
 #' @param jobj a Java object reference to the backing Scala 
AFTSurvivalRegressionWrapper
@@ -75,7 +82,7 @@ setClass("GaussianMixtureModel", representation(jobj = 
"jobj"))
 #' @name write.ml
 #' @export
 #' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
-#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
+#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, 
\link{spark.survreg}, \link{spark.lda}
 #' @seealso \link{spark.isoreg}
 #' @seealso \link{read.ml}
 NULL
@@ -315,6 +322,94 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
 return(list(apriori = apriori, tables = tables))
   })
 
+# Returns posterior probabilities from a Latent Dir
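
A small illustrative sketch of the new interfaces. It is not part of the commit; the toy corpus, session name, and paths are made up, and it assumes the default "features" column, which per the wrapper may hold either character text or a vector column.

```r
library(SparkR)
sparkR.session(appName = "lda-example")

# Tiny text corpus: each row is one document in a column named "features".
corpus <- createDataFrame(data.frame(
  features = c("spark r lda topic model",
               "spark dataframe sql",
               "lda topic inference"),
  stringsAsFactors = FALSE))

model <- spark.lda(corpus, k = 2, maxIter = 10)
summary(model)                                # topics and their top terms
posterior <- spark.posterior(model, corpus)   # per-document topic distribution
head(posterior)
spark.perplexity(model, corpus)               # perplexity of the model on a dataset

# write.ml(model, "/tmp/lda-model"); model2 <- read.ml("/tmp/lda-model")

sparkR.session.stop()
```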

spark git commit: [SPARK-16508][SPARKR] Fix CRAN undocumented/duplicated arguments warnings.

2016-08-20 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 39f328ba3 -> 01401e965


[SPARK-16508][SPARKR] Fix CRAN undocumented/duplicated arguments warnings.

## What changes were proposed in this pull request?

This PR tries to fix all the remaining "undocumented/duplicated arguments" 
warnings given by CRAN-check.

The one warning left is the doc for R `stats::glm` exported in SparkR. To mute that warning, we would also have to document all arguments of that non-SparkR function.

Some previous conversation is in #14558.

## How was this patch tested?

R unit test and `check-cran.sh` script (with no-test).

Author: Junyang Qian 

Closes #14705 from junyangq/SPARK-16508-master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01401e96
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01401e96
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01401e96

Branch: refs/heads/master
Commit: 01401e965b58f7e8ab615764a452d7d18f1d4bf0
Parents: 39f328b
Author: Junyang Qian 
Authored: Sat Aug 20 06:59:23 2016 -0700
Committer: Felix Cheung 
Committed: Sat Aug 20 06:59:23 2016 -0700

--
 R/pkg/R/DataFrame.R  | 221 +++---
 R/pkg/R/SQLContext.R |  30 ---
 R/pkg/R/WindowSpec.R |  11 ++-
 R/pkg/R/column.R |  18 +++-
 R/pkg/R/functions.R  | 173 
 R/pkg/R/generics.R   |  62 ++---
 R/pkg/R/group.R  |   7 +-
 R/pkg/R/mllib.R  | 113 +---
 R/pkg/R/schema.R |   5 +-
 R/pkg/R/sparkR.R |  21 ++---
 R/pkg/R/stats.R  |  25 +++---
 11 files changed, 419 insertions(+), 267 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/01401e96/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 09be06d..540dc31 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -120,8 +120,9 @@ setMethod("schema",
 #'
 #' Print the logical and physical Catalyst plans to the console for debugging.
 #'
-#' @param x A SparkDataFrame
+#' @param x a SparkDataFrame.
 #' @param extended Logical. If extended is FALSE, explain() only prints the 
physical plan.
+#' @param ... further arguments to be passed to or from other methods.
 #' @family SparkDataFrame functions
 #' @aliases explain,SparkDataFrame-method
 #' @rdname explain
@@ -177,11 +178,13 @@ setMethod("isLocal",
 #'
 #' Print the first numRows rows of a SparkDataFrame
 #'
-#' @param x A SparkDataFrame
-#' @param numRows The number of rows to print. Defaults to 20.
-#' @param truncate Whether truncate long strings. If true, strings more than 
20 characters will be
-#'truncated. However, if set greater than zero, truncates strings longer 
than `truncate`
-#'characters and all cells will be aligned right.
+#' @param x a SparkDataFrame.
+#' @param numRows the number of rows to print. Defaults to 20.
+#' @param truncate whether truncate long strings. If \code{TRUE}, strings more 
than
+#' 20 characters will be truncated. However, if set greater 
than zero,
+#' truncates strings longer than `truncate` characters and all 
cells
+#' will be aligned right.
+#' @param ... further arguments to be passed to or from other methods.
 #' @family SparkDataFrame functions
 #' @aliases showDF,SparkDataFrame-method
 #' @rdname showDF
@@ -211,7 +214,7 @@ setMethod("showDF",
 #'
 #' Print the SparkDataFrame column names and types
 #'
-#' @param x A SparkDataFrame
+#' @param object a SparkDataFrame.
 #'
 #' @family SparkDataFrame functions
 #' @rdname show
@@ -262,11 +265,11 @@ setMethod("dtypes",
 })
   })
 
-#' Column names
+#' Column Names of SparkDataFrame
 #'
-#' Return all column names as a list
+#' Return all column names as a list.
 #'
-#' @param x A SparkDataFrame
+#' @param x a SparkDataFrame.
 #'
 #' @family SparkDataFrame functions
 #' @rdname columns
@@ -323,6 +326,8 @@ setMethod("colnames",
 columns(x)
   })
 
+#' @param value a character vector. Must have the same length as the number
+#'  of columns in the SparkDataFrame.
 #' @rdname columns
 #' @aliases colnames<-,SparkDataFrame-method
 #' @name colnames<-
@@ -514,9 +519,10 @@ setMethod("registerTempTable",
 #'
 #' Insert the contents of a SparkDataFrame into a table registered in the 
current SparkSession.
 #'
-#' @param x A SparkDataFrame
-#' @param tableName A character vector containing the name of the table
-#' @param overwrite A logical argument indicating whether or not to overwrite
+#' @param x a SparkDataFrame.
+#' @param tableName a character vector containing the name of the table.
+#' @param overwrite a logical argument indicating whether or not to overwrite.
+#' @param ... further arguments t

spark git commit: [SPARKR][EXAMPLE] change example APP name

2016-08-20 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 01401e965 -> 3e5fdeb3f


[SPARKR][EXAMPLE] change example APP name

## What changes were proposed in this pull request?

In the R SQL example, the appName is "MyApp", while in the Scala, Java, and Python examples it is "x Spark SQL basic example".

I made the R example consistent with the others.

## How was this patch tested?

Manual test.

Author: wm...@hotmail.com 

Closes #14703 from wangmiao1981/example.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e5fdeb3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e5fdeb3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e5fdeb3

Branch: refs/heads/master
Commit: 3e5fdeb3fb084cc9d25ce2f3f8cbf07a0aa2c573
Parents: 01401e9
Author: wm...@hotmail.com 
Authored: Sat Aug 20 07:00:51 2016 -0700
Committer: Felix Cheung 
Committed: Sat Aug 20 07:00:51 2016 -0700

--
 examples/src/main/r/RSparkSQLExample.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3e5fdeb3/examples/src/main/r/RSparkSQLExample.R
--
diff --git a/examples/src/main/r/RSparkSQLExample.R 
b/examples/src/main/r/RSparkSQLExample.R
index de489e1..4e0267a 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -18,7 +18,7 @@
 library(SparkR)
 
 # $example on:init_session$
-sparkR.session(appName = "MyApp", sparkConfig = list(spark.some.config.option 
= "some-value"))
+sparkR.session(appName = "R Spark SQL basic example", sparkConfig = 
list(spark.some.config.option = "some-value"))
 # $example off:init_session$
 
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-17173][SPARKR] R MLlib refactor, cleanup, reformat, fix deprecation in test

2016-08-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 342278c09 -> 0583ecda1


[SPARK-17173][SPARKR] R MLlib refactor, cleanup, reformat, fix deprecation in 
test

## What changes were proposed in this pull request?

refactor, cleanup, reformat, fix deprecation in test
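
Among other things, the diff below centralizes the save and predict logic of the model wrappers in two shared helpers, `write_internal` and `predict_internal`. A minimal sketch of how a wrapper method inside the package can delegate to them; the `FooModel` class is hypothetical and used only to illustrate the pattern:

```
# Hypothetical FooModel, shown only to illustrate the delegation pattern
# used by the real wrappers in mllib.R (assumes it runs inside the package,
# where write_internal/predict_internal and the write.ml generic exist).
setClass("FooModel", representation(jobj = "jobj"))

setMethod("write.ml", signature(object = "FooModel", path = "character"),
          function(object, path, overwrite = FALSE) {
            # write_internal handles the optional overwrite() call and save()
            write_internal(object, path, overwrite)
          })

setMethod("predict", signature(object = "FooModel"),
          function(object, newData) {
            # predict_internal calls transform() on the backing Scala wrapper
            predict_internal(object, newData)
          })
```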

## How was this patch tested?

unit tests, manual tests

Author: Felix Cheung 

Closes #14735 from felixcheung/rmllibutil.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0583ecda
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0583ecda
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0583ecda

Branch: refs/heads/master
Commit: 0583ecda1b63a7e3f126c3276059e4f99548a741
Parents: 342278c
Author: Felix Cheung 
Authored: Mon Aug 22 12:27:33 2016 -0700
Committer: Felix Cheung 
Committed: Mon Aug 22 12:27:33 2016 -0700

--
 R/pkg/R/mllib.R| 205 
 R/pkg/inst/tests/testthat/test_mllib.R |  10 +-
 2 files changed, 98 insertions(+), 117 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0583ecda/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 9a53c80..b36fbce 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -88,9 +88,9 @@ setClass("ALSModel", representation(jobj = "jobj"))
 #' @rdname write.ml
 #' @name write.ml
 #' @export
-#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
-#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.lda}, 
\link{spark.naiveBayes}
-#' @seealso \link{spark.survreg}, \link{spark.isoreg}
+#' @seealso \link{spark.glm}, \link{glm},
+#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, 
\link{spark.isoreg}, \link{spark.kmeans},
+#' @seealso \link{spark.lda}, \link{spark.naiveBayes}, \link{spark.survreg},
 #' @seealso \link{read.ml}
 NULL
 
@@ -101,11 +101,22 @@ NULL
 #' @rdname predict
 #' @name predict
 #' @export
-#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture}
-#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.naiveBayes}, 
\link{spark.survreg}
-#' @seealso \link{spark.isoreg}
+#' @seealso \link{spark.glm}, \link{glm},
+#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, 
\link{spark.isoreg}, \link{spark.kmeans},
+#' @seealso \link{spark.naiveBayes}, \link{spark.survreg},
 NULL
 
+write_internal <- function(object, path, overwrite = FALSE) {
+  writer <- callJMethod(object@jobj, "write")
+  if (overwrite) {
+writer <- callJMethod(writer, "overwrite")
+  }
+  invisible(callJMethod(writer, "save", path))
+}
+
+predict_internal <- function(object, newData) {
+  dataFrame(callJMethod(object@jobj, "transform", newData@sdf))
+}
 
 #' Generalized Linear Models
 #'
@@ -173,7 +184,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
 "fit", formula, data@sdf, family$family, 
family$link,
 tol, as.integer(maxIter), 
as.character(weightCol))
-return(new("GeneralizedLinearRegressionModel", jobj = jobj))
+new("GeneralizedLinearRegressionModel", jobj = jobj)
   })
 
 #' Generalized Linear Models (R-compliant)
@@ -219,7 +230,7 @@ setMethod("glm", signature(formula = "formula", family = 
"ANY", data = "SparkDat
 #' @export
 #' @note summary(GeneralizedLinearRegressionModel) since 2.0.0
 setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
-  function(object, ...) {
+  function(object) {
 jobj <- object@jobj
 is.loaded <- callJMethod(jobj, "isLoaded")
 features <- callJMethod(jobj, "rFeatures")
@@ -245,7 +256,7 @@ setMethod("summary", signature(object = 
"GeneralizedLinearRegressionModel"),
 deviance = deviance, df.null = df.null, df.residual = 
df.residual,
 aic = aic, iter = iter, family = family, is.loaded = 
is.loaded)
 class(ans) <- "summary.GeneralizedLinearRegressionModel"
-return(ans)
+ans
   })
 
 #  Prints the summary of GeneralizedLinearRegressionModel
@@ -275,8 +286,7 @@ print.summary.GeneralizedLinearRegressionModel <- 
function(x, ...) {
 " on", format(unlist(x[c("df.null

spark git commit: [SPARK-16508][SPARKR] doc updates and more CRAN check fixes

2016-08-22 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 84770b59f -> 71afeeea4


[SPARK-16508][SPARKR] doc updates and more CRAN check fixes

## What changes were proposed in this pull request?

replace ``` ` ``` in code doc with `\code{thing}`
remove added `...` for drop(DataFrame)
fix remaining CRAN check warnings

## How was this patch tested?

create doc with knitr

junyangq

Author: Felix Cheung 

Closes #14734 from felixcheung/rdoccleanup.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71afeeea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71afeeea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71afeeea

Branch: refs/heads/master
Commit: 71afeeea4ec8e67edc95b5d504c557c88a2598b9
Parents: 84770b5
Author: Felix Cheung 
Authored: Mon Aug 22 15:53:10 2016 -0700
Committer: Felix Cheung 
Committed: Mon Aug 22 15:53:10 2016 -0700

--
 R/pkg/NAMESPACE  |  6 +++-
 R/pkg/R/DataFrame.R  | 71 +++
 R/pkg/R/RDD.R| 10 +++
 R/pkg/R/SQLContext.R | 30 ++--
 R/pkg/R/WindowSpec.R | 23 +++
 R/pkg/R/column.R |  2 +-
 R/pkg/R/functions.R  | 36 
 R/pkg/R/generics.R   | 15 +-
 R/pkg/R/group.R  |  1 +
 R/pkg/R/mllib.R  | 19 +++--
 R/pkg/R/pairRDD.R|  6 ++--
 R/pkg/R/stats.R  | 14 +-
 12 files changed, 119 insertions(+), 114 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/71afeeea/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index e1b87b2..7090576 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -1,5 +1,9 @@
 # Imports from base R
-importFrom(methods, setGeneric, setMethod, setOldClass)
+# Do not include stats:: "rpois", "runif" - causes error at runtime
+importFrom("methods", "setGeneric", "setMethod", "setOldClass")
+importFrom("methods", "is", "new", "signature", "show")
+importFrom("stats", "gaussian", "setNames")
+importFrom("utils", "download.file", "packageVersion", "untar")
 
 # Disable native libraries till we figure out how to package it
 # See SPARKR-7839

http://git-wip-us.apache.org/repos/asf/spark/blob/71afeeea/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 540dc31..52a6628 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -150,7 +150,7 @@ setMethod("explain",
 
 #' isLocal
 #'
-#' Returns True if the `collect` and `take` methods can be run locally
+#' Returns True if the \code{collect} and \code{take} methods can be run 
locally
 #' (without any Spark executors).
 #'
 #' @param x A SparkDataFrame
@@ -182,7 +182,7 @@ setMethod("isLocal",
 #' @param numRows the number of rows to print. Defaults to 20.
 #' @param truncate whether truncate long strings. If \code{TRUE}, strings more 
than
 #' 20 characters will be truncated. However, if set greater 
than zero,
-#' truncates strings longer than `truncate` characters and all 
cells
+#' truncates strings longer than \code{truncate} characters 
and all cells
 #' will be aligned right.
 #' @param ... further arguments to be passed to or from other methods.
 #' @family SparkDataFrame functions
@@ -642,10 +642,10 @@ setMethod("unpersist",
 #' The following options for repartition are possible:
 #' \itemize{
 #'  \item{1.} {Return a new SparkDataFrame partitioned by
-#'  the given columns into `numPartitions`.}
-#'  \item{2.} {Return a new SparkDataFrame that has exactly `numPartitions`.}
+#'  the given columns into \code{numPartitions}.}
+#'  \item{2.} {Return a new SparkDataFrame that has exactly 
\code{numPartitions}.}
 #'  \item{3.} {Return a new SparkDataFrame partitioned by the given column(s),
-#'  using `spark.sql.shuffle.partitions` as number of 
partitions.}
+#'  using \code{spark.sql.shuffle.partitions} as number of 
partitions.}
 #'}
 #' @param x a SparkDataFrame.
 #' @param numPartitions the number of partitions to use.
@@ -1132,9 +1132,8 @@ setMethod("take",
 
 #' Head
 #'
-#' Return the first NUM rows of a SparkDataFrame as a R data.frame. If NUM is 
NULL,
-#' then head() returns the first 6 rows in keeping with the current data.frame
-#&

spark git commit: [SPARKR][MINOR] Remove reference link for common Windows environment variables

2016-08-23 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 9afdfc94f -> 8fd63e808


[SPARKR][MINOR] Remove reference link for common Windows environment variables

## What changes were proposed in this pull request?

The PR removes the reference link in the doc for environment variables for common Windows folders. The CRAN check gave code 503 (service unavailable) on the original link.

## How was this patch tested?

Manual check.

Author: Junyang Qian 

Closes #14767 from junyangq/SPARKR-RemoveLink.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8fd63e80
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8fd63e80
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8fd63e80

Branch: refs/heads/master
Commit: 8fd63e808e15c8a7e78fef847183c86f332daa91
Parents: 9afdfc9
Author: Junyang Qian 
Authored: Tue Aug 23 11:22:32 2016 -0700
Committer: Felix Cheung 
Committed: Tue Aug 23 11:22:32 2016 -0700

--
 R/pkg/R/install.R | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8fd63e80/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index ff81e86..c6ed88e 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -50,9 +50,7 @@
 #' \itemize{
 #'   \item Mac OS X: \file{~/Library/Caches/spark}
 #'   \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise 
\file{~/.cache/spark}
-#'   \item Windows: 
\file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. See
-#' 
\href{https://www.microsoft.com/security/portal/mmpc/shared/variables.aspx}{
-#' Windows Common Folder Variables} about 
\%LOCALAPPDATA\%
+#'   \item Windows: 
\file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}.
 #' }
 #' @param overwrite If \code{TRUE}, download and overwrite the existing tar 
file in localDir
 #'  and force re-install Spark (in case the local directory or 
file is corrupted)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARKR][MINOR] Remove reference link for common Windows environment variables

2016-08-23 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 811a2cef0 -> cc4018996


[SPARKR][MINOR] Remove reference link for common Windows environment variables

## What changes were proposed in this pull request?

The PR removes the reference link in the doc for environment variables for common Windows folders. The CRAN check gave code 503 (service unavailable) on the original link.

## How was this patch tested?

Manual check.

Author: Junyang Qian 

Closes #14767 from junyangq/SPARKR-RemoveLink.

(cherry picked from commit 8fd63e808e15c8a7e78fef847183c86f332daa91)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc401899
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc401899
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc401899

Branch: refs/heads/branch-2.0
Commit: cc4018996740b3a68d4a557615c59c67b8996ebb
Parents: 811a2ce
Author: Junyang Qian 
Authored: Tue Aug 23 11:22:32 2016 -0700
Committer: Felix Cheung 
Committed: Tue Aug 23 11:22:46 2016 -0700

--
 R/pkg/R/install.R | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/cc401899/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index ff81e86..c6ed88e 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -50,9 +50,7 @@
 #' \itemize{
 #'   \item Mac OS X: \file{~/Library/Caches/spark}
 #'   \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise 
\file{~/.cache/spark}
-#'   \item Windows: 
\file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. See
-#' 
\href{https://www.microsoft.com/security/portal/mmpc/shared/variables.aspx}{
-#' Windows Common Folder Variables} about 
\%LOCALAPPDATA\%
+#'   \item Windows: 
\file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}.
 #' }
 #' @param overwrite If \code{TRUE}, download and overwrite the existing tar 
file in localDir
 #'  and force re-install Spark (in case the local directory or 
file is corrupted)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARKR][MINOR] Fix doc for show method

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 45b786aca -> d2932a0e9


[SPARKR][MINOR] Fix doc for show method

## What changes were proposed in this pull request?

The original doc of `show` put methods for multiple classes together but the 
text only talks about `SparkDataFrame`. This PR tries to fix this problem.
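
For reference, a quick sketch of what the shared `show` doc now covers (run in an active SparkR session; the printed text is a short class/column summary):

```
df <- createDataFrame(mtcars)
show(df)                        # SparkDataFrame: prints column names and types
show(df$mpg)                    # Column
show(groupBy(df, "cyl"))        # GroupedData
show(windowPartitionBy("cyl"))  # WindowSpec
```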

## How was this patch tested?

Manual test.

Author: Junyang Qian 

Closes #14776 from junyangq/SPARK-FixShowDoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2932a0e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2932a0e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2932a0e

Branch: refs/heads/master
Commit: d2932a0e987132c694ed59515b7c77adaad052e6
Parents: 45b786a
Author: Junyang Qian 
Authored: Wed Aug 24 10:40:09 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 10:40:09 2016 -0700

--
 R/pkg/R/DataFrame.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d2932a0e/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 52a6628..e12b58e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -212,9 +212,9 @@ setMethod("showDF",
 
 #' show
 #'
-#' Print the SparkDataFrame column names and types
+#' Print class and type information of a Spark object.
 #'
-#' @param object a SparkDataFrame.
+#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, 
WindowSpec.
 #'
 #' @family SparkDataFrame functions
 #' @rdname show


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARKR][MINOR] Fix doc for show method

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 33d79b587 -> 29091d7cd


[SPARKR][MINOR] Fix doc for show method

## What changes were proposed in this pull request?

The original doc of `show` put methods for multiple classes together but the 
text only talks about `SparkDataFrame`. This PR tries to fix this problem.

## How was this patch tested?

Manual test.

Author: Junyang Qian 

Closes #14776 from junyangq/SPARK-FixShowDoc.

(cherry picked from commit d2932a0e987132c694ed59515b7c77adaad052e6)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29091d7c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29091d7c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29091d7c

Branch: refs/heads/branch-2.0
Commit: 29091d7cd60c20bf019dc9c1625a22e80ea50928
Parents: 33d79b5
Author: Junyang Qian 
Authored: Wed Aug 24 10:40:09 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 10:40:26 2016 -0700

--
 R/pkg/R/DataFrame.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/29091d7c/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index f8a05c6..ab45d2c 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -205,9 +205,9 @@ setMethod("showDF",
 
 #' show
 #'
-#' Print the SparkDataFrame column names and types
+#' Print class and type information of a Spark object.
 #'
-#' @param object a SparkDataFrame.
+#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, 
WindowSpec.
 #'
 #' @family SparkDataFrame functions
 #' @rdname show


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-16445][MLLIB][SPARKR] Multilayer Perceptron Classifier wrapper in SparkR

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master d2932a0e9 -> 2fbdb6063


[SPARK-16445][MLLIB][SPARKR] Multilayer Perceptron Classifier wrapper in SparkR

https://issues.apache.org/jira/browse/SPARK-16445

## What changes were proposed in this pull request?

Create Multilayer Perceptron Classifier wrapper in SparkR
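
For reference, a hedged usage sketch of the new wrapper (the data path and the `layers`/`maxIter` arguments are assumptions based on the wrapper's documentation; check the generated Rd for the exact signature):

```
# Fit an MLP classifier on a libsvm-format SparkDataFrame, then reuse the
# usual summary/predict entry points exposed by the wrapper.
df <- read.df("data/mllib/sample_multiclass_classification_data.txt",
              source = "libsvm")
model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 100)
summary(model)
preds <- predict(model, df)
head(preds)
```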

## How was this patch tested?

Tested manually on local machine

Author: Xin Ren 

Closes #14447 from keypointt/SPARK-16445.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2fbdb606
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2fbdb606
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2fbdb606

Branch: refs/heads/master
Commit: 2fbdb606392631b1dff88ec86f388cc2559c28f5
Parents: d2932a0
Author: Xin Ren 
Authored: Wed Aug 24 11:18:10 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 11:18:10 2016 -0700

--
 R/pkg/NAMESPACE |   1 +
 R/pkg/R/generics.R  |   4 +
 R/pkg/R/mllib.R | 125 -
 R/pkg/inst/tests/testthat/test_mllib.R  |  32 +
 .../MultilayerPerceptronClassifierWrapper.scala | 134 +++
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   2 +
 6 files changed, 293 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2fbdb606/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 7090576..ad587a6 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -27,6 +27,7 @@ exportMethods("glm",
   "summary",
   "spark.kmeans",
   "fitted",
+  "spark.mlp",
   "spark.naiveBayes",
   "spark.survreg",
   "spark.lda",

http://git-wip-us.apache.org/repos/asf/spark/blob/2fbdb606/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 4e6..7e626be 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1330,6 +1330,10 @@ setGeneric("spark.kmeans", function(data, formula, ...) 
{ standardGeneric("spark
 #' @export
 setGeneric("fitted")
 
+#' @rdname spark.mlp
+#' @export
+setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") })
+
 #' @rdname spark.naiveBayes
 #' @export
 setGeneric("spark.naiveBayes", function(data, formula, ...) { 
standardGeneric("spark.naiveBayes") })

http://git-wip-us.apache.org/repos/asf/spark/blob/2fbdb606/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index a40310d..a670600 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -60,6 +60,13 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = 
"jobj"))
 #' @note KMeansModel since 2.0.0
 setClass("KMeansModel", representation(jobj = "jobj"))
 
+#' S4 class that represents a MultilayerPerceptronClassificationModel
+#'
+#' @param jobj a Java object reference to the backing Scala 
MultilayerPerceptronClassifierWrapper
+#' @export
+#' @note MultilayerPerceptronClassificationModel since 2.1.0
+setClass("MultilayerPerceptronClassificationModel", representation(jobj = 
"jobj"))
+
 #' S4 class that represents an IsotonicRegressionModel
 #'
 #' @param jobj a Java object reference to the backing Scala 
IsotonicRegressionModel
@@ -90,7 +97,7 @@ setClass("ALSModel", representation(jobj = "jobj"))
 #' @export
 #' @seealso \link{spark.glm}, \link{glm},
 #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, 
\link{spark.isoreg}, \link{spark.kmeans},
-#' @seealso \link{spark.lda}, \link{spark.naiveBayes}, \link{spark.survreg},
+#' @seealso \link{spark.lda}, \link{spark.mlp}, \link{spark.naiveBayes}, 
\link{spark.survreg}
 #' @seealso \link{read.ml}
 NULL
 
@@ -103,7 +110,7 @@ NULL
 #' @export
 #' @seealso \link{spark.glm}, \link{glm},
 #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, 
\link{spark.isoreg}, \link{spark.kmeans},
-#' @seealso \link{spark.naiveBayes}, \link{spark.survreg},
+#' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg}
 NULL
 
 write_internal <- function(object, path, overwrite = FALSE) {
@@ -631,6 +638,95 @@ setMethod("predict", signature(object = "KMeansModel"),
 predict_internal(object, newData)
   })
 
+#' Multilayer Perceptron Classification Model
+#'
+#' \code{spark.mlp} fits a multi-layer perceptron neural network model against 
a SparkDataFrame.
+#' Users can call \code{summary} to print a summary of the fitted model, 
\code{predict} to make
+#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load 
fitted models.
+#' Only categorical data is supported.
+#' For more det

spark git commit: [MINOR][SPARKR] fix R MLlib parameter documentation

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 29952ed09 -> 945c04bcd


[MINOR][SPARKR] fix R MLlib parameter documentation

## What changes were proposed in this pull request?

Fixed several misplaced param tags - they should be on the spark.* method generics.

## How was this patch tested?

run knitr
junyangq

Author: Felix Cheung 

Closes #14792 from felixcheung/rdocmllib.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/945c04bc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/945c04bc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/945c04bc

Branch: refs/heads/master
Commit: 945c04bcd439e0624232c040df529f12bcc05e13
Parents: 29952ed
Author: Felix Cheung 
Authored: Wed Aug 24 15:59:09 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 15:59:09 2016 -0700

--
 R/pkg/R/mllib.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/945c04bc/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index a670600..dfc5a1c 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -444,6 +444,7 @@ setMethod("write.ml", signature(object = "LDAModel", path = 
"character"),
 #' @param featureIndex The index of the feature if \code{featuresCol} is a 
vector column
 #' (default: 0), no effect otherwise
 #' @param weightCol The weight column name.
+#' @param ... additional arguments passed to the method.
 #' @return \code{spark.isoreg} returns a fitted Isotonic Regression model
 #' @rdname spark.isoreg
 #' @aliases spark.isoreg,SparkDataFrame,formula-method
@@ -504,7 +505,6 @@ setMethod("predict", signature(object = 
"IsotonicRegressionModel"),
 
 #  Get the summary of an IsotonicRegressionModel model
 
-#' @param ... Other optional arguments to summary of an IsotonicRegressionModel
 #' @return \code{summary} returns the model's boundaries and prediction as 
lists
 #' @rdname spark.isoreg
 #' @aliases summary,IsotonicRegressionModel-method
@@ -1074,6 +1074,7 @@ setMethod("predict", signature(object = 
"AFTSurvivalRegressionModel"),
 #' @param k number of independent Gaussians in the mixture model.
 #' @param maxIter maximum iteration number.
 #' @param tol the convergence tolerance.
+#' @param ... additional arguments passed to the method.
 #' @aliases spark.gaussianMixture,SparkDataFrame,formula-method
 #' @return \code{spark.gaussianMixture} returns a fitted multivariate gaussian 
mixture model.
 #' @rdname spark.gaussianMixture
@@ -1117,7 +1118,6 @@ setMethod("spark.gaussianMixture", signature(data = 
"SparkDataFrame", formula =
 #  Get the summary of a multivariate gaussian mixture model
 
 #' @param object a fitted gaussian mixture model.
-#' @param ... currently not used argument(s) passed to the method.
 #' @return \code{summary} returns the model's lambda, mu, sigma and posterior.
 #' @aliases spark.gaussianMixture,SparkDataFrame,formula-method
 #' @rdname spark.gaussianMixture


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARKR][MINOR] Add more examples to window function docs

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 945c04bcd -> 18708f76c


[SPARKR][MINOR] Add more examples to window function docs

## What changes were proposed in this pull request?

This PR adds more examples to window function docs to make them more accessible 
to the users.

It also fixes default value issues for `lag` and `lead`.
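
To illustrate the default-value fix, a short sketch (run in an active SparkR session):

```
df <- createDataFrame(mtcars)
ws <- orderBy(windowPartitionBy("am"), "hp")

# After the fix an omitted offset behaves as offset = 1, so these match:
head(select(df, over(lag(df$mpg), ws), df$mpg))
head(select(df, over(lag(df$mpg, 1), ws), df$mpg))
```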

## How was this patch tested?

Manual test, R unit test.

Author: Junyang Qian 

Closes #14779 from junyangq/SPARKR-FixWindowFunctionDocs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18708f76
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18708f76
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18708f76

Branch: refs/heads/master
Commit: 18708f76c366c6e01b5865981666e40d8642ac20
Parents: 945c04b
Author: Junyang Qian 
Authored: Wed Aug 24 16:00:04 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 16:00:04 2016 -0700

--
 R/pkg/R/WindowSpec.R | 12 
 R/pkg/R/functions.R  | 78 ---
 2 files changed, 72 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/18708f76/R/pkg/R/WindowSpec.R
--
diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R
index ddd2ef2..4ac83c2 100644
--- a/R/pkg/R/WindowSpec.R
+++ b/R/pkg/R/WindowSpec.R
@@ -203,6 +203,18 @@ setMethod("rangeBetween",
 #' @aliases over,Column,WindowSpec-method
 #' @family colum_func
 #' @export
+#' @examples \dontrun{
+#'   df <- createDataFrame(mtcars)
+#'
+#'   # Partition by am (transmission) and order by hp (horsepower)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#'   # Rank on hp within each partition
+#'   out <- select(df, over(rank(), ws), df$hp, df$am)
+#'
+#'   # Lag mpg values by 1 row on the partition-and-ordered table
+#'   out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note over since 2.0.0
 setMethod("over",
   signature(x = "Column", window = "WindowSpec"),

http://git-wip-us.apache.org/repos/asf/spark/blob/18708f76/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index f042add..dbf8dd8 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3121,9 +3121,9 @@ setMethod("ifelse",
 #' @aliases cume_dist,missing-method
 #' @export
 #' @examples \dontrun{
-#'   df <- createDataFrame(iris)
-#'   ws <- orderBy(windowPartitionBy("Species"), "Sepal_Length")
-#'   out <- select(df, over(cume_dist(), ws), df$Sepal_Length, df$Species)
+#'   df <- createDataFrame(mtcars)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'   out <- select(df, over(cume_dist(), ws), df$hp, df$am)
 #' }
 #' @note cume_dist since 1.6.0
 setMethod("cume_dist",
@@ -3148,7 +3148,11 @@ setMethod("cume_dist",
 #' @family window_funcs
 #' @aliases dense_rank,missing-method
 #' @export
-#' @examples \dontrun{dense_rank()}
+#' @examples \dontrun{
+#'   df <- createDataFrame(mtcars)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'   out <- select(df, over(dense_rank(), ws), df$hp, df$am)
+#' }
 #' @note dense_rank since 1.6.0
 setMethod("dense_rank",
   signature("missing"),
@@ -3168,18 +3172,26 @@ setMethod("dense_rank",
 #' @param x the column as a character string or a Column to compute on.
 #' @param offset the number of rows back from the current row from which to 
obtain a value.
 #'   If not specified, the default is 1.
-#' @param defaultValue default to use when the offset row does not exist.
+#' @param defaultValue (optional) default to use when the offset row does not 
exist.
 #' @param ... further arguments to be passed to or from other methods.
 #' @rdname lag
 #' @name lag
 #' @aliases lag,characterOrColumn-method
 #' @family window_funcs
 #' @export
-#' @examples \dontrun{lag(df$c)}
+#' @examples \dontrun{
+#'   df <- createDataFrame(mtcars)
+#'
+#'   # Partition by am (transmission) and order by hp (horsepower)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#'   # Lag mpg values by 1 row on the partition-and-ordered table
+#'   out <- select(df, over(lag(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note lag since 1.6.0
 setMethod("lag",
   signature(x = "characterOrColumn"),
-  function(x, offset, defaultValue = NULL) {
+  function(x, offset = 1, defaultValue = NULL) {
 col <- if (class(x) == "Column") {
   x@jc
 } else {
@@ -3194,25 +3206,35 @@ setMethod("lag",
 #' lead
 #'
 #' Window function: returns the value that is \code{offset} rows after the 
current row, and
-#' NULL if there is less than \code{offset} rows after the current row. For 
example,
-#' an \code{offset} of one will return the nex

spark git commit: [SPARKR][MINOR] Add installation message for remote master mode and improve other messages

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 43273377a -> 9f363a690


[SPARKR][MINOR] Add installation message for remote master mode and improve 
other messages

## What changes were proposed in this pull request?

This PR gives an informative message to users when they try to connect to a remote master but don't have the Spark package on their local machine.

As a clarification, for now, automatic installation will only happen if they start SparkR in the R console (rather than from sparkr-shell) and connect to a local master. In remote master mode, a local Spark package is still needed, but we do not trigger the install.spark function because the versions have to match those on the cluster, which involves more user input. Instead, we try to provide a detailed message that may help the users.

Some of the other messages have also been slightly changed.
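
In other words (a sketch; the master URL is a placeholder), users in remote master mode can install a matching package explicitly before starting the session, since the automatic install is only triggered for local masters:

```
library(SparkR)

# Install a local Spark distribution first; the Hadoop version should match
# the cluster's build (install.spark's arguments are shown in the diff below).
install.spark(hadoopVersion = "2.7")

# Then connect to the remote master.
sparkR.session(master = "spark://host:7077")
```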

## How was this patch tested?

Manual test.

Author: Junyang Qian 

Closes #14761 from junyangq/SPARK-16579-V1.

(cherry picked from commit 3a60be4b15a5ab9b6e0c4839df99dac7738aa7fe)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f363a69
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f363a69
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f363a69

Branch: refs/heads/branch-2.0
Commit: 9f363a690102f04a2a486853c1b89134455518bc
Parents: 4327337
Author: Junyang Qian 
Authored: Wed Aug 24 16:04:14 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 16:04:26 2016 -0700

--
 R/pkg/R/install.R | 64 ++
 R/pkg/R/sparkR.R  | 51 ++--
 R/pkg/R/utils.R   |  4 ++--
 3 files changed, 80 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9f363a69/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index c6ed88e..69b0a52 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -70,9 +70,9 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = 
NULL,
   localDir = NULL, overwrite = FALSE) {
   version <- paste0("spark-", packageVersion("SparkR"))
   hadoopVersion <- tolower(hadoopVersion)
-  hadoopVersionName <- hadoop_version_name(hadoopVersion)
+  hadoopVersionName <- hadoopVersionName(hadoopVersion)
   packageName <- paste(version, "bin", hadoopVersionName, sep = "-")
-  localDir <- ifelse(is.null(localDir), spark_cache_path(),
+  localDir <- ifelse(is.null(localDir), sparkCachePath(),
  normalizePath(localDir, mustWork = FALSE))
 
   if (is.na(file.info(localDir)$isdir)) {
@@ -88,12 +88,14 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl 
= NULL,
 
   # can use dir.exists(packageLocalDir) under R 3.2.0 or later
   if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) {
-fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s"
+fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s"
 msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free 
build", hadoopVersion),
packageLocalDir)
 message(msg)
 Sys.setenv(SPARK_HOME = packageLocalDir)
 return(invisible(packageLocalDir))
+  } else {
+message("Spark not found in the cache directory. Installation will start.")
   }
 
   packageLocalPath <- paste0(packageLocalDir, ".tgz")
@@ -102,7 +104,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl 
= NULL,
   if (tarExists && !overwrite) {
 message("tar file found.")
   } else {
-robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath)
+robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath)
   }
 
   message(sprintf("Installing to %s", localDir))
@@ -116,33 +118,37 @@ install.spark <- function(hadoopVersion = "2.7", 
mirrorUrl = NULL,
   invisible(packageLocalDir)
 }
 
-robust_download_tar <- function(mirrorUrl, version, hadoopVersion, 
packageName, packageLocalPath) {
+robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath) {
   # step 1: use user-provided url
   if (!is.null(mirrorUrl)) {
 msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl)
 message(msg)
-success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
+success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
-if (success) return()
+if (success) {
+  return()
+} else {
+  message(paste0("Unable to download from mirrorUrl: ", mirrorUrl))
+}
   } else {
-message("Mirror site not provided.")
+message("MirrorUrl not provided.")
   }
 
   # step

spark git commit: [SPARKR][MINOR] Add installation message for remote master mode and improve other messages

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 18708f76c -> 3a60be4b1


[SPARKR][MINOR] Add installation message for remote master mode and improve 
other messages

## What changes were proposed in this pull request?

This PR gives an informative message to users when they try to connect to a remote master but don't have the Spark package on their local machine.

As a clarification, for now, automatic installation will only happen if they start SparkR in the R console (rather than from sparkr-shell) and connect to a local master. In remote master mode, a local Spark package is still needed, but we do not trigger the install.spark function because the versions have to match those on the cluster, which involves more user input. Instead, we try to provide a detailed message that may help the users.

Some of the other messages have also been slightly changed.

## How was this patch tested?

Manual test.

Author: Junyang Qian 

Closes #14761 from junyangq/SPARK-16579-V1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3a60be4b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3a60be4b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3a60be4b

Branch: refs/heads/master
Commit: 3a60be4b15a5ab9b6e0c4839df99dac7738aa7fe
Parents: 18708f7
Author: Junyang Qian 
Authored: Wed Aug 24 16:04:14 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 16:04:14 2016 -0700

--
 R/pkg/R/install.R | 64 ++
 R/pkg/R/sparkR.R  | 51 ++--
 R/pkg/R/utils.R   |  4 ++--
 3 files changed, 80 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3a60be4b/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index c6ed88e..69b0a52 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -70,9 +70,9 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = 
NULL,
   localDir = NULL, overwrite = FALSE) {
   version <- paste0("spark-", packageVersion("SparkR"))
   hadoopVersion <- tolower(hadoopVersion)
-  hadoopVersionName <- hadoop_version_name(hadoopVersion)
+  hadoopVersionName <- hadoopVersionName(hadoopVersion)
   packageName <- paste(version, "bin", hadoopVersionName, sep = "-")
-  localDir <- ifelse(is.null(localDir), spark_cache_path(),
+  localDir <- ifelse(is.null(localDir), sparkCachePath(),
  normalizePath(localDir, mustWork = FALSE))
 
   if (is.na(file.info(localDir)$isdir)) {
@@ -88,12 +88,14 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl 
= NULL,
 
   # can use dir.exists(packageLocalDir) under R 3.2.0 or later
   if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) {
-fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s"
+fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s"
 msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free 
build", hadoopVersion),
packageLocalDir)
 message(msg)
 Sys.setenv(SPARK_HOME = packageLocalDir)
 return(invisible(packageLocalDir))
+  } else {
+message("Spark not found in the cache directory. Installation will start.")
   }
 
   packageLocalPath <- paste0(packageLocalDir, ".tgz")
@@ -102,7 +104,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl 
= NULL,
   if (tarExists && !overwrite) {
 message("tar file found.")
   } else {
-robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath)
+robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath)
   }
 
   message(sprintf("Installing to %s", localDir))
@@ -116,33 +118,37 @@ install.spark <- function(hadoopVersion = "2.7", 
mirrorUrl = NULL,
   invisible(packageLocalDir)
 }
 
-robust_download_tar <- function(mirrorUrl, version, hadoopVersion, 
packageName, packageLocalPath) {
+robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath) {
   # step 1: use user-provided url
   if (!is.null(mirrorUrl)) {
 msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl)
 message(msg)
-success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
+success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
-if (success) return()
+if (success) {
+  return()
+} else {
+  message(paste0("Unable to download from mirrorUrl: ", mirrorUrl))
+}
   } else {
-message("Mirror site not provided.")
+message("MirrorUrl not provided.")
   }
 
   # step 2: use url suggested from apache website
-  message("Looking for site suggested from apache website...")
-

spark git commit: [SPARKR][MINOR] Add more examples to window function docs

2016-08-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9f924a01b -> 43273377a


[SPARKR][MINOR] Add more examples to window function docs

## What changes were proposed in this pull request?

This PR adds more examples to window function docs to make them more accessible 
to the users.

It also fixes default value issues for `lag` and `lead`.

## How was this patch tested?

Manual test, R unit test.

Author: Junyang Qian 

Closes #14779 from junyangq/SPARKR-FixWindowFunctionDocs.

(cherry picked from commit 18708f76c366c6e01b5865981666e40d8642ac20)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43273377
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43273377
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43273377

Branch: refs/heads/branch-2.0
Commit: 43273377a38a9136ff5e56929630930f076af5af
Parents: 9f924a0
Author: Junyang Qian 
Authored: Wed Aug 24 16:00:04 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 24 16:00:18 2016 -0700

--
 R/pkg/R/WindowSpec.R | 12 
 R/pkg/R/functions.R  | 78 ---
 2 files changed, 72 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/43273377/R/pkg/R/WindowSpec.R
--
diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R
index ddd2ef2..4ac83c2 100644
--- a/R/pkg/R/WindowSpec.R
+++ b/R/pkg/R/WindowSpec.R
@@ -203,6 +203,18 @@ setMethod("rangeBetween",
 #' @aliases over,Column,WindowSpec-method
 #' @family colum_func
 #' @export
+#' @examples \dontrun{
+#'   df <- createDataFrame(mtcars)
+#'
+#'   # Partition by am (transmission) and order by hp (horsepower)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#'   # Rank on hp within each partition
+#'   out <- select(df, over(rank(), ws), df$hp, df$am)
+#'
+#'   # Lag mpg values by 1 row on the partition-and-ordered table
+#'   out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note over since 2.0.0
 setMethod("over",
   signature(x = "Column", window = "WindowSpec"),

http://git-wip-us.apache.org/repos/asf/spark/blob/43273377/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index f042add..dbf8dd8 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3121,9 +3121,9 @@ setMethod("ifelse",
 #' @aliases cume_dist,missing-method
 #' @export
 #' @examples \dontrun{
-#'   df <- createDataFrame(iris)
-#'   ws <- orderBy(windowPartitionBy("Species"), "Sepal_Length")
-#'   out <- select(df, over(cume_dist(), ws), df$Sepal_Length, df$Species)
+#'   df <- createDataFrame(mtcars)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'   out <- select(df, over(cume_dist(), ws), df$hp, df$am)
 #' }
 #' @note cume_dist since 1.6.0
 setMethod("cume_dist",
@@ -3148,7 +3148,11 @@ setMethod("cume_dist",
 #' @family window_funcs
 #' @aliases dense_rank,missing-method
 #' @export
-#' @examples \dontrun{dense_rank()}
+#' @examples \dontrun{
+#'   df <- createDataFrame(mtcars)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'   out <- select(df, over(dense_rank(), ws), df$hp, df$am)
+#' }
 #' @note dense_rank since 1.6.0
 setMethod("dense_rank",
   signature("missing"),
@@ -3168,18 +3172,26 @@ setMethod("dense_rank",
 #' @param x the column as a character string or a Column to compute on.
 #' @param offset the number of rows back from the current row from which to 
obtain a value.
 #'   If not specified, the default is 1.
-#' @param defaultValue default to use when the offset row does not exist.
+#' @param defaultValue (optional) default to use when the offset row does not 
exist.
 #' @param ... further arguments to be passed to or from other methods.
 #' @rdname lag
 #' @name lag
 #' @aliases lag,characterOrColumn-method
 #' @family window_funcs
 #' @export
-#' @examples \dontrun{lag(df$c)}
+#' @examples \dontrun{
+#'   df <- createDataFrame(mtcars)
+#'
+#'   # Partition by am (transmission) and order by hp (horsepower)
+#'   ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#'   # Lag mpg values by 1 row on the partition-and-ordered table
+#'   out <- select(df, over(lag(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note lag since 1.6.0
 setMethod("lag",
   signature(x = "characterOrColumn"),
-  function(x, offset, defaultValue = NULL) {
+  function(x, offset = 1, defaultValue = NULL) {
 col <- if (class(x) == "Column") {
   x@jc
 } else {
@@ -3194,25 +3206,35 @@ setMethod("lag",
 #' lead
 #'
 #' Window function: returns the value that is \code{offset} rows after the 
current row, and
-#' NULL if there is less t

spark git commit: [SPARKR][MINOR] Fix example of spark.naiveBayes

2016-08-26 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 970ab8f6d -> 188321623


[SPARKR][MINOR] Fix example of spark.naiveBayes

## What changes were proposed in this pull request?

The original example doesn't work because the features are not categorical. 
This PR fixes this by changing to another dataset.

## How was this patch tested?

Manual test.

Author: Junyang Qian 

Closes #14820 from junyangq/SPARK-FixNaiveBayes.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18832162
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18832162
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18832162

Branch: refs/heads/master
Commit: 18832162357282ec81515b5b2ba93747be3ad18b
Parents: 970ab8f
Author: Junyang Qian 
Authored: Fri Aug 26 11:01:48 2016 -0700
Committer: Felix Cheung 
Committed: Fri Aug 26 11:01:48 2016 -0700

--
 R/pkg/R/mllib.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/18832162/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index dfc5a1c..6808aae 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -747,10 +747,11 @@ setMethod("summary", signature(object = 
"MultilayerPerceptronClassificationModel
 #' @export
 #' @examples
 #' \dontrun{
-#' df <- createDataFrame(infert)
+#' data <- as.data.frame(UCBAdmissions)
+#' df <- createDataFrame(data)
 #'
 #' # fit a Bernoulli naive Bayes model
-#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
+#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0)
 #'
 #' # get the summary of the model
 #' summary(model)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARKR][MINOR] Fix example of spark.naiveBayes

2016-08-26 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 27ed6d5dc -> 6f82d2da3


[SPARKR][MINOR] Fix example of spark.naiveBayes

## What changes were proposed in this pull request?

The original example doesn't work because the features are not categorical. 
This PR fixes this by changing to another dataset.

## How was this patch tested?

Manual test.

Author: Junyang Qian 

Closes #14820 from junyangq/SPARK-FixNaiveBayes.

(cherry picked from commit 18832162357282ec81515b5b2ba93747be3ad18b)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f82d2da
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f82d2da
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f82d2da

Branch: refs/heads/branch-2.0
Commit: 6f82d2da382cee2950a0797436e5d48805cbba5f
Parents: 27ed6d5
Author: Junyang Qian 
Authored: Fri Aug 26 11:01:48 2016 -0700
Committer: Felix Cheung 
Committed: Fri Aug 26 11:02:04 2016 -0700

--
 R/pkg/R/mllib.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6f82d2da/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 25dcb3a..b33a16a 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -433,10 +433,11 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' @export
 #' @examples
 #' \dontrun{
-#' df <- createDataFrame(infert)
+#' data <- as.data.frame(UCBAdmissions)
+#' df <- createDataFrame(data)
 #'
 #' # fit a Bernoulli naive Bayes model
-#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
+#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0)
 #'
 #' # get the summary of the model
 #' summary(model)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-17178][SPARKR][SPARKSUBMIT] Allow to set sparkr shell command through --conf

2016-08-31 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master d92cd227c -> fa6347938


[SPARK-17178][SPARKR][SPARKSUBMIT] Allow to set sparkr shell command through 
--conf

## What changes were proposed in this pull request?

Allow users to set the SparkR shell command through `--conf spark.r.shell.command`.

## How was this patch tested?

A unit test is added, and it was also verified manually through
```
bin/sparkr --master yarn-client --conf spark.r.shell.command=/usr/local/bin/R
```

Author: Jeff Zhang 

Closes #14744 from zjffdu/SPARK-17178.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa634793
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa634793
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa634793

Branch: refs/heads/master
Commit: fa6347938fc1c72ddc03a5f3cd2e929b5694f0a6
Parents: d92cd22
Author: Jeff Zhang 
Authored: Wed Aug 31 00:20:41 2016 -0700
Committer: Felix Cheung 
Committed: Wed Aug 31 00:20:41 2016 -0700

--
 docs/configuration.md | 11 ++-
 .../org/apache/spark/launcher/SparkLauncher.java  |  2 ++
 .../spark/launcher/SparkSubmitCommandBuilder.java |  3 ++-
 .../launcher/SparkSubmitCommandBuilderSuite.java  | 18 ++
 4 files changed, 32 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/docs/configuration.md
--
diff --git a/docs/configuration.md b/docs/configuration.md
index d0c76aa..6e98f67 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1786,6 +1786,14 @@ showDF(properties, numRows = 200, truncate = FALSE)
 Executable for executing R scripts in client modes for driver. Ignored in 
cluster modes.
   
 
+
+  spark.r.shell.command
+  R
+  
+Executable for executing sparkR shell in client modes for driver. Ignored 
in cluster modes. It is the same as environment variable 
SPARKR_DRIVER_R, but take precedence over it.
+spark.r.shell.command is used for sparkR shell while 
spark.r.driver.command is used for running R script.
+  
+
 
 
  Deploy
@@ -1852,7 +1860,8 @@ The following variables can be set in `spark-env.sh`:
   
   
 SPARKR_DRIVER_R
-R binary executable to use for SparkR shell (default is 
R).
+R binary executable to use for SparkR shell (default is 
R).
+Property spark.r.shell.command take precedence if it is 
set
   
   
 SPARK_LOCAL_IP

http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
--
diff --git 
a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java 
b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
index 7b7a7bf..ea56214 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
@@ -68,6 +68,8 @@ public class SparkLauncher {
 
   static final String PYSPARK_PYTHON = "spark.pyspark.python";
 
+  static final String SPARKR_R_SHELL = "spark.r.shell.command";
+
   /** Logger name to use when launching a child process. */
   public static final String CHILD_PROCESS_LOGGER_NAME = 
"spark.launcher.childProcLoggerName";
 

http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
--
diff --git 
a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
 
b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
index f6da644..29c6d82 100644
--- 
a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
+++ 
b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
@@ -336,7 +336,8 @@ class SparkSubmitCommandBuilder extends 
AbstractCommandBuilder {
 join(File.separator, sparkHome, "R", "lib", "SparkR", "profile", 
"shell.R"));
 
 List args = new ArrayList<>();
-args.add(firstNonEmpty(System.getenv("SPARKR_DRIVER_R"), "R"));
+args.add(firstNonEmpty(conf.get(SparkLauncher.SPARKR_R_SHELL),
+  System.getenv("SPARKR_DRIVER_R"), "R"));
 return args;
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java
--
diff --git 
a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java
 
b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java
index 16e5a22..ad2e7a7 100644
--- 
a/launcher/src/test/java/org/apache/spark

spark git commit: [SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when collecting SparkDataFrame

2016-09-02 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 2ab8dbdda -> 0f30cdedb


[SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when 
collecting SparkDataFrame

## What changes were proposed in this pull request?


registerTempTable(createDataFrame(iris), "iris")
str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y  
from iris limit 5")))

'data.frame':   5 obs. of  2 variables:
 $ x: num  1 1 1 1 1
 $ y:List of 5
  ..$ : num 2
  ..$ : num 2
  ..$ : num 2
  ..$ : num 2
  ..$ : num 2

The problem is that Spark returns the column type `decimal(10, 0)` instead of `decimal`, so `decimal(10, 0)` is not handled correctly; it should be handled as "double".

As discussed in the JIRA thread, there are two potential fixes:

1) Scala-side fix: add a new case when writing the object back. However, I can't use spark.sql.types._ in Spark core due to dependency issues, and I haven't found a way to pattern-match on the type there.

2) SparkR-side fix: add a helper function that detects special types like `"decimal(10, 0)"` and replaces them with `double`, which is a PRIMITIVE type. This helper is generic, so handling for new types can be added in the future.

I open this PR to discuss the pros and cons of both approaches. If we go with the Scala-side fix, we need to find a way to match DecimalType and StructType in Spark Core.
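
To make option 2) concrete, a minimal sketch of the idea (not the exact patch; the helper name and regex are illustrative only):

```
# Map backend-reported types that are not keys of PRIMITIVE_TYPES, such as
# "decimal(10, 0)", onto a primitive equivalent.
specialTypeSketch <- function(type) {
  if (grepl("^decimal(\\(.*\\))?$", type)) {
    "double"  # treat any decimal(p, s) reported by the backend as double
  } else {
    NULL      # not a special type; the caller keeps its normal handling
  }
}

specialTypeSketch("decimal(10, 0)")  # "double"
specialTypeSketch("string")          # NULL
```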

## How was this patch tested?


Manual test:
> str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y  
> from iris limit 5")))
'data.frame':   5 obs. of  2 variables:
 $ x: num  1 1 1 1 1
 $ y: num  2 2 2 2 2
R Unit tests

Author: wm...@hotmail.com 

Closes #14613 from wangmiao1981/type.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f30cded
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f30cded
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f30cded

Branch: refs/heads/master
Commit: 0f30cdedbdb0d38e8c479efab6bb1c6c376206ff
Parents: 2ab8dbd
Author: wm...@hotmail.com 
Authored: Fri Sep 2 01:47:17 2016 -0700
Committer: Felix Cheung 
Committed: Fri Sep 2 01:47:17 2016 -0700

--
 R/pkg/R/DataFrame.R   | 13 -
 R/pkg/R/types.R   | 16 
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 22 ++
 3 files changed, 50 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0f30cded/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index e12b58e..a924502 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -397,7 +397,11 @@ setMethod("coltypes",
 }
 
 if (is.null(type)) {
-  stop(paste("Unsupported data type: ", x))
+  specialtype <- specialtypeshandle(x)
+  if (is.null(specialtype)) {
+stop(paste("Unsupported data type: ", x))
+  }
+  type <- PRIMITIVE_TYPES[[specialtype]]
 }
   }
   type
@@ -1063,6 +1067,13 @@ setMethod("collect",
   df[[colIndex]] <- col
 } else {
   colType <- dtypes[[colIndex]][[2]]
+  if (is.null(PRIMITIVE_TYPES[[colType]])) {
+specialtype <- specialtypeshandle(colType)
+if (!is.null(specialtype)) {
+  colType <- specialtype
+}
+  }
+
   # Note that "binary" columns behave like complex types.
   if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != 
"binary") {
 vec <- do.call(c, col)

http://git-wip-us.apache.org/repos/asf/spark/blob/0f30cded/R/pkg/R/types.R
--
diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
index ad048b1..abca703 100644
--- a/R/pkg/R/types.R
+++ b/R/pkg/R/types.R
@@ -67,3 +67,19 @@ rToSQLTypes <- as.environment(list(
   "double" = "double",
   "character" = "string",
   "logical" = "boolean"))
+
+# Helper function for converting decimal types. When the backend returns a column
+# type in the format decimal(p, s) (e.g., decimal(10, 0)), this function converts
+# the column type to double. It converts backend-returned types that are not keys
+# of PRIMITIVE_TYPES but should be treated as PRIMITIVE_TYPES.
+# @param type A type returned from the JVM backend.
+# @return A type that is a key of PRIMITIVE_TYPES.
+specialtypeshandle <- function(type) {
+  returntype <- NULL
+  m <- regexec("^decim

spark git commit: [SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when collecting SparkDataFrame

2016-09-02 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 f9463238d -> 171bdfd96


[SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when 
collecting SparkDataFrame

## What changes were proposed in this pull request?


registerTempTable(createDataFrame(iris), "iris")
str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y  
from iris limit 5")))

'data.frame':   5 obs. of  2 variables:
 $ x: num  1 1 1 1 1
 $ y:List of 5
  ..$ : num 2
  ..$ : num 2
  ..$ : num 2
  ..$ : num 2
  ..$ : num 2

The problem is that Spark returns the column type `decimal(10, 0)` instead of 
`decimal`, so `decimal(10, 0)` is not handled correctly; it should be handled 
as "double".

As discussed in the JIRA thread, there are two potential fixes:
1) A Scala-side fix that adds a new case when writing the object back. However, 
spark.sql.types._ cannot be used in Spark Core due to dependency issues, and I 
have not found a way to do the type case match.

2) A SparkR-side fix: add a helper function that checks special types such as 
`"decimal(10, 0)"` and replaces them with `double`, which is a PRIMITIVE type. 
This helper is generic, so handling for new types can be added in the future.

I opened this PR to discuss the pros and cons of both approaches. If we want 
the Scala-side fix, we need to find a way to match the cases of DecimalType and 
StructType in Spark Core.

## How was this patch tested?


Manual test:
> str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y  
> from iris limit 5")))
'data.frame':   5 obs. of  2 variables:
 $ x: num  1 1 1 1 1
 $ y: num  2 2 2 2 2
R Unit tests

Author: wm...@hotmail.com 

Closes #14613 from wangmiao1981/type.

(cherry picked from commit 0f30cdedbdb0d38e8c479efab6bb1c6c376206ff)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/171bdfd9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/171bdfd9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/171bdfd9

Branch: refs/heads/branch-2.0
Commit: 171bdfd963b5dda85ddf5e72b72471fdaaaf2fe3
Parents: f946323
Author: wm...@hotmail.com 
Authored: Fri Sep 2 01:47:17 2016 -0700
Committer: Felix Cheung 
Committed: Fri Sep 2 01:48:11 2016 -0700

--
 R/pkg/R/DataFrame.R   | 13 -
 R/pkg/R/types.R   | 16 
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 22 ++
 3 files changed, 50 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/171bdfd9/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ab45d2c..8aea228 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -390,7 +390,11 @@ setMethod("coltypes",
 }
 
 if (is.null(type)) {
-  stop(paste("Unsupported data type: ", x))
+  specialtype <- specialtypeshandle(x)
+  if (is.null(specialtype)) {
+stop(paste("Unsupported data type: ", x))
+  }
+  type <- PRIMITIVE_TYPES[[specialtype]]
 }
   }
   type
@@ -1056,6 +1060,13 @@ setMethod("collect",
   df[[colIndex]] <- col
 } else {
   colType <- dtypes[[colIndex]][[2]]
+  if (is.null(PRIMITIVE_TYPES[[colType]])) {
+specialtype <- specialtypeshandle(colType)
+if (!is.null(specialtype)) {
+  colType <- specialtype
+}
+  }
+
   # Note that "binary" columns behave like complex types.
   if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != 
"binary") {
 vec <- do.call(c, col)

http://git-wip-us.apache.org/repos/asf/spark/blob/171bdfd9/R/pkg/R/types.R
--
diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
index ad048b1..abca703 100644
--- a/R/pkg/R/types.R
+++ b/R/pkg/R/types.R
@@ -67,3 +67,19 @@ rToSQLTypes <- as.environment(list(
   "double" = "double",
   "character" = "string",
   "logical" = "boolean"))
+
+# Helper function for converting decimal types. When the backend returns a column
+# type in the format decimal(p, s) (e.g., decimal(10, 0)), this function converts
+# the column type to double. It converts backend-returned types that are not keys
+# of PRIMITIVE_TYPES but should be treated as PRIMITIVE_TYPES.
+# @param type A type returned from the JVM backend.
+# @return A type that is a key of PRIMITIVE_TYPES.

spark git commit: [SPARK-15509][ML][SPARKR] R MLlib algorithms should support input columns "features" and "label"

2016-09-02 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 0f30cdedb -> 6969dcc79


[SPARK-15509][ML][SPARKR] R MLlib algorithms should support input columns 
"features" and "label"

https://issues.apache.org/jira/browse/SPARK-15509

## What changes were proposed in this pull request?

Currently in SparkR, when you load a LibSVM dataset using the sqlContext and 
then pass it to an MLlib algorithm, the ML wrappers will fail since they will 
try to create a "features" column, which conflicts with the existing "features" 
column from the LibSVM loader. E.g., using the "mnist" dataset from LibSVM:
`training <- loadDF(sqlContext, ".../mnist", "libsvm")`
`model <- naiveBayes(label ~ features, training)`
This fails with:
```
16/05/24 11:52:41 ERROR RBackendHandler: fit on 
org.apache.spark.ml.r.NaiveBayesWrapper failed
Error in invokeJava(isStatic = TRUE, className, methodName, ...) :
  java.lang.IllegalArgumentException: Output column features already exists.
at 
org.apache.spark.ml.feature.VectorAssembler.transformSchema(VectorAssembler.scala:120)
at 
org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:179)
at 
org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:179)
at 
scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at 
scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
at org.apache.spark.ml.Pipeline.transformSchema(Pipeline.scala:179)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:67)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:131)
at org.apache.spark.ml.feature.RFormula.fit(RFormula.scala:169)
at 
org.apache.spark.ml.r.NaiveBayesWrapper$.fit(NaiveBayesWrapper.scala:62)
        at org.apache.spark.ml.r.NaiveBayesWrapper.fit(NaiveBayesWrapper.sca
```
The same issue appears for the "label" column once you rename the "features" 
column.
The cause is that when `loadDF()` generates a DataFrame, it sometimes uses the 
default column names `"label"` and `"features"`, and these conflict with the 
defaults `setDefault(labelCol, "label")` and `setDefault(featuresCol, "features")` 
in `SharedParams.scala`.

## How was this patch tested?

Tested on my local machine.

Author: Xin Ren 

Closes #13584 from keypointt/SPARK-15509.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6969dcc7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6969dcc7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6969dcc7

Branch: refs/heads/master
Commit: 6969dcc79a33d715250958b24361f2d43552d840
Parents: 0f30cde
Author: Xin Ren 
Authored: Fri Sep 2 01:54:28 2016 -0700
Committer: Felix Cheung 
Committed: Fri Sep 2 01:54:28 2016 -0700

--
 .../ml/r/AFTSurvivalRegressionWrapper.scala |  1 +
 .../spark/ml/r/GaussianMixtureWrapper.scala |  5 +-
 .../r/GeneralizedLinearRegressionWrapper.scala  |  1 +
 .../spark/ml/r/IsotonicRegressionWrapper.scala  |  5 +-
 .../org/apache/spark/ml/r/KMeansWrapper.scala   |  5 +-
 .../apache/spark/ml/r/NaiveBayesWrapper.scala   | 11 +--
 .../org/apache/spark/ml/r/RWrapperUtils.scala   | 71 
 .../apache/spark/ml/feature/RFormulaSuite.scala |  3 -
 .../apache/spark/ml/r/RWrapperUtilsSuite.scala  | 56 +++
 9 files changed, 144 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6969dcc7/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
index 5462f80..67d037e 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
@@ -87,6 +87,7 @@ private[r] object AFTSurvivalRegressionWrapper extends 
MLReadable[AFTSurvivalReg
 val (rewritedFormula, censorCol) = formulaRewrite(formula)
 
 val rFormula = new RFormula().setFormula(rewritedFormula)
+RWrapperUtils.checkDataColumns(rFormula, data)
 val rFormulaModel = rFormula.fit(data)
 
 // get feature names from output schema

http://git-wip-us.apache.org/repos/asf/spark/blob/6969dcc7/mllib/src/main/scala/org/apache/spark/ml/r/GaussianMixtureWrapper.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GaussianMixtureWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/GaussianMixtureW

spark git commit: [SPARK-17376][SPARKR] followup - change since version

2016-09-02 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 d4ae35d02 -> 03d9af604


[SPARK-17376][SPARKR] followup - change since version

## What changes were proposed in this pull request?

change since version in doc

## How was this patch tested?

manual

Author: Felix Cheung 

Closes #14939 from felixcheung/rsparkversion2.

(cherry picked from commit eac1d0e921345b5d15aa35d8c565140292ab2af3)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/03d9af60
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/03d9af60
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/03d9af60

Branch: refs/heads/branch-2.0
Commit: 03d9af6043ae443ced004383c996fa8eebf3a1d1
Parents: d4ae35d
Author: Felix Cheung 
Authored: Fri Sep 2 11:08:25 2016 -0700
Committer: Felix Cheung 
Committed: Fri Sep 2 11:08:38 2016 -0700

--
 R/pkg/R/SQLContext.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/03d9af60/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a140454..783df53 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -169,7 +169,7 @@ sparkR.conf <- function(key, defaultValue) {
 #' sparkR.session()
 #' version <- sparkR.version()
 #' }
-#' @note sparkR.version since 2.1.0
+#' @note sparkR.version since 2.0.1
 sparkR.version <- function() {
   sparkSession <- getSparkSession()
   callJMethod(sparkSession, "version")





spark git commit: [SPARK-17376][SPARKR] followup - change since version

2016-09-02 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master e79962f2f -> eac1d0e92


[SPARK-17376][SPARKR] followup - change since version

## What changes were proposed in this pull request?

change since version in doc

## How was this patch tested?

manual

Author: Felix Cheung 

Closes #14939 from felixcheung/rsparkversion2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eac1d0e9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eac1d0e9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eac1d0e9

Branch: refs/heads/master
Commit: eac1d0e921345b5d15aa35d8c565140292ab2af3
Parents: e79962f
Author: Felix Cheung 
Authored: Fri Sep 2 11:08:25 2016 -0700
Committer: Felix Cheung 
Committed: Fri Sep 2 11:08:25 2016 -0700

--
 R/pkg/R/SQLContext.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/eac1d0e9/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a140454..783df53 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -169,7 +169,7 @@ sparkR.conf <- function(key, defaultValue) {
 #' sparkR.session()
 #' version <- sparkR.version()
 #' }
-#' @note sparkR.version since 2.1.0
+#' @note sparkR.version since 2.0.1
 sparkR.version <- function() {
   sparkSession <- getSparkSession()
   callJMethod(sparkSession, "version")





spark git commit: [SPARK-17315][SPARKR] Kolmogorov-Smirnov test SparkR wrapper

2016-09-03 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master c2a1576c2 -> abb2f9210


[SPARK-17315][SPARKR] Kolmogorov-Smirnov test SparkR wrapper

## What changes were proposed in this pull request?

This PR adds a Kolmogorov-Smirnov test wrapper to SparkR. The wrapper 
currently supports only the one-sample test against the normal distribution.
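
A minimal usage sketch (hedged: the data, column name, and parameters are 
illustrative, not taken from this patch's tests):

```
# One-sample KS test of the "test" column against the standard normal N(0, 1).
df <- createDataFrame(data.frame(test = rnorm(100)))
ksResult <- spark.kstest(df, "test", "norm")
summary(ksResult)   # p-value, test statistic, and null hypothesis
```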

## How was this patch tested?

R unit test.

Author: Junyang Qian 

Closes #14881 from junyangq/SPARK-17315.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/abb2f921
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/abb2f921
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/abb2f921

Branch: refs/heads/master
Commit: abb2f921036d97d8cab033838ae559eb731bf0fd
Parents: c2a1576
Author: Junyang Qian 
Authored: Sat Sep 3 12:26:30 2016 -0700
Committer: Felix Cheung 
Committed: Sat Sep 3 12:26:30 2016 -0700

--
 R/pkg/NAMESPACE |   7 +-
 R/pkg/R/generics.R  |   4 +
 R/pkg/R/mllib.R | 105 +++
 R/pkg/inst/tests/testthat/test_mllib.R  |  34 ++
 .../org/apache/spark/ml/r/KSTestWrapper.scala   |  57 ++
 5 files changed, 205 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/abb2f921/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ce41b51..a5e9cbd 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -42,7 +42,8 @@ exportMethods("glm",
   "spark.perplexity",
   "spark.isoreg",
   "spark.gaussianMixture",
-  "spark.als")
+  "spark.als",
+  "spark.kstest")
 
 # Job group lifecycle management methods
 export("setJobGroup",
@@ -342,7 +343,8 @@ export("as.DataFrame",
"tables",
"uncacheTable",
"print.summary.GeneralizedLinearRegressionModel",
-   "read.ml")
+   "read.ml",
+   "print.summary.KSTest")
 
 export("structField",
"structField.jobj",
@@ -366,6 +368,7 @@ S3method(print, jobj)
 S3method(print, structField)
 S3method(print, structType)
 S3method(print, summary.GeneralizedLinearRegressionModel)
+S3method(print, summary.KSTest)
 S3method(structField, character)
 S3method(structField, jobj)
 S3method(structType, jobj)

http://git-wip-us.apache.org/repos/asf/spark/blob/abb2f921/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 7e626be..67a999d 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1375,3 +1375,7 @@ setGeneric("write.ml", function(object, path, ...) { 
standardGeneric("write.ml")
 #' @rdname spark.als
 #' @export
 setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })
+
+#' @rdname spark.kstest
+#' @export
+setGeneric("spark.kstest", function(data, ...) { 
standardGeneric("spark.kstest") })

http://git-wip-us.apache.org/repos/asf/spark/blob/abb2f921/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 9a53f75..f321fd1 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -88,6 +88,13 @@ setClass("GaussianMixtureModel", representation(jobj = 
"jobj"))
 #' @note ALSModel since 2.1.0
 setClass("ALSModel", representation(jobj = "jobj"))
 
+#' S4 class that represents an KSTest
+#'
+#' @param jobj a Java object reference to the backing Scala KSTestWrapper
+#' @export
+#' @note KSTest since 2.1.0
+setClass("KSTest", representation(jobj = "jobj"))
+
 #' Saves the MLlib model to the input path
 #'
 #' Saves the MLlib model to the input path. For more information, see the 
specific
@@ -1310,3 +1317,101 @@ setMethod("write.ml", signature(object = "ALSModel", 
path = "character"),
   function(object, path, overwrite = FALSE) {
 write_internal(object, path, overwrite)
   })
+
+#' (One-Sample) Kolmogorov-Smirnov Test
+#'
+#' @description
+#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for 
data sampled from a
+#' continuous distribution.
+#'
+#' By comparing the largest difference between the empirical cumulative
+#' distribution of the sample data and the theoretical distribution, we can
+#' provide a test for the null hypothesis that the sample data comes from that
+#' theoretical distribution.
+#'
+#' Users can call \code{summary} to obtain a summary of the test, and 
\code{print.summary.KSTest}
+#' to print out a summary result.
+#'
+#' @param data a SparkDataFrame of user data.
+#' @param testCol column name where the test data is from. It should be a 
column of double type.
+#' @param nullHypothesis name of the theoretical dis

spark git commit: [SPARK-19133][SPARKR][ML][BACKPORT-2.0] fix glm for Gamma, clarify glm family supported

2017-01-11 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 6fe676c09 -> ec2fe925c


[SPARK-19133][SPARKR][ML][BACKPORT-2.0] fix glm for Gamma, clarify glm family 
supported

## What changes were proposed in this pull request?

Backport to 2.0 (cherry picking from 2.1 didn't work)

## How was this patch tested?

unit test

Author: Felix Cheung 

Closes #16543 from felixcheung/rgammabackport20.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec2fe925
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec2fe925
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec2fe925

Branch: refs/heads/branch-2.0
Commit: ec2fe925cd359ca5c132372d4b18ff791b70605a
Parents: 6fe676c
Author: Felix Cheung 
Authored: Wed Jan 11 20:01:11 2017 -0800
Committer: Felix Cheung 
Committed: Wed Jan 11 20:01:11 2017 -0800

--
 R/pkg/R/mllib.R| 7 ++-
 R/pkg/inst/tests/testthat/test_mllib.R | 8 
 2 files changed, 14 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ec2fe925/R/pkg/R/mllib.R
--
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index b33a16a..cd07f27 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -89,6 +89,8 @@ NULL
 #'   This can be a character string naming a family function, a 
family function or
 #'   the result of a call to a family function. Refer R family at
 #'   
\url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#'   Currently these families are supported: \code{binomial}, 
\code{gaussian},
+#'   \code{Gamma}, and \code{poisson}.
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param ... additional arguments passed to the method.
@@ -134,8 +136,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 
 formula <- paste(deparse(formula), collapse = "")
 
+# For known families, Gamma is upper-cased
 jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
-"fit", formula, data@sdf, family$family, 
family$link,
+"fit", formula, data@sdf, 
tolower(family$family), family$link,
 tol, as.integer(maxIter))
 return(new("GeneralizedLinearRegressionModel", jobj = jobj))
   })
@@ -150,6 +153,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 #'   This can be a character string naming a family function, a 
family function or
 #'   the result of a call to a family function. Refer R family at
 #'   
\url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#'   Currently these families are supported: \code{binomial}, 
\code{gaussian},
+#'   \code{Gamma}, and \code{poisson}.
 #' @param epsilon positive convergence tolerance of iterations.
 #' @param maxit integer giving the maximal number of IRLS iterations.
 #' @return \code{glm} returns a fitted generalized linear model.

http://git-wip-us.apache.org/repos/asf/spark/blob/ec2fe925/R/pkg/inst/tests/testthat/test_mllib.R
--
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R 
b/R/pkg/inst/tests/testthat/test_mllib.R
index 753da81..e0d2e53 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -69,6 +69,14 @@ test_that("spark.glm and predict", {
   data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
+  # Gamma family
+  x <- runif(100, -1, 1)
+  y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
+  df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
+  model <- glm(y ~ x, family = Gamma, df)
+  out <- capture.output(print(summary(model)))
+  expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+
   # Test stats::predict is working
   x <- rnorm(15)
   y <- x + rnorm(15)





spark git commit: [SPARK-12757][CORE] lower "block locks were not released" log to info level

2017-01-12 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master c6c37b8af -> 2bc4d4e28


[SPARK-12757][CORE] lower "block locks were not released" log to info level

## What changes were proposed in this pull request?

lower "block locks were not released" log to info level, as it is generating a 
lot of warnings in running ML, graph calls, as pointed out in the JIRA.

Author: Felix Cheung 

Closes #16513 from felixcheung/blocklockswarn.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2bc4d4e2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2bc4d4e2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2bc4d4e2

Branch: refs/heads/master
Commit: 2bc4d4e286e65f8b4e9ee21bccd913b62e6061f2
Parents: c6c37b8
Author: Felix Cheung 
Authored: Thu Jan 12 09:45:16 2017 -0800
Committer: Felix Cheung 
Committed: Thu Jan 12 09:45:16 2017 -0800

--
 core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2bc4d4e2/core/src/main/scala/org/apache/spark/executor/Executor.scala
--
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala 
b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 789198f..b6c0f0c 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -336,7 +336,7 @@ private[spark] class Executor(
 if (conf.getBoolean("spark.storage.exceptionOnPinLeak", false)) {
   throw new SparkException(errMsg)
 } else {
-  logWarning(errMsg)
+  logInfo(errMsg)
 }
   }
 }





spark git commit: [SPARK-18828][SPARKR] Refactor scripts for R

2017-01-16 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master a115a5439 -> c84f7d3e1


[SPARK-18828][SPARKR] Refactor scripts for R

## What changes were proposed in this pull request?

Refactored the scripts to remove duplication and give each script a clearer purpose.

## How was this patch tested?

manually

Author: Felix Cheung 

Closes #16249 from felixcheung/rscripts.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c84f7d3e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c84f7d3e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c84f7d3e

Branch: refs/heads/master
Commit: c84f7d3e1b845bc1e595ce9a6e2de663c2d218f4
Parents: a115a54
Author: Felix Cheung 
Authored: Mon Jan 16 13:49:12 2017 -0800
Committer: Felix Cheung 
Committed: Mon Jan 16 13:49:12 2017 -0800

--
 R/check-cran.sh | 32 +++---
 R/create-docs.sh| 11 
 R/create-rd.sh  | 37 ++
 R/find-r.sh | 34 
 R/install-dev.sh| 20 +++---
 R/install-source-package.sh | 57 
 dev/make-distribution.sh|  7 +++--
 7 files changed, 146 insertions(+), 52 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c84f7d3e/R/check-cran.sh
--
diff --git a/R/check-cran.sh b/R/check-cran.sh
index 1288e7f..a188b14 100755
--- a/R/check-cran.sh
+++ b/R/check-cran.sh
@@ -20,25 +20,14 @@
 set -o pipefail
 set -e
 
-FWDIR="$(cd `dirname $0`; pwd)"
+FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)"
 pushd $FWDIR > /dev/null
 
-if [ ! -z "$R_HOME" ]
-  then
-R_SCRIPT_PATH="$R_HOME/bin"
-  else
-# if system wide R_HOME is not found, then exit
-if [ ! `command -v R` ]; then
-  echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is 
properly installed."
-  exit 1
-fi
-R_SCRIPT_PATH="$(dirname $(which R))"
-fi
-echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
+. $FWDIR/find-r.sh
 
 # Install the package (this is required for code in vignettes to run when 
building it later)
 # Build the latest docs, but not vignettes, which is built with the package 
next
-$FWDIR/create-docs.sh
+. $FWDIR/install-dev.sh
 
 # Build source package with vignettes
 SPARK_HOME="$(cd "${FWDIR}"/..; pwd)"
@@ -84,19 +73,4 @@ else
   SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS 
SparkR_"$VERSION".tar.gz
 fi
 
-# Install source package to get it to generate vignettes rds files, etc.
-if [ -n "$CLEAN_INSTALL" ]
-then
-  echo "Removing lib path and installing from source package"
-  LIB_DIR="$FWDIR/lib"
-  rm -rf $LIB_DIR
-  mkdir -p $LIB_DIR
-  "$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR
-
-  # Zip the SparkR package so that it can be distributed to worker nodes on 
YARN
-  pushd $LIB_DIR > /dev/null
-  jar cfM "$LIB_DIR/sparkr.zip" SparkR
-  popd > /dev/null
-fi
-
 popd > /dev/null

http://git-wip-us.apache.org/repos/asf/spark/blob/c84f7d3e/R/create-docs.sh
--
diff --git a/R/create-docs.sh b/R/create-docs.sh
index 84e6aa9..6bef7e7 100755
--- a/R/create-docs.sh
+++ b/R/create-docs.sh
@@ -29,18 +29,19 @@ set -o pipefail
 set -e
 
 # Figure out where the script is
-export FWDIR="$(cd "`dirname "$0"`"; pwd)"
-export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+export FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)"
+export SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)"
 
 # Required for setting SPARK_SCALA_VERSION
 . "${SPARK_HOME}"/bin/load-spark-env.sh
 
 echo "Using Scala $SPARK_SCALA_VERSION"
 
-pushd $FWDIR
+pushd $FWDIR > /dev/null
+. $FWDIR/find-r.sh
 
 # Install the package (this will also generate the Rd files)
-./install-dev.sh
+. $FWDIR/install-dev.sh
 
 # Now create HTML files
 
@@ -48,7 +49,7 @@ pushd $FWDIR
 mkdir -p pkg/html
 pushd pkg/html
 
-Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); 
library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, 
"SparkR", sep="/")))'
+"$R_SCRIPT_PATH/"Rscript -e 'libDir <- "../../lib"; library(SparkR, 
lib.loc=libDir); library(knitr); knit_rd("SparkR", links = 
tools::findHTMLlinks(paste(libDir, "SparkR", sep=&quo

spark git commit: [SPARK-19066][SPARKR][BACKPORT-2.1] LDA doesn't set optimizer correctly

2017-01-17 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 3ec3e3f2e -> 29b954bba


[SPARK-19066][SPARKR][BACKPORT-2.1] LDA doesn't set optimizer correctly

## What changes were proposed in this pull request?
Backport the fix for SPARK-19066 to the 2.1 branch.
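
A hedged sketch of the behavior this backport restores (the toy data below is 
illustrative; the actual test uses a libsvm-format dataset): when 
`optimizer = "em"` is requested, the fitted model should be distributed.

```
# Tiny text corpus; spark.lda accepts a character features column and tokenizes it.
df <- createDataFrame(data.frame(features = c("a b c", "a b b c", "b c d"),
                                 stringsAsFactors = FALSE))
model <- spark.lda(df, features = "features", k = 2, maxIter = 5, optimizer = "em")
summary(model)$isDistributed   # expected TRUE once the optimizer is actually set
```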

## How was this patch tested?
Unit tests

Author: wm...@hotmail.com 

Closes #16623 from wangmiao1981/bugport.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29b954bb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29b954bb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29b954bb

Branch: refs/heads/branch-2.1
Commit: 29b954bba1a9fa6e3bd823fa36ea7df4c2461381
Parents: 3ec3e3f
Author: wm...@hotmail.com 
Authored: Tue Jan 17 21:24:33 2017 -0800
Committer: Felix Cheung 
Committed: Tue Jan 17 21:24:33 2017 -0800

--
 R/pkg/inst/tests/testthat/test_mllib.R  | 4 ++--
 mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/29b954bb/R/pkg/inst/tests/testthat/test_mllib.R
--
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R 
b/R/pkg/inst/tests/testthat/test_mllib.R
index 1f2fae9..3891f00 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -860,7 +860,7 @@ test_that("spark.lda with libsvm", {
   weights <- stats$topicTopTermsWeights
   vocabulary <- stats$vocabulary
 
-  expect_false(isDistributed)
+  expect_true(isDistributed)
   expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
   expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
   expect_equal(vocabSize, 11)
@@ -874,7 +874,7 @@ test_that("spark.lda with libsvm", {
   model2 <- read.ml(modelPath)
   stats2 <- summary(model2)
 
-  expect_false(stats2$isDistributed)
+  expect_true(stats2$isDistributed)
   expect_equal(logLikelihood, stats2$logLikelihood)
   expect_equal(logPerplexity, stats2$logPerplexity)
   expect_equal(vocabSize, stats2$vocabSize)

http://git-wip-us.apache.org/repos/asf/spark/blob/29b954bb/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala
index cbe6a70..e7851e1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala
@@ -122,6 +122,7 @@ private[r] object LDAWrapper extends MLReadable[LDAWrapper] 
{
   .setK(k)
   .setMaxIter(maxIter)
   .setSubsamplingRate(subsamplingRate)
+  .setOptimizer(optimizer)
 
 val featureSchema = data.schema(features)
 val stages = featureSchema.dataType match {





spark git commit: [SPARK-19231][SPARKR] add error handling for download and untar for Spark release

2017-01-18 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 29b954bba -> 77202a6c5


[SPARK-19231][SPARKR] add error handling for download and untar for Spark 
release

## What changes were proposed in this pull request?

When SparkR is running as a package and needs to download the Spark release 
distribution, we need to handle errors from the download and untar steps and 
clean up; otherwise it will get stuck.
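
A minimal sketch of the pattern applied here (the paths are illustrative): treat 
both errors and warnings from `untar()` as a failed extraction, clean up the 
archive, and stop instead of leaving a corrupt install behind.

```
tarPath <- file.path(tempdir(), "spark.tgz")
success <- tryCatch(
  untar(tarfile = tarPath, exdir = tempdir()) == 0,  # tar returns a non-zero code on failure
  error   = function(e) { message(e); FALSE },
  warning = function(w) { message(w); FALSE }        # a warning also counts as failure
)
if (!success) {
  unlink(tarPath)                    # remove the possibly corrupt archive
  stop("Extract archive failed.")
}
```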

## How was this patch tested?

manually

Author: Felix Cheung 

Closes #16589 from felixcheung/rtarreturncode.

(cherry picked from commit 278fa1eb305220a85c816c948932d6af8fa619aa)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77202a6c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77202a6c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77202a6c

Branch: refs/heads/branch-2.1
Commit: 77202a6c57e6ac2438cdb6bd232a187b6734fa2b
Parents: 29b954b
Author: Felix Cheung 
Authored: Wed Jan 18 09:53:14 2017 -0800
Committer: Felix Cheung 
Committed: Wed Jan 18 09:53:31 2017 -0800

--
 R/pkg/R/install.R | 55 --
 1 file changed, 40 insertions(+), 15 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/77202a6c/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index cb6bbe5..72386e6 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -54,7 +54,7 @@
 #' }
 #' @param overwrite If \code{TRUE}, download and overwrite the existing tar 
file in localDir
 #'  and force re-install Spark (in case the local directory or 
file is corrupted)
-#' @return \code{install.spark} returns the local directory where Spark is 
found or installed
+#' @return the (invisible) local directory where Spark is found or installed
 #' @rdname install.spark
 #' @name install.spark
 #' @aliases install.spark
@@ -115,17 +115,35 @@ install.spark <- function(hadoopVersion = "2.7", 
mirrorUrl = NULL,
   } else {
 if (releaseUrl != "") {
   message("Downloading from alternate URL:\n- ", releaseUrl)
-  downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", 
releaseUrl))
+  success <- downloadUrl(releaseUrl, packageLocalPath)
+  if (!success) {
+unlink(packageLocalPath)
+stop(paste0("Fetch failed from ", releaseUrl))
+  }
 } else {
   robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath)
 }
   }
 
   message(sprintf("Installing to %s", localDir))
-  untar(tarfile = packageLocalPath, exdir = localDir)
-  if (!tarExists || overwrite) {
+  # There are two ways untar can fail - untar could stop() on errors like 
incomplete block on file
+  # or, tar command can return failure code
+  success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0,
+ error = function(e) {
+   message(e)
+   message()
+   FALSE
+ },
+ warning = function(w) {
+   # Treat warning as error, add an empty line with 
message()
+   message(w)
+   message()
+   FALSE
+ })
+  if (!tarExists || overwrite || !success) {
 unlink(packageLocalPath)
   }
+  if (!success) stop("Extract archive failed.")
   message("DONE.")
   Sys.setenv(SPARK_HOME = packageLocalDir)
   message(paste("SPARK_HOME set to", packageLocalDir))
@@ -135,8 +153,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl 
= NULL,
 robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath) {
   # step 1: use user-provided url
   if (!is.null(mirrorUrl)) {
-msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl)
-message(msg)
+message("Use user-provided mirror site: ", mirrorUrl)
 success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
 if (success) {
@@ -156,7 +173,7 @@ robustDownloadTar <- function(mirrorUrl, version, 
hadoopVersion, packageName, pa
packageName, packageLocalPath)
 if (success) return()
   } else {
-message("Unable to find preferred mirror site.")
+message("Unable to download from preferred mirror site: ", mirrorUrl)
   }
 
   # step 3: use backup option
@@ -165,8 +182,11 @@ robustDownloadTar <- function(mirrorUrl, version, 
hadoopVersio

spark git commit: [SPARK-19231][SPARKR] add error handling for download and untar for Spark release

2017-01-18 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master d06172b88 -> 278fa1eb3


[SPARK-19231][SPARKR] add error handling for download and untar for Spark 
release

## What changes were proposed in this pull request?

When SparkR is running as a package and needs to download the Spark release 
distribution, we need to handle errors from the download and untar steps and 
clean up; otherwise it will get stuck.

## How was this patch tested?

manually

Author: Felix Cheung 

Closes #16589 from felixcheung/rtarreturncode.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/278fa1eb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/278fa1eb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/278fa1eb

Branch: refs/heads/master
Commit: 278fa1eb305220a85c816c948932d6af8fa619aa
Parents: d06172b
Author: Felix Cheung 
Authored: Wed Jan 18 09:53:14 2017 -0800
Committer: Felix Cheung 
Committed: Wed Jan 18 09:53:14 2017 -0800

--
 R/pkg/R/install.R | 55 --
 1 file changed, 40 insertions(+), 15 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/278fa1eb/R/pkg/R/install.R
--
diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R
index cb6bbe5..72386e6 100644
--- a/R/pkg/R/install.R
+++ b/R/pkg/R/install.R
@@ -54,7 +54,7 @@
 #' }
 #' @param overwrite If \code{TRUE}, download and overwrite the existing tar 
file in localDir
 #'  and force re-install Spark (in case the local directory or 
file is corrupted)
-#' @return \code{install.spark} returns the local directory where Spark is 
found or installed
+#' @return the (invisible) local directory where Spark is found or installed
 #' @rdname install.spark
 #' @name install.spark
 #' @aliases install.spark
@@ -115,17 +115,35 @@ install.spark <- function(hadoopVersion = "2.7", 
mirrorUrl = NULL,
   } else {
 if (releaseUrl != "") {
   message("Downloading from alternate URL:\n- ", releaseUrl)
-  downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", 
releaseUrl))
+  success <- downloadUrl(releaseUrl, packageLocalPath)
+  if (!success) {
+unlink(packageLocalPath)
+stop(paste0("Fetch failed from ", releaseUrl))
+  }
 } else {
   robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath)
 }
   }
 
   message(sprintf("Installing to %s", localDir))
-  untar(tarfile = packageLocalPath, exdir = localDir)
-  if (!tarExists || overwrite) {
+  # There are two ways untar can fail - untar could stop() on errors like 
incomplete block on file
+  # or, tar command can return failure code
+  success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0,
+ error = function(e) {
+   message(e)
+   message()
+   FALSE
+ },
+ warning = function(w) {
+   # Treat warning as error, add an empty line with 
message()
+   message(w)
+   message()
+   FALSE
+ })
+  if (!tarExists || overwrite || !success) {
 unlink(packageLocalPath)
   }
+  if (!success) stop("Extract archive failed.")
   message("DONE.")
   Sys.setenv(SPARK_HOME = packageLocalDir)
   message(paste("SPARK_HOME set to", packageLocalDir))
@@ -135,8 +153,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl 
= NULL,
 robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, 
packageLocalPath) {
   # step 1: use user-provided url
   if (!is.null(mirrorUrl)) {
-msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl)
-message(msg)
+message("Use user-provided mirror site: ", mirrorUrl)
 success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
packageName, packageLocalPath)
 if (success) {
@@ -156,7 +173,7 @@ robustDownloadTar <- function(mirrorUrl, version, 
hadoopVersion, packageName, pa
packageName, packageLocalPath)
 if (success) return()
   } else {
-message("Unable to find preferred mirror site.")
+message("Unable to download from preferred mirror site: ", mirrorUrl)
   }
 
   # step 3: use backup option
@@ -165,8 +182,11 @@ robustDownloadTar <- function(mirrorUrl, version, 
hadoopVersion, packageName, pa
   success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
   

spark git commit: [SPARK-18823][SPARKR] add support for assigning to column

2017-01-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 570e5e11d -> 9c04e427d


[SPARK-18823][SPARKR] add support for assigning to column

## What changes were proposed in this pull request?

Support for
```
df[[myname]] <- 1
df[[2]] <- df$eruptions
```
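
For illustration, a slightly fuller hedged sketch of the new behavior (the 
dataset and column names are illustrative):

```
df <- createDataFrame(faithful)
df[["twice"]] <- df$eruptions * 2   # assign a Column by name
df[[2]] <- 0                        # an atomic value of length 1 is wrapped in lit()
df[["waiting"]] <- NULL             # NULL drops the column, mirroring $<-
head(df)
```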

## How was this patch tested?

manual tests, unit tests

Author: Felix Cheung 

Closes #16663 from felixcheung/rcolset.

(cherry picked from commit f27e024768e328b96704a9ef35b77381da480328)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c04e427
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c04e427
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c04e427

Branch: refs/heads/branch-2.1
Commit: 9c04e427d0a4b99bfdb6af1ea1bc8c4bdaee724e
Parents: 570e5e1
Author: Felix Cheung 
Authored: Tue Jan 24 00:23:23 2017 -0800
Committer: Felix Cheung 
Committed: Tue Jan 24 00:23:35 2017 -0800

--
 R/pkg/R/DataFrame.R   | 48 +++---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 20 +++
 2 files changed, 55 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9c04e427/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index c79b1d3..48ac307 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1711,6 +1711,23 @@ getColumn <- function(x, c) {
   column(callJMethod(x@sdf, "col", c))
 }
 
+setColumn <- function(x, c, value) {
+  if (class(value) != "Column" && !is.null(value)) {
+if (isAtomicLengthOne(value)) {
+  value <- lit(value)
+} else {
+  stop("value must be a Column, literal value as atomic in length of 1, or 
NULL")
+}
+  }
+
+  if (is.null(value)) {
+nx <- drop(x, c)
+  } else {
+nx <- withColumn(x, c, value)
+  }
+  nx
+}
+
 #' @param name name of a Column (without being wrapped by \code{""}).
 #' @rdname select
 #' @name $
@@ -1729,19 +1746,7 @@ setMethod("$", signature(x = "SparkDataFrame"),
 #' @note $<- since 1.4.0
 setMethod("$<-", signature(x = "SparkDataFrame"),
   function(x, name, value) {
-if (class(value) != "Column" && !is.null(value)) {
-  if (isAtomicLengthOne(value)) {
-value <- lit(value)
-  } else {
-stop("value must be a Column, literal value as atomic in 
length of 1, or NULL")
-  }
-}
-
-if (is.null(value)) {
-  nx <- drop(x, name)
-} else {
-  nx <- withColumn(x, name, value)
-}
+nx <- setColumn(x, name, value)
 x@sdf <- nx@sdf
 x
   })
@@ -1762,6 +1767,21 @@ setMethod("[[", signature(x = "SparkDataFrame", i = 
"numericOrcharacter"),
   })
 
 #' @rdname subset
+#' @name [[<-
+#' @aliases [[<-,SparkDataFrame,numericOrcharacter-method
+#' @note [[<- since 2.1.1
+setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
+  function(x, i, value) {
+if (is.numeric(i)) {
+  cols <- columns(x)
+  i <- cols[[i]]
+}
+nx <- setColumn(x, i, value)
+x@sdf <- nx@sdf
+x
+  })
+
+#' @rdname subset
 #' @name [
 #' @aliases [,SparkDataFrame-method
 #' @note [ since 1.4.0
@@ -1808,6 +1828,8 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' @param j,select expression for the single Column or a list of columns to 
select from the SparkDataFrame.
 #' @param drop if TRUE, a Column will be returned if the resulting dataset has 
only one column.
 #' Otherwise, a SparkDataFrame will always be returned.
+#' @param value a Column or an atomic vector in the length of 1 as literal 
value, or \code{NULL}.
+#'  If \code{NULL}, the specified Column is dropped.
 #' @param ... currently not used.
 #' @return A new SparkDataFrame containing only the rows that meet the 
condition with selected columns.
 #' @export

http://git-wip-us.apache.org/repos/asf/spark/blob/9c04e427/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 7f27ba6..1f9daf5 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_spa

spark git commit: [SPARK-18823][SPARKR] add support for assigning to column

2017-01-24 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master ec9493b44 -> f27e02476


[SPARK-18823][SPARKR] add support for assigning to column

## What changes were proposed in this pull request?

Support for
```
df[[myname]] <- 1
df[[2]] <- df$eruptions
```

## How was this patch tested?

manual tests, unit tests

Author: Felix Cheung 

Closes #16663 from felixcheung/rcolset.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f27e0247
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f27e0247
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f27e0247

Branch: refs/heads/master
Commit: f27e024768e328b96704a9ef35b77381da480328
Parents: ec9493b
Author: Felix Cheung 
Authored: Tue Jan 24 00:23:23 2017 -0800
Committer: Felix Cheung 
Committed: Tue Jan 24 00:23:23 2017 -0800

--
 R/pkg/R/DataFrame.R   | 48 +++---
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 20 +++
 2 files changed, 55 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f27e0247/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 3d912c9..0a10122 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1717,6 +1717,23 @@ getColumn <- function(x, c) {
   column(callJMethod(x@sdf, "col", c))
 }
 
+setColumn <- function(x, c, value) {
+  if (class(value) != "Column" && !is.null(value)) {
+if (isAtomicLengthOne(value)) {
+  value <- lit(value)
+} else {
+  stop("value must be a Column, literal value as atomic in length of 1, or 
NULL")
+}
+  }
+
+  if (is.null(value)) {
+nx <- drop(x, c)
+  } else {
+nx <- withColumn(x, c, value)
+  }
+  nx
+}
+
 #' @param name name of a Column (without being wrapped by \code{""}).
 #' @rdname select
 #' @name $
@@ -1735,19 +1752,7 @@ setMethod("$", signature(x = "SparkDataFrame"),
 #' @note $<- since 1.4.0
 setMethod("$<-", signature(x = "SparkDataFrame"),
   function(x, name, value) {
-if (class(value) != "Column" && !is.null(value)) {
-  if (isAtomicLengthOne(value)) {
-value <- lit(value)
-  } else {
-stop("value must be a Column, literal value as atomic in 
length of 1, or NULL")
-  }
-}
-
-if (is.null(value)) {
-  nx <- drop(x, name)
-} else {
-  nx <- withColumn(x, name, value)
-}
+nx <- setColumn(x, name, value)
 x@sdf <- nx@sdf
 x
   })
@@ -1768,6 +1773,21 @@ setMethod("[[", signature(x = "SparkDataFrame", i = 
"numericOrcharacter"),
   })
 
 #' @rdname subset
+#' @name [[<-
+#' @aliases [[<-,SparkDataFrame,numericOrcharacter-method
+#' @note [[<- since 2.1.1
+setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
+  function(x, i, value) {
+if (is.numeric(i)) {
+  cols <- columns(x)
+  i <- cols[[i]]
+}
+nx <- setColumn(x, i, value)
+x@sdf <- nx@sdf
+x
+  })
+
+#' @rdname subset
 #' @name [
 #' @aliases [,SparkDataFrame-method
 #' @note [ since 1.4.0
@@ -1814,6 +1834,8 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' @param j,select expression for the single Column or a list of columns to 
select from the SparkDataFrame.
 #' @param drop if TRUE, a Column will be returned if the resulting dataset has 
only one column.
 #' Otherwise, a SparkDataFrame will always be returned.
+#' @param value a Column or an atomic vector in the length of 1 as literal 
value, or \code{NULL}.
+#'  If \code{NULL}, the specified Column is dropped.
 #' @param ... currently not used.
 #' @return A new SparkDataFrame containing only the rows that meet the 
condition with selected columns.
 #' @export

http://git-wip-us.apache.org/repos/asf/spark/blob/f27e0247/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 2601742..aaa8fb4 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1021,6 +1021,9 @@ test_that("select operators", {
   df$age2 <- df$age * 2
   exp

spark git commit: [SPARK-18821][SPARKR] Bisecting k-means wrapper in SparkR

2017-01-26 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master 1191fe267 -> c0ba28430


[SPARK-18821][SPARKR] Bisecting k-means wrapper in SparkR

## What changes were proposed in this pull request?

Add an R wrapper for bisecting k-means.

As JIRA is down, I will update the title to link to the corresponding JIRA later.
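
A hedged usage sketch (dataset and formula are illustrative, mirroring the other 
clustering wrappers rather than this patch's own tests; note the empty response 
in the formula):

```
df <- createDataFrame(iris)   # "." in column names becomes "_"
model <- spark.bisectingKmeans(df, ~ Sepal_Length + Sepal_Width, k = 4)
summary(model)                # cluster sizes and centers
head(predict(model, df))      # adds a prediction (cluster) column
# write.ml(model, path) and read.ml(path) round-trip the fitted model
```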

## How was this patch tested?

Add new unit tests.

Author: wm...@hotmail.com 

Closes #16566 from wangmiao1981/bk.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0ba2843
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0ba2843
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0ba2843

Branch: refs/heads/master
Commit: c0ba284300e494354f5bb205a10a12ac7daa2b5e
Parents: 1191fe2
Author: wm...@hotmail.com 
Authored: Thu Jan 26 21:01:59 2017 -0800
Committer: Felix Cheung 
Committed: Thu Jan 26 21:01:59 2017 -0800

--
 R/pkg/NAMESPACE |   3 +-
 R/pkg/R/generics.R  |   5 +
 R/pkg/R/mllib_clustering.R  | 149 +++
 R/pkg/R/mllib_utils.R   |  10 +-
 .../inst/tests/testthat/test_mllib_clustering.R |  40 +
 .../spark/ml/r/BisectingKMeansWrapper.scala | 143 ++
 .../scala/org/apache/spark/ml/r/RWrappers.scala |   2 +
 7 files changed, 347 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c0ba2843/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 0cd9cb8..caa1c3b 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -47,7 +47,8 @@ exportMethods("glm",
   "spark.kstest",
   "spark.logit",
   "spark.randomForest",
-  "spark.gbt")
+  "spark.gbt",
+  "spark.bisectingKmeans")
 
 # Job group lifecycle management methods
 export("setJobGroup",

http://git-wip-us.apache.org/repos/asf/spark/blob/c0ba2843/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 499c7b2..433c166 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1338,6 +1338,11 @@ setGeneric("rbind", signature = "...")
 #' @export
 setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })
 
+#' @rdname spark.bisectingKmeans
+#' @export
+setGeneric("spark.bisectingKmeans",
+   function(data, formula, ...) { 
standardGeneric("spark.bisectingKmeans") })
+
 #' @rdname spark.gaussianMixture
 #' @export
 setGeneric("spark.gaussianMixture",

http://git-wip-us.apache.org/repos/asf/spark/blob/c0ba2843/R/pkg/R/mllib_clustering.R
--
diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R
index fa40f9d..05bbab6 100644
--- a/R/pkg/R/mllib_clustering.R
+++ b/R/pkg/R/mllib_clustering.R
@@ -17,6 +17,13 @@
 
 # mllib_clustering.R: Provides methods for MLlib clustering algorithms 
integration
 
+#' S4 class that represents a BisectingKMeansModel
+#'
+#' @param jobj a Java object reference to the backing Scala 
BisectingKMeansModel
+#' @export
+#' @note BisectingKMeansModel since 2.2.0
+setClass("BisectingKMeansModel", representation(jobj = "jobj"))
+
 #' S4 class that represents a GaussianMixtureModel
 #'
 #' @param jobj a Java object reference to the backing Scala 
GaussianMixtureModel
@@ -38,6 +45,148 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #' @note LDAModel since 2.1.0
 setClass("LDAModel", representation(jobj = "jobj"))
 
+#' Bisecting K-Means Clustering Model
+#'
+#' Fits a bisecting k-means clustering model against a Spark DataFrame.
+#' Users can call \code{summary} to print a summary of the fitted model, 
\code{predict} to make
+#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load 
fitted models.
+#'
+#' @param data a SparkDataFrame for training.
+#' @param formula a symbolic description of the model to be fitted. Currently 
only a few formula
+#'operators are supported, including '~', '.', ':', '+', and 
'-'.
+#'Note that the response variable of formula is empty in 
spark.bisectingKmeans.
+#' @param k the desired number of leaf clusters. Must be > 1.
+#'  The actual number could be smaller if there are no divisible leaf 
clusters.
+#' @param maxIter maximum iteration number.
+#' @param seed the random seed.
+#' @param minDivisibleClusterSize The minimum number of points (if greater 
than or equal to 1.0)
+#'or the minimum proportion of points (if less 
than 1.0) of a divisible cluster.
+#'Note that it is an expert parameter. The 
default value should be good enough
+

spark git commit: [SPARK-18788][SPARKR] Add API for getNumPartitions

2017-01-26 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/master c0ba28430 -> 90817a6cd


[SPARK-18788][SPARKR] Add API for getNumPartitions

## What changes were proposed in this pull request?

With a doc note saying that this converts the DataFrame into an RDD internally.

## How was this patch tested?

unit tests, manual tests

Author: Felix Cheung 

Closes #16668 from felixcheung/rgetnumpartitions.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/90817a6c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/90817a6c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/90817a6c

Branch: refs/heads/master
Commit: 90817a6cd06068fa9f9ff77384a1fcba73b43006
Parents: c0ba284
Author: Felix Cheung 
Authored: Thu Jan 26 21:06:39 2017 -0800
Committer: Felix Cheung 
Committed: Thu Jan 26 21:06:39 2017 -0800

--
 R/pkg/NAMESPACE   |  1 +
 R/pkg/R/DataFrame.R   | 23 
 R/pkg/R/RDD.R | 30 +-
 R/pkg/R/generics.R|  8 +--
 R/pkg/R/pairRDD.R |  4 ++--
 R/pkg/inst/tests/testthat/test_rdd.R  | 10 -
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 ++--
 7 files changed, 59 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/90817a6c/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index caa1c3b..7ff6e9a 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -95,6 +95,7 @@ exportMethods("arrange",
   "freqItems",
   "gapply",
   "gapplyCollect",
+  "getNumPartitions",
   "group_by",
   "groupBy",
   "head",

http://git-wip-us.apache.org/repos/asf/spark/blob/90817a6c/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 0a10122..523343e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -3428,3 +3428,26 @@ setMethod("randomSplit",
 }
 sapply(sdfs, dataFrame)
   })
+
+#' getNumPartitions
+#'
+#' Return the number of partitions
+#'
+#' @param x A SparkDataFrame
+#' @family SparkDataFrame functions
+#' @aliases getNumPartitions,SparkDataFrame-method
+#' @rdname getNumPartitions
+#' @name getNumPartitions
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df <- createDataFrame(cars, numPartitions = 2)
+#' getNumPartitions(df)
+#' }
+#' @note getNumPartitions since 2.1.1
+setMethod("getNumPartitions",
+  signature(x = "SparkDataFrame"),
+  function(x) {
+callJMethod(callJMethod(x@sdf, "rdd"), "getNumPartitions")
+  })

http://git-wip-us.apache.org/repos/asf/spark/blob/90817a6c/R/pkg/R/RDD.R
--
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 0f1162f..91bab33 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -313,7 +313,7 @@ setMethod("checkpoint",
 #' @rdname getNumPartitions
 #' @aliases getNumPartitions,RDD-method
 #' @noRd
-setMethod("getNumPartitions",
+setMethod("getNumPartitionsRDD",
   signature(x = "RDD"),
   function(x) {
 callJMethod(getJRDD(x), "getNumPartitions")
@@ -329,7 +329,7 @@ setMethod("numPartitions",
   signature(x = "RDD"),
   function(x) {
 .Deprecated("getNumPartitions")
-getNumPartitions(x)
+getNumPartitionsRDD(x)
   })
 
 #' Collect elements of an RDD
@@ -460,7 +460,7 @@ setMethod("countByValue",
   signature(x = "RDD"),
   function(x) {
 ones <- lapply(x, function(item) { list(item, 1L) })
-collectRDD(reduceByKey(ones, `+`, getNumPartitions(x)))
+collectRDD(reduceByKey(ones, `+`, getNumPartitionsRDD(x)))
   })
 
 #' Apply a function to all elements
@@ -780,7 +780,7 @@ setMethod("takeRDD",
 resList <- list()
 index <- -1
 jrdd <- getJRDD(x)
-numPartitions <- getNumPartitions(x)
+numPartitions <- getNumPartitionsRDD(x)
 serializedModeRDD <- getSerializedMode(x)
 
 # TODO(shivaram): Collect more than one partition based on size
@@ -846,7 +846,7 @@ setMethod("firstRDD",
 #' @noRd
 setMeth

spark git commit: [SPARK-18788][SPARKR] Add API for getNumPartitions

2017-01-26 Thread felixcheung
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 59502bbcf -> ba2a5ada4


[SPARK-18788][SPARKR] Add API for getNumPartitions

## What changes were proposed in this pull request?

With a doc note saying that this converts the DataFrame into an RDD internally.

## How was this patch tested?

unit tests, manual tests

Author: Felix Cheung 

Closes #16668 from felixcheung/rgetnumpartitions.

(cherry picked from commit 90817a6cd06068fa9f9ff77384a1fcba73b43006)
Signed-off-by: Felix Cheung 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ba2a5ada
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ba2a5ada
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ba2a5ada

Branch: refs/heads/branch-2.1
Commit: ba2a5ada4825a9ca3e4e954a51574a2eede096a3
Parents: 59502bb
Author: Felix Cheung 
Authored: Thu Jan 26 21:06:39 2017 -0800
Committer: Felix Cheung 
Committed: Thu Jan 26 21:06:54 2017 -0800

--
 R/pkg/NAMESPACE   |  1 +
 R/pkg/R/DataFrame.R   | 23 
 R/pkg/R/RDD.R | 30 +-
 R/pkg/R/generics.R|  8 +--
 R/pkg/R/pairRDD.R |  4 ++--
 R/pkg/inst/tests/testthat/test_rdd.R  | 10 -
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 ++--
 7 files changed, 59 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ba2a5ada/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index c3ec3f4..8a19fd0 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -94,6 +94,7 @@ exportMethods("arrange",
   "freqItems",
   "gapply",
   "gapplyCollect",
+  "getNumPartitions",
   "group_by",
   "groupBy",
   "head",

http://git-wip-us.apache.org/repos/asf/spark/blob/ba2a5ada/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 48ac307..39e8376 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -3422,3 +3422,26 @@ setMethod("randomSplit",
 }
 sapply(sdfs, dataFrame)
   })
+
+#' getNumPartitions
+#'
+#' Return the number of partitions
+#'
+#' @param x A SparkDataFrame
+#' @family SparkDataFrame functions
+#' @aliases getNumPartitions,SparkDataFrame-method
+#' @rdname getNumPartitions
+#' @name getNumPartitions
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df <- createDataFrame(cars, numPartitions = 2)
+#' getNumPartitions(df)
+#' }
+#' @note getNumPartitions since 2.1.1
+setMethod("getNumPartitions",
+  signature(x = "SparkDataFrame"),
+  function(x) {
+callJMethod(callJMethod(x@sdf, "rdd"), "getNumPartitions")
+  })

http://git-wip-us.apache.org/repos/asf/spark/blob/ba2a5ada/R/pkg/R/RDD.R
--
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 0f1162f..91bab33 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -313,7 +313,7 @@ setMethod("checkpoint",
 #' @rdname getNumPartitions
 #' @aliases getNumPartitions,RDD-method
 #' @noRd
-setMethod("getNumPartitions",
+setMethod("getNumPartitionsRDD",
   signature(x = "RDD"),
   function(x) {
 callJMethod(getJRDD(x), "getNumPartitions")
@@ -329,7 +329,7 @@ setMethod("numPartitions",
   signature(x = "RDD"),
   function(x) {
 .Deprecated("getNumPartitions")
-getNumPartitions(x)
+getNumPartitionsRDD(x)
   })
 
 #' Collect elements of an RDD
@@ -460,7 +460,7 @@ setMethod("countByValue",
   signature(x = "RDD"),
   function(x) {
 ones <- lapply(x, function(item) { list(item, 1L) })
-collectRDD(reduceByKey(ones, `+`, getNumPartitions(x)))
+collectRDD(reduceByKey(ones, `+`, getNumPartitionsRDD(x)))
   })
 
 #' Apply a function to all elements
@@ -780,7 +780,7 @@ setMethod("takeRDD",
 resList <- list()
 index <- -1
 jrdd <- getJRDD(x)
-numPartitions <- getNumPartitions(x)
+numPartitions <- getNumPartitionsRDD(x)
 serializedModeRDD <- getSerializedMode(x)
 
 # TODO(shivaram): Collect more than o
