spark git commit: [SPARK-20550][SPARKR] R wrapper for Dataset.alias
Repository: spark Updated Branches: refs/heads/master 500436b43 -> 1f73d3589 [SPARK-20550][SPARKR] R wrapper for Dataset.alias ## What changes were proposed in this pull request? - Add SparkR wrapper for `Dataset.alias`. - Adjust roxygen annotations for `functions.alias` (including example usage). ## How was this patch tested? Unit tests, `check_cran.sh`. Author: zero323 Closes #17825 from zero323/SPARK-20550. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1f73d358 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1f73d358 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1f73d358 Branch: refs/heads/master Commit: 1f73d3589a84b78473598c17ac328a9805896778 Parents: 500436b Author: zero323 Authored: Sun May 7 16:24:42 2017 -0700 Committer: Felix Cheung Committed: Sun May 7 16:24:42 2017 -0700 -- R/pkg/R/DataFrame.R | 24 R/pkg/R/column.R | 16 R/pkg/R/generics.R| 11 +++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 10 ++ 4 files changed, 53 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 1c88692..b56dddc 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3745,3 +3745,27 @@ setMethod("hint", jdf <- callJMethod(x@sdf, "hint", name, parameters) dataFrame(jdf) }) + +#' alias +#' +#' @aliases alias,SparkDataFrame-method +#' @family SparkDataFrame functions +#' @rdname alias +#' @name alias +#' @export +#' @examples +#' \dontrun{ +#' df <- alias(createDataFrame(mtcars), "mtcars") +#' avg_mpg <- alias(agg(groupBy(df, df$cyl), avg(df$mpg)), "avg_mpg") +#' +#' head(select(df, column("mtcars.mpg"))) +#' head(join(df, avg_mpg, column("mtcars.cyl") == column("avg_mpg.cyl"))) +#' } +#' @note alias(SparkDataFrame) since 2.3.0 +setMethod("alias", + signature(object = "SparkDataFrame"), + function(object, data) { +stopifnot(is.character(data)) +sdf <- 
callJMethod(object@sdf, "alias", data) +dataFrame(sdf) + }) http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/R/column.R -- diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 147ee4b..5740780 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -130,19 +130,19 @@ createMethods <- function() { createMethods() -#' alias -#' -#' Set a new name for a column -#' -#' @param object Column to rename -#' @param data new name to use -#' #' @rdname alias #' @name alias #' @aliases alias,Column-method #' @family colum_func #' @export -#' @note alias since 1.4.0 +#' @examples \dontrun{ +#' df <- createDataFrame(iris) +#' +#' head(select( +#' df, alias(df$Sepal_Length, "slength"), alias(df$Petal_Length, "plength") +#' )) +#' } +#' @note alias(Column) since 1.4.0 setMethod("alias", signature(object = "Column"), function(object, data) { http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e835ef3..3c84bf8 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -387,6 +387,17 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) #' @export setGeneric("agg", function (x, ...) { standardGeneric("agg") }) +#' alias +#' +#' Returns a new SparkDataFrame or a Column with an alias set. Equivalent to SQL "AS" keyword. +#' +#' @name alias +#' @rdname alias +#' @param object x a SparkDataFrame or a Column +#' @param data new name to use +#' @return a SparkDataFrame or a Column +NULL + #' @rdname arrange #' @export setGeneric("arrange", function(x, col, ...) 
{ standardGeneric("arrange") }) http://git-wip-us.apache.org/repos/asf/spark/blob/1f73d358/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 232246d..0856bab 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1223,6 +1223,16 @@ test_that("select with column", { expect_equal(columns(df4), c("name", "age")) expect_equal(count(df4), 3) + # Test select with alias + df5 <- alias(df, "table") + + expect_equal(columns(select(df5, column("table.name"))), "name") + expect_equal(columns(select(df5, "table.name")), "name") + + # Test that stats::alias is not m
spark git commit: [SPARK-20626][SPARKR] address date test warning with timezone on windows
Repository: spark Updated Branches: refs/heads/master 22691556e -> c24bdaab5 [SPARK-20626][SPARKR] address date test warning with timezone on windows ## What changes were proposed in this pull request? set timezone on windows ## How was this patch tested? unit test, AppVeyor Author: Felix Cheung Closes #17892 from felixcheung/rtimestamptest. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c24bdaab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c24bdaab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c24bdaab Branch: refs/heads/master Commit: c24bdaab5a234d18b273544cefc44cc4005bf8fc Parents: 2269155 Author: Felix Cheung Authored: Sun May 7 23:10:18 2017 -0700 Committer: Felix Cheung Committed: Sun May 7 23:10:18 2017 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c24bdaab/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 0856bab..f517ce6 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -96,6 +96,10 @@ mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}} mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLinesMapType, mapTypeJsonPath) +if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") +} + test_that("calling sparkRSQL.init returns existing SQL context", { skip_on_cran() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20626][SPARKR] address date test warning with timezone on windows
Repository: spark Updated Branches: refs/heads/branch-2.2 048e9890c -> 6c5b7e106 [SPARK-20626][SPARKR] address date test warning with timezone on windows ## What changes were proposed in this pull request? set timezone on windows ## How was this patch tested? unit test, AppVeyor Author: Felix Cheung Closes #17892 from felixcheung/rtimestamptest. (cherry picked from commit c24bdaab5a234d18b273544cefc44cc4005bf8fc) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6c5b7e10 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6c5b7e10 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6c5b7e10 Branch: refs/heads/branch-2.2 Commit: 6c5b7e106895302a87cf6522d3c64c3badac699f Parents: 048e989 Author: Felix Cheung Authored: Sun May 7 23:10:18 2017 -0700 Committer: Felix Cheung Committed: Sun May 7 23:10:42 2017 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6c5b7e10/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3c985f2..3f445e2 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -96,6 +96,10 @@ mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}} mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLinesMapType, mapTypeJsonPath) +if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") +} + test_that("calling sparkRSQL.init returns existing SQL context", { skip_on_cran() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][DOC] fix typo in vignettes
Repository: spark Updated Branches: refs/heads/branch-2.2 6c5b7e106 -> d8a5a0d34 [SPARKR][DOC] fix typo in vignettes ## What changes were proposed in this pull request? Fix typo in vignettes Author: Wayne Zhang Closes #17884 from actuaryzhang/typo. (cherry picked from commit 2fdaeb52bbe2ed1a9127ac72917286e505303c85) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8a5a0d3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8a5a0d3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8a5a0d3 Branch: refs/heads/branch-2.2 Commit: d8a5a0d3420abbb911d8a80dc7165762eb08d779 Parents: 6c5b7e1 Author: Wayne Zhang Authored: Sun May 7 23:16:30 2017 -0700 Committer: Felix Cheung Committed: Sun May 7 23:16:44 2017 -0700 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 36 +++ 1 file changed, 18 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d8a5a0d3/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index b933c59..0f6d5c2 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` fun head(carsDF) ``` -Common data processing operations such as `filter`, `select` are supported on the `SparkDataFrame`. +Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`. ```{r} carsSubDF <- select(carsDF, "model", "mpg", "hp") carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200) @@ -364,7 +364,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema) head(collect(out)) ``` -Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function should be a `data.frame`, but no schema is required in this case. 
Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. ```{r} out <- dapplyCollect( @@ -390,7 +390,7 @@ result <- gapply( head(arrange(result, "max_mpg", decreasing = TRUE)) ``` -Like gapply, `gapplyCollect` applies a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. ```{r} result <- gapplyCollect( @@ -443,20 +443,20 @@ options(ops) ### SQL Queries -A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. +A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. 
```{r} people <- read.df(paste0(sparkR.conf("spark.home"), "/examples/src/main/resources/people.json"), "json") ``` -Register this SparkDataFrame as a temporary view. +Register this `SparkDataFrame` as a temporary view. ```{r} createOrReplaceTempView(people, "people") ``` -SQL statements can be run by using the sql method. +SQL statements can be run using the sql method. ```{r} teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") head(teenagers) @@ -765,7 +765,7 @@ head(predict(isoregModel, newDF)) `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write
spark git commit: [SPARKR][DOC] fix typo in vignettes
Repository: spark Updated Branches: refs/heads/master 42cc6d13e -> 2fdaeb52b [SPARKR][DOC] fix typo in vignettes ## What changes were proposed in this pull request? Fix typo in vignettes Author: Wayne Zhang Closes #17884 from actuaryzhang/typo. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2fdaeb52 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2fdaeb52 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2fdaeb52 Branch: refs/heads/master Commit: 2fdaeb52bbe2ed1a9127ac72917286e505303c85 Parents: 42cc6d1 Author: Wayne Zhang Authored: Sun May 7 23:16:30 2017 -0700 Committer: Felix Cheung Committed: Sun May 7 23:16:30 2017 -0700 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 36 +++ 1 file changed, 18 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2fdaeb52/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index d38ec4f..49f4ab8 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` fun head(carsDF) ``` -Common data processing operations such as `filter`, `select` are supported on the `SparkDataFrame`. +Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`. ```{r} carsSubDF <- select(carsDF, "model", "mpg", "hp") carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200) @@ -379,7 +379,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema) head(collect(out)) ``` -Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function should be a `data.frame`, but no schema is required in this case. 
Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. ```{r} out <- dapplyCollect( @@ -405,7 +405,7 @@ result <- gapply( head(arrange(result, "max_mpg", decreasing = TRUE)) ``` -Like gapply, `gapplyCollect` applies a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. ```{r} result <- gapplyCollect( @@ -458,20 +458,20 @@ options(ops) ### SQL Queries -A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. +A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. 
```{r} people <- read.df(paste0(sparkR.conf("spark.home"), "/examples/src/main/resources/people.json"), "json") ``` -Register this SparkDataFrame as a temporary view. +Register this `SparkDataFrame` as a temporary view. ```{r} createOrReplaceTempView(people, "people") ``` -SQL statements can be run by using the sql method. +SQL statements can be run using the sql method. ```{r} teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") head(teenagers) @@ -780,7 +780,7 @@ head(predict(isoregModel, newDF)) `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. -Similar to the random forest example above, we use the `longl
spark git commit: [SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails
Repository: spark Updated Branches: refs/heads/master 2abfee18b -> b952b44af [SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails ## What changes were proposed in this pull request? Change it to check for relative count like in this test https://github.com/apache/spark/blame/master/R/pkg/inst/tests/testthat/test_sparkSQL.R#L3355 for catalog APIs ## How was this patch tested? unit tests, this needs to combine with another commit with SQL change to check Author: Felix Cheung Closes #17905 from felixcheung/rtabletests. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b952b44a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b952b44a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b952b44a Branch: refs/heads/master Commit: b952b44af4d243f1e3ad88bccf4af7d04df3fc81 Parents: 2abfee1 Author: Felix Cheung Authored: Mon May 8 22:49:40 2017 -0700 Committer: Felix Cheung Committed: Mon May 8 22:49:40 2017 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b952b44a/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index ab6888e..19aa61e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -677,26 +677,27 @@ test_that("jsonRDD() on a RDD with json string", { }) test_that("test tableNames and tables", { - # Making sure there are no registered temp tables from previous tests - suppressWarnings(sapply(tableNames(), function(tname) { dropTempTable(tname) })) + count <- count(listTables()) + df <- read.json(jsonPath) createOrReplaceTempView(df, "table1") - expect_equal(length(tableNames()), 1) - expect_equal(length(tableNames("default")), 1) + expect_equal(length(tableNames()), count + 1) + 
expect_equal(length(tableNames("default")), count + 1) + tables <- listTables() - expect_equal(count(tables), 1) + expect_equal(count(tables), count + 1) expect_equal(count(tables()), count(tables)) expect_true("tableName" %in% colnames(tables())) expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables( suppressWarnings(registerTempTable(df, "table2")) tables <- listTables() - expect_equal(count(tables), 2) + expect_equal(count(tables), count + 2) suppressWarnings(dropTempTable("table1")) expect_true(dropTempView("table2")) tables <- listTables() - expect_equal(count(tables), 0) + expect_equal(count(tables), count + 0) }) test_that( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails
Repository: spark Updated Branches: refs/heads/branch-2.2 4179ffc03 -> 54e074349 [SPARK-20661][SPARKR][TEST][FOLLOWUP] SparkR tableNames() test fails ## What changes were proposed in this pull request? Change it to check for relative count like in this test https://github.com/apache/spark/blame/master/R/pkg/inst/tests/testthat/test_sparkSQL.R#L3355 for catalog APIs ## How was this patch tested? unit tests, this needs to combine with another commit with SQL change to check Author: Felix Cheung Closes #17905 from felixcheung/rtabletests. (cherry picked from commit b952b44af4d243f1e3ad88bccf4af7d04df3fc81) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54e07434 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54e07434 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54e07434 Branch: refs/heads/branch-2.2 Commit: 54e07434968624dbb0fb80773356e614b954e52f Parents: 4179ffc Author: Felix Cheung Authored: Mon May 8 22:49:40 2017 -0700 Committer: Felix Cheung Committed: Mon May 8 22:49:53 2017 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54e07434/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 58cd259..ae2969f 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -668,26 +668,27 @@ test_that("jsonRDD() on a RDD with json string", { }) test_that("test tableNames and tables", { - # Making sure there are no registered temp tables from previous tests - suppressWarnings(sapply(tableNames(), function(tname) { dropTempTable(tname) })) + count <- count(listTables()) + df <- read.json(jsonPath) createOrReplaceTempView(df, "table1") - expect_equal(length(tableNames()), 1) - 
expect_equal(length(tableNames("default")), 1) + expect_equal(length(tableNames()), count + 1) + expect_equal(length(tableNames("default")), count + 1) + tables <- listTables() - expect_equal(count(tables), 1) + expect_equal(count(tables), count + 1) expect_equal(count(tables()), count(tables)) expect_true("tableName" %in% colnames(tables())) expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables( suppressWarnings(registerTempTable(df, "table2")) tables <- listTables() - expect_equal(count(tables), 2) + expect_equal(count(tables), count + 2) suppressWarnings(dropTempTable("table1")) expect_true(dropTempView("table2")) tables <- listTables() - expect_equal(count(tables), 0) + expect_equal(count(tables), count + 0) }) test_that( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20670][ML] Simplify FPGrowth transform
Repository: spark Updated Branches: refs/heads/master a90c5cd82 -> a819dab66 [SPARK-20670][ML] Simplify FPGrowth transform ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-20670 As suggested by Sean Owen in https://github.com/apache/spark/pull/17130, the transform code in FPGrowthModel can be simplified. As I tested on some public dataset http://fimi.ua.ac.be/data/, the performance of the new transform code is even or better than the old implementation. ## How was this patch tested? Existing unit test. Author: Yuhao Yang Closes #17912 from hhbyyh/fpgrowthTransform. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a819dab6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a819dab6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a819dab6 Branch: refs/heads/master Commit: a819dab6681f3a16615039227865af188b3c3f2a Parents: a90c5cd Author: Yuhao Yang Authored: Tue May 9 23:39:26 2017 -0700 Committer: Felix Cheung Committed: Tue May 9 23:39:26 2017 -0700 -- mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a819dab6/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala index 8f00daa..12804d0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala @@ -269,12 +269,8 @@ class FPGrowthModel private[ml] ( val predictUDF = udf((items: Seq[_]) => { if (items != null) { val itemset = items.toSet -brRules.value.flatMap(rule => - if (items != null && rule._1.forall(item => itemset.contains(item))) { -rule._2.filter(item => !itemset.contains(item)) - } else { -Seq.empty - }).distinct 
+brRules.value.filter(_._1.forall(itemset.contains)) + .flatMap(_._2.filter(!itemset.contains(_))).distinct } else { Seq.empty }}, dt) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20704][SPARKR] change CRAN test to run single thread
Repository: spark Updated Branches: refs/heads/master c8da53560 -> 888b84abe [SPARK-20704][SPARKR] change CRAN test to run single thread ## What changes were proposed in this pull request? - [x] need to test by running R CMD check --as-cran - [x] sanity check vignettes ## How was this patch tested? Jenkins Author: Felix Cheung Closes #17945 from felixcheung/rchangesforpackage. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/888b84ab Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/888b84ab Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/888b84ab Branch: refs/heads/master Commit: 888b84abe8d3fd36c5c2226aeb9e202029936f94 Parents: c8da535 Author: Felix Cheung Authored: Thu May 11 23:10:04 2017 -0700 Committer: Felix Cheung Committed: Thu May 11 23:10:04 2017 -0700 -- R/pkg/inst/tests/testthat/jarTest.R | 2 +- R/pkg/inst/tests/testthat/packageInAJarTest.R | 2 +- R/pkg/inst/tests/testthat/test_Serde.R | 2 +- R/pkg/inst/tests/testthat/test_binaryFile.R | 2 +- R/pkg/inst/tests/testthat/test_binary_function.R| 2 +- R/pkg/inst/tests/testthat/test_broadcast.R | 2 +- R/pkg/inst/tests/testthat/test_context.R| 16 R/pkg/inst/tests/testthat/test_includePackage.R | 2 +- R/pkg/inst/tests/testthat/test_jvm_api.R| 2 +- .../inst/tests/testthat/test_mllib_classification.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_clustering.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_fpm.R | 2 +- .../inst/tests/testthat/test_mllib_recommendation.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_regression.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_stat.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_tree.R | 2 +- .../inst/tests/testthat/test_parallelize_collect.R | 2 +- R/pkg/inst/tests/testthat/test_rdd.R| 2 +- R/pkg/inst/tests/testthat/test_shuffle.R| 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- R/pkg/inst/tests/testthat/test_streaming.R | 2 +- R/pkg/inst/tests/testthat/test_take.R | 2 +- 
R/pkg/inst/tests/testthat/test_textFile.R | 2 +- R/pkg/inst/tests/testthat/test_utils.R | 2 +- R/pkg/tests/run-all.R | 5 + R/pkg/vignettes/sparkr-vignettes.Rmd| 3 ++- 26 files changed, 38 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/jarTest.R -- diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R index c9615c8..e2241e0 100644 --- a/R/pkg/inst/tests/testthat/jarTest.R +++ b/R/pkg/inst/tests/testthat/jarTest.R @@ -16,7 +16,7 @@ # library(SparkR) -sc <- sparkR.session() +sc <- sparkR.session(master = "local[1]") helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass", "helloWorld", http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/packageInAJarTest.R -- diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R index 4bc935c..ac70626 100644 --- a/R/pkg/inst/tests/testthat/packageInAJarTest.R +++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R @@ -17,7 +17,7 @@ library(SparkR) library(sparkPackageTest) -sparkR.session() +sparkR.session(master = "local[1]") run1 <- myfunc(5L) http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/test_Serde.R -- diff --git a/R/pkg/inst/tests/testthat/test_Serde.R b/R/pkg/inst/tests/testthat/test_Serde.R index 518fb7b..6e160fa 100644 --- a/R/pkg/inst/tests/testthat/test_Serde.R +++ b/R/pkg/inst/tests/testthat/test_Serde.R @@ -17,7 +17,7 @@ context("SerDe functionality") -sparkSession <- sparkR.session(enableHiveSupport = FALSE) +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) test_that("SerDe of primitive types", { skip_on_cran() http://git-wip-us.apache.org/repos/asf/spark/blob/888b84ab/R/pkg/inst/tests/testthat/test_binaryFile.R -- diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R index 63f54e1..00954fa 100644 --- 
a/R/pkg/inst/tests/testthat/test_binaryFile.R +++ b/R/pkg/inst/tests/testthat/test_binaryFile.R @@ -18,7 +18,7 @@ context("fu
spark git commit: [SPARK-20704][SPARKR] change CRAN test to run single thread
Repository: spark Updated Branches: refs/heads/branch-2.2 a8d981dc5 -> c1e5ac267 [SPARK-20704][SPARKR] change CRAN test to run single thread ## What changes were proposed in this pull request? - [x] need to test by running R CMD check --as-cran - [x] sanity check vignettes ## How was this patch tested? Jenkins Author: Felix Cheung Closes #17945 from felixcheung/rchangesforpackage. (cherry picked from commit 888b84abe8d3fd36c5c2226aeb9e202029936f94) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1e5ac26 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1e5ac26 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1e5ac26 Branch: refs/heads/branch-2.2 Commit: c1e5ac267fcf73b96c28bb08797de98624df15dc Parents: a8d981d Author: Felix Cheung Authored: Thu May 11 23:10:04 2017 -0700 Committer: Felix Cheung Committed: Thu May 11 23:10:19 2017 -0700 -- R/pkg/inst/tests/testthat/jarTest.R | 2 +- R/pkg/inst/tests/testthat/packageInAJarTest.R | 2 +- R/pkg/inst/tests/testthat/test_Serde.R | 2 +- R/pkg/inst/tests/testthat/test_binaryFile.R | 2 +- R/pkg/inst/tests/testthat/test_binary_function.R| 2 +- R/pkg/inst/tests/testthat/test_broadcast.R | 2 +- R/pkg/inst/tests/testthat/test_context.R| 16 R/pkg/inst/tests/testthat/test_includePackage.R | 2 +- R/pkg/inst/tests/testthat/test_jvm_api.R| 2 +- .../inst/tests/testthat/test_mllib_classification.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_clustering.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_fpm.R | 2 +- .../inst/tests/testthat/test_mllib_recommendation.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_regression.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_stat.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_tree.R | 2 +- .../inst/tests/testthat/test_parallelize_collect.R | 2 +- R/pkg/inst/tests/testthat/test_rdd.R| 2 +- R/pkg/inst/tests/testthat/test_shuffle.R| 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 
R/pkg/inst/tests/testthat/test_streaming.R | 2 +- R/pkg/inst/tests/testthat/test_take.R | 2 +- R/pkg/inst/tests/testthat/test_textFile.R | 2 +- R/pkg/inst/tests/testthat/test_utils.R | 2 +- R/pkg/tests/run-all.R | 5 + R/pkg/vignettes/sparkr-vignettes.Rmd| 3 ++- 26 files changed, 38 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/jarTest.R -- diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R index c9615c8..e2241e0 100644 --- a/R/pkg/inst/tests/testthat/jarTest.R +++ b/R/pkg/inst/tests/testthat/jarTest.R @@ -16,7 +16,7 @@ # library(SparkR) -sc <- sparkR.session() +sc <- sparkR.session(master = "local[1]") helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass", "helloWorld", http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/packageInAJarTest.R -- diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R index 4bc935c..ac70626 100644 --- a/R/pkg/inst/tests/testthat/packageInAJarTest.R +++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R @@ -17,7 +17,7 @@ library(SparkR) library(sparkPackageTest) -sparkR.session() +sparkR.session(master = "local[1]") run1 <- myfunc(5L) http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/test_Serde.R -- diff --git a/R/pkg/inst/tests/testthat/test_Serde.R b/R/pkg/inst/tests/testthat/test_Serde.R index 518fb7b..6e160fa 100644 --- a/R/pkg/inst/tests/testthat/test_Serde.R +++ b/R/pkg/inst/tests/testthat/test_Serde.R @@ -17,7 +17,7 @@ context("SerDe functionality") -sparkSession <- sparkR.session(enableHiveSupport = FALSE) +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) test_that("SerDe of primitive types", { skip_on_cran() http://git-wip-us.apache.org/repos/asf/spark/blob/c1e5ac26/R/pkg/inst/tests/testthat/test_binaryFile.R -- diff --git 
a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R index 63f54e1..00954fa 100644 --- a/R/pkg/inst/tests/test
spark git commit: [SPARK-20619][ML] StringIndexer supports multiple ways to order labels
Repository: spark Updated Branches: refs/heads/master 888b84abe -> af40bb115 [SPARK-20619][ML] StringIndexer supports multiple ways to order label ## What changes were proposed in this pull request? StringIndexer maps labels to numbers according to the descending order of label frequency. Other types of ordering (e.g., alphabetical) may be needed in feature ETL. For example, the ordering will affect the result in one-hot encoding and RFormula. This PR proposes to support other ordering methods and we add a parameter `stringOrderType` that supports the following four options: - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) - 'alphabetDesc': descending alphabetical order - 'alphabetAsc': ascending alphabetical order The default is still descending order of label frequency, so there should be no impact to existing programs. ## How was this patch tested? new test Author: Wayne Zhang Closes #17879 from actuaryzhang/stringIndexer. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af40bb11 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af40bb11 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af40bb11 Branch: refs/heads/master Commit: af40bb1159b1f443bf44594c716d2f2dd3c98640 Parents: 888b84a Author: Wayne Zhang Authored: Fri May 12 00:12:47 2017 -0700 Committer: Felix Cheung Committed: Fri May 12 00:12:47 2017 -0700 -- .../apache/spark/ml/feature/StringIndexer.scala | 55 +--- .../spark/ml/feature/StringIndexerSuite.scala | 23 2 files changed, 71 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af40bb11/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 99321bc..b2dc4fc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -59,6 +59,29 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha @Since("1.6.0") def getHandleInvalid: String = $(handleInvalid) + /** + * Param for how to order labels of string column. The first label after ordering is assigned + * an index of 0. + * Options are: + * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) + * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) + * - 'alphabetDesc': descending alphabetical order + * - 'alphabetAsc': ascending alphabetical order + * Default is 'frequencyDesc'. + * + * @group param + */ + @Since("2.3.0") + final val stringOrderType: Param[String] = new Param(this, "stringOrderType", +"how to order labels of string column. " + +"The first label after ordering is assigned an index of 0. 
" + +s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", +ParamValidators.inArray(StringIndexer.supportedStringOrderType)) + + /** @group getParam */ + @Since("2.3.0") + def getStringOrderType: String = $(stringOrderType) + /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputColName = $(inputCol) @@ -79,8 +102,9 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha /** * A label indexer that maps a string column of labels to an ML column of label indices. * If the input column is numeric, we cast it to string and index the string values. - * The indices are in [0, numLabels), ordered by label frequencies. - * So the most frequent label gets index 0. + * The indices are in [0, numLabels). By default, this is ordered by label frequencies + * so the most frequent label gets index 0. The ordering behavior is controlled by + * setting `stringOrderType`. * * @see `IndexToString` for the inverse transformation */ @@ -97,6 +121,11 @@ class StringIndexer @Since("1.4.0") ( def setHandleInvalid(value: String): this.type = set(handleInvalid, value) /** @group setParam */ + @Since("2.3.0") + def setStringOrderType(value: String): this.type = set(stringOrderType, value) + setDefault(stringOrderType, StringIndexer.frequencyDesc) + + /** @group setParam */ @Since("1.4.0") def setInputCol(value: String): this.type = set(inputCol, value) @@ -107,11 +136,17 @@ class StringIndexer @Since("1.4.0") ( @Since("2.0.0") override def fit(dataset: Dataset[_])
spark git commit: [DOCS][SPARKR] Use verbose names for family annotations in functions.R
Repository: spark Updated Branches: refs/heads/master 1283c3d11 -> aa3df1590 [DOCS][SPARKR] Use verbose names for family annotations in functions.R ## What changes were proposed in this pull request? - Change current short annotations (same as Scala `group`) to verbose names (same as Scala `groupname`). Before: ![image](https://cloud.githubusercontent.com/assets/1554276/26033909/9a98b596-38b4-11e7-961e-15fd9ea7440d.png) After: ![image](https://cloud.githubusercontent.com/assets/1554276/26033903/727a9944-38b4-11e7-8873-b09c553f4ec3.png) - Add missing `family` annotations. ## How was this patch tested? `check-cran.R` (skipping tests), manual inspection. Author: zero323 Closes #17976 from zero323/SPARKR-FUNCTIONS-DOCSTRINGS. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aa3df159 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aa3df159 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aa3df159 Branch: refs/heads/master Commit: aa3df15904f95bc05c513d6f7c186a45db5ffa88 Parents: 1283c3d Author: zero323 Authored: Sun May 14 11:43:28 2017 -0700 Committer: Felix Cheung Committed: Sun May 14 11:43:28 2017 -0700 -- R/pkg/R/functions.R | 318 +++ 1 file changed, 159 insertions(+), 159 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aa3df159/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 2fd2d36..a6c2dea 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -24,7 +24,7 @@ NULL #' If the parameter is a \linkS4class{Column}, it is returned unchanged. #' #' @param x a literal value or a Column. 
-#' @family normal_funcs +#' @family non-aggregate functions #' @rdname lit #' @name lit #' @export @@ -52,7 +52,7 @@ setMethod("lit", signature("ANY"), #' #' @rdname abs #' @name abs -#' @family normal_funcs +#' @family non-aggregate functions #' @export #' @examples \dontrun{abs(df$c)} #' @aliases abs,Column-method @@ -73,7 +73,7 @@ setMethod("abs", #' #' @rdname acos #' @name acos -#' @family math_funcs +#' @family math functions #' @export #' @examples \dontrun{acos(df$c)} #' @aliases acos,Column-method @@ -113,7 +113,7 @@ setMethod("approxCountDistinct", #' #' @rdname ascii #' @name ascii -#' @family string_funcs +#' @family string functions #' @export #' @aliases ascii,Column-method #' @examples \dontrun{\dontrun{ascii(df$c)}} @@ -134,7 +134,7 @@ setMethod("ascii", #' #' @rdname asin #' @name asin -#' @family math_funcs +#' @family math functions #' @export #' @aliases asin,Column-method #' @examples \dontrun{asin(df$c)} @@ -154,7 +154,7 @@ setMethod("asin", #' #' @rdname atan #' @name atan -#' @family math_funcs +#' @family math functions #' @export #' @aliases atan,Column-method #' @examples \dontrun{atan(df$c)} @@ -172,7 +172,7 @@ setMethod("atan", #' #' @rdname avg #' @name avg -#' @family agg_funcs +#' @family aggregate functions #' @export #' @aliases avg,Column-method #' @examples \dontrun{avg(df$c)} @@ -193,7 +193,7 @@ setMethod("avg", #' #' @rdname base64 #' @name base64 -#' @family string_funcs +#' @family string functions #' @export #' @aliases base64,Column-method #' @examples \dontrun{base64(df$c)} @@ -214,7 +214,7 @@ setMethod("base64", #' #' @rdname bin #' @name bin -#' @family math_funcs +#' @family math functions #' @export #' @aliases bin,Column-method #' @examples \dontrun{bin(df$c)} @@ -234,7 +234,7 @@ setMethod("bin", #' #' @rdname bitwiseNOT #' @name bitwiseNOT -#' @family normal_funcs +#' @family non-aggregate functions #' @export #' @aliases bitwiseNOT,Column-method #' @examples \dontrun{bitwiseNOT(df$c)} @@ -254,7 +254,7 @@ 
setMethod("bitwiseNOT", #' #' @rdname cbrt #' @name cbrt -#' @family math_funcs +#' @family math functions #' @export #' @aliases cbrt,Column-method #' @examples \dontrun{cbrt(df$c)} @@ -274,7 +274,7 @@ setMethod("cbrt", #' #' @rdname ceil #' @name ceil -#' @family math_funcs +#' @family math functions #' @export #' @aliases ceil,Column-method #' @examples \dontrun{ceil(df$c)} @@ -292,7 +292,7 @@ setMethod("ceil", #' #' @rdname coalesce #' @name coalesce -#' @family normal_funcs +#' @family non-aggregate functions #' @export #' @aliases coalesce,Column-method #' @examples \dontrun{coalesce(df$c, df$d, df$e)} @@ -324,7 +324,7 @@ col <- function(x) { #' #' @rdname column #' @name column -#' @family normal_funcs +#' @family non-aggregate functions #' @export #' @aliases column,character-method #' @examples \dontrun{column("name")} @@ -342,7 +342,7 @@ setMethod("column", #' #' @rdname corr #' @name corr -#' @family math_funcs +#' @family math functions #' @export #'
spark git commit: [SPARK-20726][SPARKR] wrapper for SQL broadcast
Repository: spark Updated Branches: refs/heads/master aa3df1590 -> 5a799fd8c [SPARK-20726][SPARKR] wrapper for SQL broadcast ## What changes were proposed in this pull request? - Adds R wrapper for `o.a.s.sql.functions.broadcast`. - Renames `broadcast` to `broadcast_`. ## How was this patch tested? Unit tests, check `check-cran.sh`. Author: zero323 Closes #17965 from zero323/SPARK-20726. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a799fd8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a799fd8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a799fd8 Branch: refs/heads/master Commit: 5a799fd8c3664da1fa9821ead6c0e25f561c6a8d Parents: aa3df15 Author: zero323 Authored: Sun May 14 13:22:19 2017 -0700 Committer: Felix Cheung Committed: Sun May 14 13:22:19 2017 -0700 -- R/pkg/NAMESPACE| 1 + R/pkg/R/DataFrame.R| 29 + R/pkg/R/context.R | 4 ++-- R/pkg/R/generics.R | 4 R/pkg/inst/tests/testthat/test_broadcast.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 5 + R/pkg/inst/tests/testthat/test_utils.R | 2 +- 7 files changed, 43 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ba0fe77..5c074d3 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -84,6 +84,7 @@ exportClasses("SparkDataFrame") exportMethods("arrange", "as.data.frame", "attach", + "broadcast", "cache", "checkpoint", "coalesce", http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b56dddc..aab2fc1 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3769,3 +3769,32 @@ setMethod("alias", sdf <- callJMethod(object@sdf, "alias", data) dataFrame(sdf) }) + +#' broadcast +#' +#' Return a new SparkDataFrame marked as small enough for use in broadcast joins. +#' +#' Equivalent to \code{hint(x, "broadcast")}. 
+#' +#' @param x a SparkDataFrame. +#' @return a SparkDataFrame. +#' +#' @aliases broadcast,SparkDataFrame-method +#' @family SparkDataFrame functions +#' @rdname broadcast +#' @name broadcast +#' @export +#' @examples +#' \dontrun{ +#' df <- createDataFrame(mtcars) +#' avg_mpg <- mean(groupBy(createDataFrame(mtcars), "cyl"), "mpg") +#' +#' head(join(df, broadcast(avg_mpg), df$cyl == avg_mpg$cyl)) +#' } +#' @note broadcast since 2.3.0 +setMethod("broadcast", + signature(x = "SparkDataFrame"), + function(x) { +sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", x@sdf) +dataFrame(sdf) + }) http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 50856e3..8349b57 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -258,7 +258,7 @@ includePackage <- function(sc, pkg) { #' #' # Large Matrix object that we want to broadcast #' randomMat <- matrix(nrow=100, ncol=10, data=rnorm(1000)) -#' randomMatBr <- broadcast(sc, randomMat) +#' randomMatBr <- broadcastRDD(sc, randomMat) #' #' # Use the broadcast variable inside the function #' useBroadcast <- function(x) { @@ -266,7 +266,7 @@ includePackage <- function(sc, pkg) { #' } #' sumRDD <- lapply(rdd, useBroadcast) #'} -broadcast <- function(sc, object) { +broadcastRDD <- function(sc, object) { objName <- as.character(substitute(object)) serializedObj <- serialize(object, connection = NULL) http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 3c84bf8..514ca99 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -799,6 +799,10 @@ setGeneric("write.df", function(df, path = NULL, ...) 
{ standardGeneric("write.d #' @export setGeneric("randomSplit", function(x, weights, seed) { standardGeneric("randomSplit") }) +#' @rdname broadcast +#' @export +setGeneric("broadcast", function(x) { standardGeneric("broadcast") }) + ## Column Methods ## #' @rdname columnfunctions http://git-wip-us.apache.org/repos/asf/spark/blob/5a799fd8/R/pkg/inst/tests/testthat/test_broadcast.R ---
spark git commit: [SPARKR][DOCS][MINOR] Use consistent names in rollup and cube examples
Repository: spark Updated Branches: refs/heads/master ea3b1e352 -> 2d90c04f2 [SPARKR][DOCS][MINOR] Use consistent names in rollup and cube examples ## What changes were proposed in this pull request? Rename `carsDF` to `df` in SparkR `rollup` and `cube` examples. ## How was this patch tested? Manual tests. Author: zero323 Closes #17988 from zero323/cube-docs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d90c04f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d90c04f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d90c04f Branch: refs/heads/master Commit: 2d90c04f2343b0ce6cb4d76320bf583934cb9993 Parents: ea3b1e3 Author: zero323 Authored: Fri May 19 11:04:38 2017 -0700 Committer: Felix Cheung Committed: Fri May 19 11:04:38 2017 -0700 -- R/pkg/R/DataFrame.R | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2d90c04f/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index aab2fc1..2b5888f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3667,8 +3667,8 @@ setMethod("checkpoint", #' mean(cube(df, "cyl", "gear", "am"), "mpg") #' #' # Following calls are equivalent -#' agg(cube(carsDF), mean(carsDF$mpg)) -#' agg(carsDF, mean(carsDF$mpg)) +#' agg(cube(df), mean(df$mpg)) +#' agg(df, mean(df$mpg)) #' } #' @note cube since 2.3.0 #' @seealso \link{agg}, \link{groupBy}, \link{rollup} @@ -3702,8 +3702,8 @@ setMethod("cube", #' mean(rollup(df, "cyl", "gear", "am"), "mpg") #' #' # Following calls are equivalent -#' agg(rollup(carsDF), mean(carsDF$mpg)) -#' agg(carsDF, mean(carsDF$mpg)) +#' agg(rollup(df), mean(df$mpg)) +#' agg(df, mean(df$mpg)) #' } #' @note rollup since 2.3.0 #' @seealso \link{agg}, \link{cube}, \link{groupBy} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR] Fix bad examples in DataFrame methods and style issues
Repository: spark Updated Branches: refs/heads/master 2d90c04f2 -> 7f203a248 [SPARKR] Fix bad examples in DataFrame methods and style issues ## What changes were proposed in this pull request? Some examples in the DataFrame methods are syntactically wrong, even though they are pseudo code. Fix these and some style issues. Author: Wayne Zhang Closes #18003 from actuaryzhang/sparkRDoc3. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f203a24 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f203a24 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f203a24 Branch: refs/heads/master Commit: 7f203a248f94df6183a4bc4642a3d873171fef29 Parents: 2d90c04 Author: Wayne Zhang Authored: Fri May 19 11:18:20 2017 -0700 Committer: Felix Cheung Committed: Fri May 19 11:18:20 2017 -0700 -- R/pkg/R/DataFrame.R | 14 +++-- R/pkg/R/WindowSpec.R | 3 ++- R/pkg/R/column.R | 6 -- R/pkg/R/functions.R | 51 +++ 4 files changed, 48 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 2b5888f..166b398 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -549,7 +549,7 @@ setMethod("registerTempTable", #' sparkR.session() #' df <- read.df(path, "parquet") #' df2 <- read.df(path2, "parquet") -#' createOrReplaceTempView(df, "table1") +#' saveAsTable(df, "table1") #' insertInto(df2, "table1", overwrite = TRUE) #'} #' @note insertInto since 1.4.0 @@ -1125,7 +1125,8 @@ setMethod("dim", #' path <- "path/to/file.json" #' df <- read.json(path) #' collected <- collect(df) -#' firstName <- collected[[1]]$name +#' class(collected) +#' firstName <- names(collected)[1] #' } #' @note collect since 1.4.0 setMethod("collect", @@ -2814,7 +2815,7 @@ setMethod("except", #' path <- "path/to/file.json" #' df <- read.json(path) #' write.df(df, "myfile", "parquet", "overwrite") -#' saveDF(df, parquetPath2, 
"parquet", mode = saveMode, mergeSchema = mergeSchema) +#' saveDF(df, parquetPath2, "parquet", mode = "append", mergeSchema = TRUE) #' } #' @note write.df since 1.4.0 setMethod("write.df", @@ -3097,8 +3098,8 @@ setMethod("fillna", #' @family SparkDataFrame functions #' @aliases as.data.frame,SparkDataFrame-method #' @rdname as.data.frame -#' @examples \dontrun{ -#' +#' @examples +#' \dontrun{ #' irisDF <- createDataFrame(iris) #' df <- as.data.frame(irisDF[irisDF$Species == "setosa", ]) #' } @@ -3175,7 +3176,8 @@ setMethod("with", #' @aliases str,SparkDataFrame-method #' @family SparkDataFrame functions #' @param object a SparkDataFrame -#' @examples \dontrun{ +#' @examples +#' \dontrun{ #' # Create a SparkDataFrame from the Iris dataset #' irisDF <- createDataFrame(iris) #' http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 4ac83c2..81beac9 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -203,7 +203,8 @@ setMethod("rangeBetween", #' @aliases over,Column,WindowSpec-method #' @family colum_func #' @export -#' @examples \dontrun{ +#' @examples +#' \dontrun{ #' df <- createDataFrame(mtcars) #' #' # Partition by am (transmission) and order by hp (horsepower) http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/column.R -- diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 5740780..a5c2ea8 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -135,7 +135,8 @@ createMethods() #' @aliases alias,Column-method #' @family colum_func #' @export -#' @examples \dontrun{ +#' @examples +#' \dontrun{ #' df <- createDataFrame(iris) #' #' head(select( @@ -244,7 +245,8 @@ setMethod("between", signature(x = "Column"), #' @family colum_func #' @aliases cast,Column-method #' -#' @examples \dontrun{ +#' @examples +#' \dontrun{ #' cast(df$age, "string") #' } #' @note cast since 1.4.0 
http://git-wip-us.apache.org/repos/asf/spark/blob/7f203a24/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a6c2dea..06a9019 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3257,7 +3257,8 @@ setMethod("when", signature(condition = "Column", value = "ANY"), #' @aliases ifelse,Column-method #' @seealso \link{when} #' @export -#' @examples \dontrun{ +#' @examples +#' \dontrun{ #' ifelse(df$a > 1 & df$b > 2, 0, 1) #' ifelse(df$a > 1, df$a, 1) #' } @@ -3292,7 +3293,8 @
spark git commit: [SPARK-20736][PYTHON] PySpark StringIndexer supports StringOrderType
Repository: spark Updated Branches: refs/heads/master 9d6661c82 -> 0f2f56c37 [SPARK-20736][PYTHON] PySpark StringIndexer supports StringOrderType ## What changes were proposed in this pull request? PySpark StringIndexer supports StringOrderType added in #17879. Author: Wayne Zhang Closes #17978 from actuaryzhang/PythonStringIndexer. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f2f56c3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f2f56c3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f2f56c3 Branch: refs/heads/master Commit: 0f2f56c37b8d09eec2722a5ffba3015d7f3b626f Parents: 9d6661c Author: Wayne Zhang Authored: Sun May 21 16:51:55 2017 -0700 Committer: Felix Cheung Committed: Sun May 21 16:51:55 2017 -0700 -- python/pyspark/ml/feature.py | 51 +-- 1 file changed, 43 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f2f56c3/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8d25f5b..955bc97 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2082,10 +2082,12 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, """ A label indexer that maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it to string and index the string values. -The indices are in [0, numLabels), ordered by label frequencies. -So the most frequent label gets index 0. +The indices are in [0, numLabels). By default, this is ordered by label frequencies +so the most frequent label gets index 0. The ordering behavior is controlled by +setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'. ->>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid='error') +>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", +... 
stringOrderType="frequencyDesc") >>> model = stringIndexer.fit(stringIndDf) >>> td = model.transform(stringIndDf) >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), @@ -2111,26 +2113,45 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, >>> loadedInverter = IndexToString.load(indexToStringPath) >>> loadedInverter.getLabels() == inverter.getLabels() True +>>> stringIndexer.getStringOrderType() +'frequencyDesc' +>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", +... stringOrderType="alphabetDesc") +>>> model = stringIndexer.fit(stringIndDf) +>>> td = model.transform(stringIndDf) +>>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), +... key=lambda x: x[0]) +[(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)] .. versionadded:: 1.4.0 """ +stringOrderType = Param(Params._dummy(), "stringOrderType", +"How to order labels of string column. The first label after " + +"ordering is assigned an index of 0. 
Supported options: " + +"frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.", +typeConverter=TypeConverters.toString) + @keyword_only -def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"): +def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", + stringOrderType="frequencyDesc"): """ -__init__(self, inputCol=None, outputCol=None, handleInvalid="error") +__init__(self, inputCol=None, outputCol=None, handleInvalid="error", \ + stringOrderType="frequencyDesc") """ super(StringIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid) -self._setDefault(handleInvalid="error") +self._setDefault(handleInvalid="error", stringOrderType="frequencyDesc") kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.4.0") -def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"): +def setParams(self, inputCol=None, outputCol=None, handleInvalid="error", + stringOrderType="frequencyDesc"): """ -setParams(self, inputCol=None, outputCol=None, handleInvalid="error") +setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \ + stringOrderType="frequencyDesc")
spark git commit: [SPARK-15767][ML][SPARKR] Decision Tree wrapper in SparkR
Repository: spark Updated Branches: refs/heads/master 363091100 -> 4be337583 [SPARK-15767][ML][SPARKR] Decision Tree wrapper in SparkR ## What changes were proposed in this pull request? support decision tree in R ## How was this patch tested? added tests Author: Zheng RuiFeng Closes #17981 from zhengruifeng/dt_r. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4be33758 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4be33758 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4be33758 Branch: refs/heads/master Commit: 4be33758354e1f95fd1d82a5482f3f00218e8c91 Parents: 3630911 Author: Zheng RuiFeng Authored: Mon May 22 10:40:49 2017 -0700 Committer: Felix Cheung Committed: Mon May 22 10:40:49 2017 -0700 -- R/pkg/NAMESPACE | 5 + R/pkg/R/generics.R | 5 + R/pkg/R/mllib_tree.R| 240 +++ R/pkg/R/mllib_utils.R | 14 +- R/pkg/inst/tests/testthat/test_mllib_tree.R | 86 +++ .../r/DecisionTreeClassificationWrapper.scala | 152 .../ml/r/DecisionTreeRegressionWrapper.scala| 137 +++ .../scala/org/apache/spark/ml/r/RWrappers.scala | 4 + 8 files changed, 639 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4be33758/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 5c074d3..4e3fe00 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -63,6 +63,7 @@ exportMethods("glm", "spark.als", "spark.kstest", "spark.logit", + "spark.decisionTree", "spark.randomForest", "spark.gbt", "spark.bisectingKmeans", @@ -414,6 +415,8 @@ export("as.DataFrame", "print.summary.GeneralizedLinearRegressionModel", "read.ml", "print.summary.KSTest", + "print.summary.DecisionTreeRegressionModel", + "print.summary.DecisionTreeClassificationModel", "print.summary.RandomForestRegressionModel", "print.summary.RandomForestClassificationModel", "print.summary.GBTRegressionModel", @@ -452,6 +455,8 @@ S3method(print, structField) S3method(print, structType) S3method(print, 
summary.GeneralizedLinearRegressionModel) S3method(print, summary.KSTest) +S3method(print, summary.DecisionTreeRegressionModel) +S3method(print, summary.DecisionTreeClassificationModel) S3method(print, summary.RandomForestRegressionModel) S3method(print, summary.RandomForestClassificationModel) S3method(print, summary.GBTRegressionModel) http://git-wip-us.apache.org/repos/asf/spark/blob/4be33758/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 514ca99..5630d0c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1506,6 +1506,11 @@ setGeneric("spark.mlp", function(data, formula, ...) { standardGeneric("spark.ml #' @export setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("spark.naiveBayes") }) +#' @rdname spark.decisionTree +#' @export +setGeneric("spark.decisionTree", + function(data, formula, ...) { standardGeneric("spark.decisionTree") }) + #' @rdname spark.randomForest #' @export setGeneric("spark.randomForest", http://git-wip-us.apache.org/repos/asf/spark/blob/4be33758/R/pkg/R/mllib_tree.R -- diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 82279be..2f1220a 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -45,6 +45,20 @@ setClass("RandomForestRegressionModel", representation(jobj = "jobj")) #' @note RandomForestClassificationModel since 2.1.0 setClass("RandomForestClassificationModel", representation(jobj = "jobj")) +#' S4 class that represents a DecisionTreeRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala DecisionTreeRegressionModel +#' @export +#' @note DecisionTreeRegressionModel since 2.3.0 +setClass("DecisionTreeRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a DecisionTreeClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala DecisionTreeClassificationModel +#' @export +#' @note DecisionTreeClassificationModel since 2.3.0 +setClass("DecisionTreeClassificationModel", 
representation(jobj = "jobj")) + # Create the summary of a tree ensemble model (eg. Random Forest, GBT) summary.treeEnsemble <- function(model) { jobj <- model@jobj @@ -81,6 +95,36 @@ print.summary.treeEnsemble <- function(x) { invisible(x) } +# Create the summary
spark git commit: [SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR
Repository: spark Updated Branches: refs/heads/master a2460be9c -> 4dbb63f08 [SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR ## What changes were proposed in this pull request? - Add a null check to RPackageUtils#checkManifestForR so that jars w/o manifests don't NPE. ## How was this patch tested? - Unit tests and manual tests. Author: James Shuster Closes #18040 from jrshust/feature/r-package-utils. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4dbb63f0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4dbb63f0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4dbb63f0 Branch: refs/heads/master Commit: 4dbb63f0857a9cfb018cf49e3d1103cacc862ba2 Parents: a2460be Author: James Shuster Authored: Mon May 22 21:41:11 2017 -0700 Committer: Felix Cheung Committed: Mon May 22 21:41:11 2017 -0700 -- .../scala/org/apache/spark/deploy/RPackageUtils.scala | 3 +++ .../scala/org/apache/spark/deploy/IvyTestUtils.scala | 14 ++ .../org/apache/spark/deploy/RPackageUtilsSuite.scala | 10 ++ 3 files changed, 23 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4dbb63f0/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala index 050778a..7d356e8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -92,6 +92,9 @@ private[deploy] object RPackageUtils extends Logging { * Exposed for testing. 
*/ private[deploy] def checkManifestForR(jar: JarFile): Boolean = { +if (jar.getManifest == null) { + return false +} val manifest = jar.getManifest.getMainAttributes manifest.getValue(hasRPackage) != null && manifest.getValue(hasRPackage).trim == "true" } http://git-wip-us.apache.org/repos/asf/spark/blob/4dbb63f0/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala index f50cb38..42b8cde 100644 --- a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala +++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala @@ -243,16 +243,22 @@ private[deploy] object IvyTestUtils { withManifest: Option[Manifest] = None): File = { val jarFile = new File(dir, artifactName(artifact, useIvyLayout)) val jarFileStream = new FileOutputStream(jarFile) -val manifest = withManifest.getOrElse { - val mani = new Manifest() +val manifest: Manifest = withManifest.getOrElse { if (withR) { +val mani = new Manifest() val attr = mani.getMainAttributes attr.put(Name.MANIFEST_VERSION, "1.0") attr.put(new Name("Spark-HasRPackage"), "true") +mani + } else { +null } - mani } -val jarStream = new JarOutputStream(jarFileStream, manifest) +val jarStream = if (manifest != null) { + new JarOutputStream(jarFileStream, manifest) +} else { + new JarOutputStream(jarFileStream) +} for (file <- files) { val jarEntry = new JarEntry(file._1) http://git-wip-us.apache.org/repos/asf/spark/blob/4dbb63f0/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala index 0055870..5e0bf6d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala @@ -133,6 +133,16 @@ class RPackageUtilsSuite 
} } + test("jars without manifest return false") { +IvyTestUtils.withRepository(main, None, None) { repo => + val jar = IvyTestUtils.packJar(new File(new URI(repo)), dep1, Nil, +useIvyLayout = false, withR = false, None) + val jarFile = new JarFile(jar) + assert(jarFile.getManifest == null, "jar file should have null manifest") + assert(!RPackageUtils.checkManifestForR(jarFile), "null manifest should return false") +} + } + test("SparkR zipping works properly") { val tempDir = Files.createTempDir() Utils.tryWithSafeFinally { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For
spark git commit: [SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR
Repository: spark Updated Branches: refs/heads/branch-2.2 d8328d8d1 -> ddc199eef [SPARK-20815][SPARKR] NullPointerException in RPackageUtils#checkManifestForR ## What changes were proposed in this pull request? - Add a null check to RPackageUtils#checkManifestForR so that jars w/o manifests don't NPE. ## How was this patch tested? - Unit tests and manual tests. Author: James Shuster Closes #18040 from jrshust/feature/r-package-utils. (cherry picked from commit 4dbb63f0857a9cfb018cf49e3d1103cacc862ba2) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ddc199ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ddc199ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ddc199ee Branch: refs/heads/branch-2.2 Commit: ddc199eefbf68223f817a4c756b243362c1a95ca Parents: d8328d8 Author: James Shuster Authored: Mon May 22 21:41:11 2017 -0700 Committer: Felix Cheung Committed: Mon May 22 21:41:23 2017 -0700 -- .../scala/org/apache/spark/deploy/RPackageUtils.scala | 3 +++ .../scala/org/apache/spark/deploy/IvyTestUtils.scala | 14 ++ .../org/apache/spark/deploy/RPackageUtilsSuite.scala | 10 ++ 3 files changed, 23 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ddc199ee/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala index 050778a..7d356e8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -92,6 +92,9 @@ private[deploy] object RPackageUtils extends Logging { * Exposed for testing. 
*/ private[deploy] def checkManifestForR(jar: JarFile): Boolean = { +if (jar.getManifest == null) { + return false +} val manifest = jar.getManifest.getMainAttributes manifest.getValue(hasRPackage) != null && manifest.getValue(hasRPackage).trim == "true" } http://git-wip-us.apache.org/repos/asf/spark/blob/ddc199ee/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala index f50cb38..42b8cde 100644 --- a/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala +++ b/core/src/test/scala/org/apache/spark/deploy/IvyTestUtils.scala @@ -243,16 +243,22 @@ private[deploy] object IvyTestUtils { withManifest: Option[Manifest] = None): File = { val jarFile = new File(dir, artifactName(artifact, useIvyLayout)) val jarFileStream = new FileOutputStream(jarFile) -val manifest = withManifest.getOrElse { - val mani = new Manifest() +val manifest: Manifest = withManifest.getOrElse { if (withR) { +val mani = new Manifest() val attr = mani.getMainAttributes attr.put(Name.MANIFEST_VERSION, "1.0") attr.put(new Name("Spark-HasRPackage"), "true") +mani + } else { +null } - mani } -val jarStream = new JarOutputStream(jarFileStream, manifest) +val jarStream = if (manifest != null) { + new JarOutputStream(jarFileStream, manifest) +} else { + new JarOutputStream(jarFileStream) +} for (file <- files) { val jarEntry = new JarEntry(file._1) http://git-wip-us.apache.org/repos/asf/spark/blob/ddc199ee/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala index 0055870..5e0bf6d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala @@ -133,6 +133,16 @@ class RPackageUtilsSuite 
} } + test("jars without manifest return false") { +IvyTestUtils.withRepository(main, None, None) { repo => + val jar = IvyTestUtils.packJar(new File(new URI(repo)), dep1, Nil, +useIvyLayout = false, withR = false, None) + val jarFile = new JarFile(jar) + assert(jarFile.getManifest == null, "jar file should have null manifest") + assert(!RPackageUtils.checkManifestForR(jarFile), "null manifest should return false") +} + } + test("SparkR zipping works properly") { val tempDir = Files.createTempDir() Utils.tryWithSafeFinally {
spark git commit: [SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows
Repository: spark Updated Branches: refs/heads/master 4dbb63f08 -> d06610f99 [SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows ## What changes were proposed in this pull request? This change skips tests that use the Hadoop libraries while running on CRAN check with Windows as the operating system. This is to handle cases where the Hadoop winutils binaries are missing on the target system. The skipped tests consist of 1. Tests that save, load a model in MLlib 2. Tests that save, load CSV, JSON and Parquet files in SQL 3. Hive tests ## How was this patch tested? Tested by running on a local windows VM with HADOOP_HOME unset. Also testing with https://win-builder.r-project.org Author: Shivaram Venkataraman Closes #17966 from shivaram/sparkr-windows-cran. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d06610f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d06610f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d06610f9 Branch: refs/heads/master Commit: d06610f992ccf199928c0a71699fbf4c01705c31 Parents: 4dbb63f Author: Shivaram Venkataraman Authored: Mon May 22 23:04:22 2017 -0700 Committer: Felix Cheung Committed: Mon May 22 23:04:22 2017 -0700 -- R/pkg/R/utils.R | 16 + .../tests/testthat/test_mllib_classification.R | 90 +++-- .../inst/tests/testthat/test_mllib_clustering.R | 112 +++--- R/pkg/inst/tests/testthat/test_mllib_fpm.R | 16 +- .../tests/testthat/test_mllib_recommendation.R | 42 +- .../inst/tests/testthat/test_mllib_regression.R | 42 +- R/pkg/inst/tests/testthat/test_mllib_tree.R | 112 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 396 ++- 8 files changed, 445 insertions(+), 381 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d06610f9/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index d29af00..ea45e39 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -907,3 +907,19 @@ basenameSansExtFromUrl <- function(url) { 
isAtomicLengthOne <- function(x) { is.atomic(x) && length(x) == 1 } + +is_cran <- function() { + !identical(Sys.getenv("NOT_CRAN"), "true") +} + +is_windows <- function() { + .Platform$OS.type == "windows" +} + +hadoop_home_set <- function() { + !identical(Sys.getenv("HADOOP_HOME"), "") +} + +not_cran_or_windows_with_hadoop <- function() { + !is_cran() && (!is_windows() || hadoop_home_set()) +} http://git-wip-us.apache.org/repos/asf/spark/blob/d06610f9/R/pkg/inst/tests/testthat/test_mllib_classification.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R index f3eaeb3..abf8bb2 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_classification.R +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -50,15 +50,17 @@ test_that("spark.svmLinear", { expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected) # Test model save and load - modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - coefs <- summary(model)$coefficients - coefs2 <- summary(model2)$coefficients - expect_equal(coefs, coefs2) - unlink(modelPath) + if (not_cran_or_windows_with_hadoop()) { +modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp") +write.ml(model, modelPath) +expect_error(write.ml(model, modelPath)) +write.ml(model, modelPath, overwrite = TRUE) +model2 <- read.ml(modelPath) +coefs <- summary(model)$coefficients +coefs2 <- summary(model2)$coefficients +expect_equal(coefs, coefs2) +unlink(modelPath) + } # Test prediction with numeric label label <- c(0.0, 0.0, 0.0, 1.0, 1.0) @@ -128,15 +130,17 @@ test_that("spark.logit", { expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1)) # Test model save and load - modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") - write.ml(model, modelPath) - 
expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - coefs <- summary(model)$coefficients - coefs2 <- summary(model2)$coefficients - expect_equal(coefs, coefs2) - unlink(modelPath) + if (not_cran_or_windows_with_hadoop()) { +modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") +write.ml(model, modelPath) +expect_error(write.ml(model, modelPath)) +write.ml(model, modelPath, overwrite = TRUE) +model2 <- read.ml(modelPath) +coefs <- summa
spark git commit: [SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows
Repository: spark Updated Branches: refs/heads/branch-2.2 ddc199eef -> 5e9541a4d [SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows ## What changes were proposed in this pull request? This change skips tests that use the Hadoop libraries while running on CRAN check with Windows as the operating system. This is to handle cases where the Hadoop winutils binaries are missing on the target system. The skipped tests consist of 1. Tests that save, load a model in MLlib 2. Tests that save, load CSV, JSON and Parquet files in SQL 3. Hive tests ## How was this patch tested? Tested by running on a local windows VM with HADOOP_HOME unset. Also testing with https://win-builder.r-project.org Author: Shivaram Venkataraman Closes #17966 from shivaram/sparkr-windows-cran. (cherry picked from commit d06610f992ccf199928c0a71699fbf4c01705c31) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e9541a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e9541a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e9541a4 Branch: refs/heads/branch-2.2 Commit: 5e9541a4d4896f7a84755265fa1955e256cda449 Parents: ddc199e Author: Shivaram Venkataraman Authored: Mon May 22 23:04:22 2017 -0700 Committer: Felix Cheung Committed: Mon May 22 23:04:34 2017 -0700 -- R/pkg/R/utils.R | 16 + .../tests/testthat/test_mllib_classification.R | 90 +++-- .../inst/tests/testthat/test_mllib_clustering.R | 112 +++--- R/pkg/inst/tests/testthat/test_mllib_fpm.R | 16 +- .../tests/testthat/test_mllib_recommendation.R | 42 +- .../inst/tests/testthat/test_mllib_regression.R | 42 +- R/pkg/inst/tests/testthat/test_mllib_tree.R | 112 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 396 ++- 8 files changed, 445 insertions(+), 381 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5e9541a4/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index fbc89e9..b19556a 100644 --- 
a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -899,3 +899,19 @@ basenameSansExtFromUrl <- function(url) { isAtomicLengthOne <- function(x) { is.atomic(x) && length(x) == 1 } + +is_cran <- function() { + !identical(Sys.getenv("NOT_CRAN"), "true") +} + +is_windows <- function() { + .Platform$OS.type == "windows" +} + +hadoop_home_set <- function() { + !identical(Sys.getenv("HADOOP_HOME"), "") +} + +not_cran_or_windows_with_hadoop <- function() { + !is_cran() && (!is_windows() || hadoop_home_set()) +} http://git-wip-us.apache.org/repos/asf/spark/blob/5e9541a4/R/pkg/inst/tests/testthat/test_mllib_classification.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R index f3eaeb3..abf8bb2 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_classification.R +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -50,15 +50,17 @@ test_that("spark.svmLinear", { expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected) # Test model save and load - modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - coefs <- summary(model)$coefficients - coefs2 <- summary(model2)$coefficients - expect_equal(coefs, coefs2) - unlink(modelPath) + if (not_cran_or_windows_with_hadoop()) { +modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp") +write.ml(model, modelPath) +expect_error(write.ml(model, modelPath)) +write.ml(model, modelPath, overwrite = TRUE) +model2 <- read.ml(modelPath) +coefs <- summary(model)$coefficients +coefs2 <- summary(model2)$coefficients +expect_equal(coefs, coefs2) +unlink(modelPath) + } # Test prediction with numeric label label <- c(0.0, 0.0, 0.0, 1.0, 1.0) @@ -128,15 +130,17 @@ test_that("spark.logit", { expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1)) # Test model save and 
load - modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") - write.ml(model, modelPath) - expect_error(write.ml(model, modelPath)) - write.ml(model, modelPath, overwrite = TRUE) - model2 <- read.ml(modelPath) - coefs <- summary(model)$coefficients - coefs2 <- summary(model2)$coefficients - expect_equal(coefs, coefs2) - unlink(modelPath) + if (not_cran_or_windows_with_hadoop()) { +modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") +write.ml(model, modelPath) +expect_error(write.ml(model, modelPat
spark git commit: [SPARK-20849][DOC][SPARKR] Document R DecisionTree
Repository: spark Updated Branches: refs/heads/master 8ce0d8ffb -> a97c49704 [SPARK-20849][DOC][SPARKR] Document R DecisionTree ## What changes were proposed in this pull request? 1, add an example for sparkr `decisionTree` 2, document it in user guide ## How was this patch tested? local submit Author: Zheng RuiFeng Closes #18067 from zhengruifeng/dt_example. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a97c4970 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a97c4970 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a97c4970 Branch: refs/heads/master Commit: a97c497045e9102b8eefcd0a0567ee08e61c838c Parents: 8ce0d8f Author: Zheng RuiFeng Authored: Thu May 25 23:00:50 2017 -0700 Committer: Felix Cheung Committed: Thu May 25 23:00:50 2017 -0700 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 50 --- docs/ml-classification-regression.md | 7 docs/sparkr.md| 1 + examples/src/main/r/ml/decisionTree.R | 65 ++ 4 files changed, 108 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a97c4970/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 13a3991..2301a64 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -503,6 +503,8 @@ SparkR supports the following machine learning models and algorithms. Tree - Classification and Regression +* Decision Tree + * Gradient-Boosted Trees (GBT) * Random Forest @@ -776,16 +778,32 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) head(predict(isoregModel, newDF)) ``` + Decision Tree + +`spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. 
+ +We use the `Titanic` dataset to train a decision tree and make predictions: + +```{r} +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2) +summary(dtModel) +predictions <- predict(dtModel, df) +``` + Gradient-Boosted Trees `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. -We use the `longley` dataset to train a gradient-boosted tree and make predictions: +We use the `Titanic` dataset to train a gradient-boosted tree and make predictions: -```{r, warning=FALSE} -df <- createDataFrame(longley) -gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +```{r} +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2) summary(gbtModel) predictions <- predict(gbtModel, df) ``` @@ -795,11 +813,12 @@ predictions <- predict(gbtModel, df) `spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. 
-In the following example, we use the `longley` dataset to train a random forest and make predictions: +In the following example, we use the `Titanic` dataset to train a random forest and make predictions: -```{r, warning=FALSE} -df <- createDataFrame(longley) -rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +```{r} +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2) summary(rfModel) predictions <- predict(rfModel, df) ``` @@ -965,17 +984,18 @@ Given a `SparkDataFrame`, the test compares continuous data in a given column `t specified by parameter `nullHypothesis`. Users can call `summary` to get a summary of the test results. -In the following example, we test whether the `longley` dataset's `Armed_Forces` column +In the following example, we test whether the `Titanic` dataset's `Freq` column follows a normal distribution. We set the parameters of the normal distribution using the mean and standard deviation of the sample. -```{r, warning=FALSE} -df <- createDat
[1/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
Repository: spark Updated Branches: refs/heads/master 5301a19a0 -> dc4c35183 http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_streaming.R -- diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R new file mode 100644 index 000..b20b431 --- /dev/null +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -0,0 +1,167 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("Structured Streaming") + +# Tests for Structured Streaming functions in SparkR + +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + +jsonSubDir <- file.path("sparkr-test", "json", "") +if (.Platform$OS.type == "windows") { + # file.path removes the empty separator on Windows, adds it back + jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep) +} +jsonDir <- file.path(tempdir(), jsonSubDir) +dir.create(jsonDir, recursive = TRUE) + +mockLines <- c("{\"name\":\"Michael\"}", + "{\"name\":\"Andy\", \"age\":30}", + "{\"name\":\"Justin\", \"age\":19}") +jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp") +writeLines(mockLines, jsonPath) + +mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", + "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", + "{\"name\":\"David\",\"age\":60,\"height\":null}") +jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp") + +schema <- structType(structField("name", "string"), + structField("age", "integer"), + structField("count", "double")) + +test_that("read.stream, write.stream, awaitTermination, stopQuery", { + skip_on_cran() + + df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete") + + expect_false(awaitTermination(q, 5 * 1000)) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3) + + writeLines(mockLinesNa, jsonPathNa) + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6) + + stopQuery(q) + expect_true(awaitTermination(q, 1)) + expect_error(awaitTermination(q), NA) +}) + +test_that("print from explain, lastProgress, status, isActive", { + skip_on_cran() + + df <- read.stream("json", path = jsonDir, schema = 
schema) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete") + + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==") + expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q) + expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q) + + expect_equal(queryName(q), "people2") + expect_true(isActive(q)) + + stopQuery(q) +}) + +test_that("Stream other format", { + skip_on_cran() + + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + df <- read.df(jsonPath, "json", schema) + write.df(df, parquetPath, "parquet", "overwrite") + + df <- read.stream(path = parquetPath, schema = schema) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete") + + expect_false(awaitTermination(q, 5 * 1000)) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3) + + expect_equal(queryName(q), "people3") + expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet", + capture.output(lastProgress(q) + expect_true(isActive(q)) + + stopQuery(q) + expect_true(awaitTermination(q, 1)) + expect_false(isActive(q)) + + unlink(parquetPath) +}) + +test_that("Non-streaming DataFrame", { + skip_on_cran() + + c <- as.DataFrame(cars) + exp
[2/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_sparkSQL.R -- diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R new file mode 100644 index 000..c790d02 --- /dev/null +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -0,0 +1,3474 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("SparkSQL functions") + +# Utility function for easily checking the values of a StructField +checkStructField <- function(actual, expectedName, expectedType, expectedNullable) { + expect_equal(class(actual), "structField") + expect_equal(actual$name(), expectedName) + expect_equal(actual$dataType.toString(), expectedType) + expect_equal(actual$nullable(), expectedNullable) +} + +markUtf8 <- function(s) { + Encoding(s) <- "UTF-8" + s +} + +setHiveContext <- function(sc) { + if (exists(".testHiveSession", envir = .sparkREnv)) { +hiveSession <- get(".testHiveSession", envir = .sparkREnv) + } else { +# initialize once and reuse +ssc <- callJMethod(sc, "sc") +hiveCtx <- tryCatch({ + newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE) +}, +error = function(err) { + skip("Hive is not build with SparkSQL, skipped") +}) +hiveSession <- callJMethod(hiveCtx, "sparkSession") + } + previousSession <- get(".sparkRsession", envir = .sparkREnv) + assign(".sparkRsession", hiveSession, envir = .sparkREnv) + assign(".prevSparkRsession", previousSession, envir = .sparkREnv) + hiveSession +} + +unsetHiveContext <- function() { + previousSession <- get(".prevSparkRsession", envir = .sparkREnv) + assign(".sparkRsession", previousSession, envir = .sparkREnv) + remove(".prevSparkRsession", envir = .sparkREnv) +} + +# Tests for SparkSQL functions in SparkR + +filesBefore <- list.files(path = sparkRDir, all.files = TRUE) +sparkSession <- if (not_cran_or_windows_with_hadoop()) { +sparkR.session(master = sparkRTestMaster) + } else { +sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + } +sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) + +mockLines <- c("{\"name\":\"Michael\"}", + "{\"name\":\"Andy\", \"age\":30}", + "{\"name\":\"Justin\", \"age\":19}") +jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +parquetPath <- tempfile(pattern = "sparkr-test", fileext = 
".parquet") +orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc") +writeLines(mockLines, jsonPath) + +# For test nafunctions, like dropna(), fillna(),... +mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", + "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", + "{\"name\":\"David\",\"age\":60,\"height\":null}", + "{\"name\":\"Amy\",\"age\":null,\"height\":null}", + "{\"name\":null,\"age\":null,\"height\":null}") +jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +writeLines(mockLinesNa, jsonPathNa) + +# For test complex types in DataFrame +mockLinesComplexType <- + c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}", +"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}", +"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}") +complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +writeLines(mockLinesComplexType, complexTypeJsonPath) + +# For test map type and struct type in DataFrame +mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", + "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", + "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") +mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +writeLines(mockLinesMapType, mapTypeJsonPath) + +if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") +} + +test_that("calling sparkRSQL.init returns existing SQL context", { + skip_on_cran() + + sqlContext <- suppressWarnings(sparkRSQL.init(sc)) + expect_equal(suppressWarni
[3/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_mllib_fpm.R -- diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R b/R/pkg/tests/fulltests/test_mllib_fpm.R new file mode 100644 index 000..4e10ca1 --- /dev/null +++ b/R/pkg/tests/fulltests/test_mllib_fpm.R @@ -0,0 +1,85 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("MLlib frequent pattern mining") + +# Tests for MLlib frequent pattern mining algorithms in SparkR +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + +test_that("spark.fpGrowth", { + data <- selectExpr(createDataFrame(data.frame(items = c( +"1,2", +"1,2", +"1,2,3", +"1,3" + ))), "split(items, ',') as items") + + model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1) + + itemsets <- collect(spark.freqItemsets(model)) + + expected_itemsets <- data.frame( +items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), list("1"))), +freq = c(2, 2, 3, 3, 4) + ) + + expect_equivalent(expected_itemsets, itemsets) + + expected_association_rules <- data.frame( +antecedent = I(list(list("2"), list("3"))), +consequent = I(list(list("1"), list("1"))), +confidence = c(1, 1) + ) + + expect_equivalent(expected_association_rules, collect(spark.associationRules(model))) + + new_data <- selectExpr(createDataFrame(data.frame(items = c( +"1,2", +"1,3", +"2,3" + ))), "split(items, ',') as items") + + expected_predictions <- data.frame( +items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))), +prediction = I(list(list(), list(), list("1"))) + ) + + expect_equivalent(expected_predictions, collect(predict(model, new_data))) + + if (not_cran_or_windows_with_hadoop()) { +modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp") +write.ml(model, modelPath, overwrite = TRUE) +loaded_model <- read.ml(modelPath) + +expect_equivalent( + itemsets, + collect(spark.freqItemsets(loaded_model))) + +unlink(modelPath) + } + + model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8) + expect_equal( +count(spark.freqItemsets(model_without_numpartitions)), +count(spark.freqItemsets(model)) + ) + +}) + +sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_mllib_recommendation.R -- diff --git 
a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R new file mode 100644 index 000..cc8064f --- /dev/null +++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R @@ -0,0 +1,67 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib recommendation algorithms") + +# Tests for MLlib recommendation algorithms in SparkR +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + +test_that("spark.als", { + data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), + list(2, 1, 1.0), list(2, 2, 5.0)) + df <- createDataFrame(data, c("user", "item", "score")) + model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", + rank = 10, maxIter = 5, seed = 0, regParam = 0.1) + stats <-
[3/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_fpm.R -- diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R b/R/pkg/tests/fulltests/test_mllib_fpm.R new file mode 100644 index 000..4e10ca1 --- /dev/null +++ b/R/pkg/tests/fulltests/test_mllib_fpm.R @@ -0,0 +1,85 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("MLlib frequent pattern mining") + +# Tests for MLlib frequent pattern mining algorithms in SparkR +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + +test_that("spark.fpGrowth", { + data <- selectExpr(createDataFrame(data.frame(items = c( +"1,2", +"1,2", +"1,2,3", +"1,3" + ))), "split(items, ',') as items") + + model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1) + + itemsets <- collect(spark.freqItemsets(model)) + + expected_itemsets <- data.frame( +items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), list("1"))), +freq = c(2, 2, 3, 3, 4) + ) + + expect_equivalent(expected_itemsets, itemsets) + + expected_association_rules <- data.frame( +antecedent = I(list(list("2"), list("3"))), +consequent = I(list(list("1"), list("1"))), +confidence = c(1, 1) + ) + + expect_equivalent(expected_association_rules, collect(spark.associationRules(model))) + + new_data <- selectExpr(createDataFrame(data.frame(items = c( +"1,2", +"1,3", +"2,3" + ))), "split(items, ',') as items") + + expected_predictions <- data.frame( +items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))), +prediction = I(list(list(), list(), list("1"))) + ) + + expect_equivalent(expected_predictions, collect(predict(model, new_data))) + + if (not_cran_or_windows_with_hadoop()) { +modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp") +write.ml(model, modelPath, overwrite = TRUE) +loaded_model <- read.ml(modelPath) + +expect_equivalent( + itemsets, + collect(spark.freqItemsets(loaded_model))) + +unlink(modelPath) + } + + model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8) + expect_equal( +count(spark.freqItemsets(model_without_numpartitions)), +count(spark.freqItemsets(model)) + ) + +}) + +sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_recommendation.R -- diff --git 
a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R new file mode 100644 index 000..cc8064f --- /dev/null +++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R @@ -0,0 +1,67 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib recommendation algorithms") + +# Tests for MLlib recommendation algorithms in SparkR +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + +test_that("spark.als", { + data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), + list(2, 1, 1.0), list(2, 2, 5.0)) + df <- createDataFrame(data, c("user", "item", "score")) + model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", + rank = 10, maxIter = 5, seed = 0, regParam = 0.1) + stats <-
[6/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/inst/tests/testthat/test_mllib_regression.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R deleted file mode 100644 index b05fdd3..000 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ /dev/null @@ -1,480 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -library(testthat) - -context("MLlib regression algorithms, except for tree-based algorithms") - -# Tests for MLlib regression algorithms in SparkR -sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - -test_that("formula of spark.glm", { - skip_on_cran() - - training <- suppressWarnings(createDataFrame(iris)) - # directly calling the spark API - # dot minus and intercept vs native glm - model <- spark.glm(training, Sepal_Width ~ . - Species + 0) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ . 
- Species + 0, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # feature interaction vs native glm - model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # glm should work with long formula - training <- suppressWarnings(createDataFrame(iris)) - training$LongLongLongLongLongName <- training$Sepal_Width - training$VeryLongLongLongLonLongName <- training$Sepal_Length - training$AnotherLongLongLongLongName <- training$Species - model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName + -AnotherLongLongLongLongName) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) -}) - -test_that("spark.glm and predict", { - training <- suppressWarnings(createDataFrame(iris)) - # gaussian family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # poisson family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = poisson(link = identity)) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species, -data = iris, family = poisson(link = identity)), iris)) - expect_true(all(abs(rVals - vals) < 1e-6), 
rVals - vals) - - # Gamma family - x <- runif(100, -1, 1) - y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10) - df <- as.DataFrame(as.data.frame(list(x = x, y = y))) - model <- glm(y ~ x, family = Gamma, df) - out <- capture.output(print(summary(model))) - expect_true(any(grepl("Dispersion parameter for gamma family", out))) - - # tweedie family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = "tweedie", var.power = 1.2, link.power = 0.0) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - - # manual calculation of the R predicted values to avoid dependence on statmod - #' library(statmod) - #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - #' family = tweedie(var.power = 1.2, link.power = 0.0)) - #' print(coef(rModel)) - - rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) - rVals <- exp(as.numeric(model.
[4/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/inst/tests/testthat/test_streaming.R -- diff --git a/R/pkg/inst/tests/testthat/test_streaming.R b/R/pkg/inst/tests/testthat/test_streaming.R deleted file mode 100644 index b20b431..000 --- a/R/pkg/inst/tests/testthat/test_streaming.R +++ /dev/null @@ -1,167 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -library(testthat) - -context("Structured Streaming") - -# Tests for Structured Streaming functions in SparkR - -sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - -jsonSubDir <- file.path("sparkr-test", "json", "") -if (.Platform$OS.type == "windows") { - # file.path removes the empty separator on Windows, adds it back - jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep) -} -jsonDir <- file.path(tempdir(), jsonSubDir) -dir.create(jsonDir, recursive = TRUE) - -mockLines <- c("{\"name\":\"Michael\"}", - "{\"name\":\"Andy\", \"age\":30}", - "{\"name\":\"Justin\", \"age\":19}") -jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp") -writeLines(mockLines, jsonPath) - -mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", - "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", - "{\"name\":\"David\",\"age\":60,\"height\":null}") -jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp") - -schema <- structType(structField("name", "string"), - structField("age", "integer"), - structField("count", "double")) - -test_that("read.stream, write.stream, awaitTermination, stopQuery", { - skip_on_cran() - - df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) - expect_true(isStreaming(df)) - counts <- count(group_by(df, "name")) - q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete") - - expect_false(awaitTermination(q, 5 * 1000)) - callJMethod(q@ssq, "processAllAvailable") - expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3) - - writeLines(mockLinesNa, jsonPathNa) - awaitTermination(q, 5 * 1000) - callJMethod(q@ssq, "processAllAvailable") - expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6) - - stopQuery(q) - expect_true(awaitTermination(q, 1)) - expect_error(awaitTermination(q), NA) -}) - -test_that("print from explain, lastProgress, status, isActive", { - skip_on_cran() - - df <- read.stream("json", path = jsonDir, schema = 
schema) - expect_true(isStreaming(df)) - counts <- count(group_by(df, "name")) - q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete") - - awaitTermination(q, 5 * 1000) - callJMethod(q@ssq, "processAllAvailable") - - expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==") - expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q) - expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q) - - expect_equal(queryName(q), "people2") - expect_true(isActive(q)) - - stopQuery(q) -}) - -test_that("Stream other format", { - skip_on_cran() - - parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") - df <- read.df(jsonPath, "json", schema) - write.df(df, parquetPath, "parquet", "overwrite") - - df <- read.stream(path = parquetPath, schema = schema) - expect_true(isStreaming(df)) - counts <- count(group_by(df, "name")) - q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete") - - expect_false(awaitTermination(q, 5 * 1000)) - callJMethod(q@ssq, "processAllAvailable") - expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3) - - expect_equal(queryName(q), "people3") - expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet", - capture.output(lastProgress(q) - expect_true(isActive(q)) - - stopQuery(q) - expect_true(awaitTermination(q, 1)) - expect_false(isActive(q)) - - unlink(parquetPath) -}) - -test_that("Non-streaming DataFrame", { - skip_on_cran() - - c <- as.DataFrame(cars) - expect_false(isStreaming(c)) - - expect_error(write.stream(c, "
[4/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_streaming.R -- diff --git a/R/pkg/inst/tests/testthat/test_streaming.R b/R/pkg/inst/tests/testthat/test_streaming.R deleted file mode 100644 index b20b431..000 --- a/R/pkg/inst/tests/testthat/test_streaming.R +++ /dev/null @@ -1,167 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -library(testthat) - -context("Structured Streaming") - -# Tests for Structured Streaming functions in SparkR - -sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - -jsonSubDir <- file.path("sparkr-test", "json", "") -if (.Platform$OS.type == "windows") { - # file.path removes the empty separator on Windows, adds it back - jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep) -} -jsonDir <- file.path(tempdir(), jsonSubDir) -dir.create(jsonDir, recursive = TRUE) - -mockLines <- c("{\"name\":\"Michael\"}", - "{\"name\":\"Andy\", \"age\":30}", - "{\"name\":\"Justin\", \"age\":19}") -jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp") -writeLines(mockLines, jsonPath) - -mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", - "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", - "{\"name\":\"David\",\"age\":60,\"height\":null}") -jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp") - -schema <- structType(structField("name", "string"), - structField("age", "integer"), - structField("count", "double")) - -test_that("read.stream, write.stream, awaitTermination, stopQuery", { - skip_on_cran() - - df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) - expect_true(isStreaming(df)) - counts <- count(group_by(df, "name")) - q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete") - - expect_false(awaitTermination(q, 5 * 1000)) - callJMethod(q@ssq, "processAllAvailable") - expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3) - - writeLines(mockLinesNa, jsonPathNa) - awaitTermination(q, 5 * 1000) - callJMethod(q@ssq, "processAllAvailable") - expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6) - - stopQuery(q) - expect_true(awaitTermination(q, 1)) - expect_error(awaitTermination(q), NA) -}) - -test_that("print from explain, lastProgress, status, isActive", { - skip_on_cran() - - df <- read.stream("json", path = jsonDir, schema = 
schema) - expect_true(isStreaming(df)) - counts <- count(group_by(df, "name")) - q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete") - - awaitTermination(q, 5 * 1000) - callJMethod(q@ssq, "processAllAvailable") - - expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==") - expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q) - expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q) - - expect_equal(queryName(q), "people2") - expect_true(isActive(q)) - - stopQuery(q) -}) - -test_that("Stream other format", { - skip_on_cran() - - parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") - df <- read.df(jsonPath, "json", schema) - write.df(df, parquetPath, "parquet", "overwrite") - - df <- read.stream(path = parquetPath, schema = schema) - expect_true(isStreaming(df)) - counts <- count(group_by(df, "name")) - q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete") - - expect_false(awaitTermination(q, 5 * 1000)) - callJMethod(q@ssq, "processAllAvailable") - expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3) - - expect_equal(queryName(q), "people3") - expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet", - capture.output(lastProgress(q) - expect_true(isActive(q)) - - stopQuery(q) - expect_true(awaitTermination(q, 1)) - expect_false(isActive(q)) - - unlink(parquetPath) -}) - -test_that("Non-streaming DataFrame", { - skip_on_cran() - - c <- as.DataFrame(cars) - expect_false(isStreaming(c)) - - expect_error(write.stream(c, "
[6/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_regression.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R deleted file mode 100644 index b05fdd3..000 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ /dev/null @@ -1,480 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -library(testthat) - -context("MLlib regression algorithms, except for tree-based algorithms") - -# Tests for MLlib regression algorithms in SparkR -sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - -test_that("formula of spark.glm", { - skip_on_cran() - - training <- suppressWarnings(createDataFrame(iris)) - # directly calling the spark API - # dot minus and intercept vs native glm - model <- spark.glm(training, Sepal_Width ~ . - Species + 0) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ . 
- Species + 0, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # feature interaction vs native glm - model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # glm should work with long formula - training <- suppressWarnings(createDataFrame(iris)) - training$LongLongLongLongLongName <- training$Sepal_Width - training$VeryLongLongLongLonLongName <- training$Sepal_Length - training$AnotherLongLongLongLongName <- training$Species - model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName + -AnotherLongLongLongLongName) - vals <- collect(select(predict(model, training), "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) -}) - -test_that("spark.glm and predict", { - training <- suppressWarnings(createDataFrame(iris)) - # gaussian family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # poisson family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = poisson(link = identity)) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species, -data = iris, family = poisson(link = identity)), iris)) - expect_true(all(abs(rVals - vals) < 1e-6), 
rVals - vals) - - # Gamma family - x <- runif(100, -1, 1) - y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10) - df <- as.DataFrame(as.data.frame(list(x = x, y = y))) - model <- glm(y ~ x, family = Gamma, df) - out <- capture.output(print(summary(model))) - expect_true(any(grepl("Dispersion parameter for gamma family", out))) - - # tweedie family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = "tweedie", var.power = 1.2, link.power = 0.0) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - - # manual calculation of the R predicted values to avoid dependence on statmod - #' library(statmod) - #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - #' family = tweedie(var.power = 1.2, link.power = 0.0)) - #' print(coef(rModel)) - - rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) - rVals <- exp(as.numeric(model.
[5/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R deleted file mode 100644 index d2d5191..000 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ /dev/null @@ -1,3198 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -library(testthat) - -context("SparkSQL functions") - -# Utility function for easily checking the values of a StructField -checkStructField <- function(actual, expectedName, expectedType, expectedNullable) { - expect_equal(class(actual), "structField") - expect_equal(actual$name(), expectedName) - expect_equal(actual$dataType.toString(), expectedType) - expect_equal(actual$nullable(), expectedNullable) -} - -markUtf8 <- function(s) { - Encoding(s) <- "UTF-8" - s -} - -setHiveContext <- function(sc) { - if (exists(".testHiveSession", envir = .sparkREnv)) { -hiveSession <- get(".testHiveSession", envir = .sparkREnv) - } else { -# initialize once and reuse -ssc <- callJMethod(sc, "sc") -hiveCtx <- tryCatch({ - newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE) -}, -error = function(err) { - skip("Hive is not build with SparkSQL, skipped") -}) -hiveSession <- callJMethod(hiveCtx, "sparkSession") - } - previousSession <- get(".sparkRsession", envir = .sparkREnv) - assign(".sparkRsession", hiveSession, envir = .sparkREnv) - assign(".prevSparkRsession", previousSession, envir = .sparkREnv) - hiveSession -} - -unsetHiveContext <- function() { - previousSession <- get(".prevSparkRsession", envir = .sparkREnv) - assign(".sparkRsession", previousSession, envir = .sparkREnv) - remove(".prevSparkRsession", envir = .sparkREnv) -} - -# Tests for SparkSQL functions in SparkR - -filesBefore <- list.files(path = sparkRDir, all.files = TRUE) -sparkSession <- if (not_cran_or_windows_with_hadoop()) { -sparkR.session(master = sparkRTestMaster) - } else { -sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - } -sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) - -mockLines <- c("{\"name\":\"Michael\"}", - "{\"name\":\"Andy\", \"age\":30}", - "{\"name\":\"Justin\", \"age\":19}") -jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -parquetPath <- tempfile(pattern = "sparkr-test", fileext = 
".parquet") -orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc") -writeLines(mockLines, jsonPath) - -# For test nafunctions, like dropna(), fillna(),... -mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", - "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", - "{\"name\":\"David\",\"age\":60,\"height\":null}", - "{\"name\":\"Amy\",\"age\":null,\"height\":null}", - "{\"name\":null,\"age\":null,\"height\":null}") -jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -writeLines(mockLinesNa, jsonPathNa) - -# For test complex types in DataFrame -mockLinesComplexType <- - c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}", -"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}", -"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}") -complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -writeLines(mockLinesComplexType, complexTypeJsonPath) - -# For test map type and struct type in DataFrame -mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", - "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", - "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") -mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -writeLines(mockLinesMapType, mapTypeJsonPath) - -if (.Platform$OS.type == "windows") { - Sys.setenv(TZ = "GMT") -} - -test_that("calling sparkRSQL.init returns existing SQL context", { - skip_on_cran() - - sqlContext <- suppressWarnings(sparkRSQL.init(sc)) - expect
[1/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
Repository: spark Updated Branches: refs/heads/branch-2.2 815a0820b -> 0b0be47e7 http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_streaming.R -- diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R new file mode 100644 index 000..b20b431 --- /dev/null +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -0,0 +1,167 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("Structured Streaming") + +# Tests for Structured Streaming functions in SparkR + +sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + +jsonSubDir <- file.path("sparkr-test", "json", "") +if (.Platform$OS.type == "windows") { + # file.path removes the empty separator on Windows, adds it back + jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep) +} +jsonDir <- file.path(tempdir(), jsonSubDir) +dir.create(jsonDir, recursive = TRUE) + +mockLines <- c("{\"name\":\"Michael\"}", + "{\"name\":\"Andy\", \"age\":30}", + "{\"name\":\"Justin\", \"age\":19}") +jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp") +writeLines(mockLines, jsonPath) + +mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", + "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", + "{\"name\":\"David\",\"age\":60,\"height\":null}") +jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp") + +schema <- structType(structField("name", "string"), + structField("age", "integer"), + structField("count", "double")) + +test_that("read.stream, write.stream, awaitTermination, stopQuery", { + skip_on_cran() + + df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete") + + expect_false(awaitTermination(q, 5 * 1000)) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3) + + writeLines(mockLinesNa, jsonPathNa) + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6) + + stopQuery(q) + expect_true(awaitTermination(q, 1)) + expect_error(awaitTermination(q), NA) +}) + +test_that("print from explain, lastProgress, status, isActive", { + skip_on_cran() + + df <- read.stream("json", path = jsonDir, schema = 
schema) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete") + + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==") + expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q) + expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q) + + expect_equal(queryName(q), "people2") + expect_true(isActive(q)) + + stopQuery(q) +}) + +test_that("Stream other format", { + skip_on_cran() + + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + df <- read.df(jsonPath, "json", schema) + write.df(df, parquetPath, "parquet", "overwrite") + + df <- read.stream(path = parquetPath, schema = schema) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete") + + expect_false(awaitTermination(q, 5 * 1000)) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3) + + expect_equal(queryName(q), "people3") + expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet", + capture.output(lastProgress(q) + expect_true(isActive(q)) + + stopQuery(q) + expect_true(awaitTermination(q, 1)) + expect_false(isActive(q)) + + unlink(parquetPath) +}) + +test_that("Non-streaming DataFrame", { + skip_on_cran() + + c <- as.DataFrame(cars) +
[5/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R deleted file mode 100644 index c790d02..000 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ /dev/null @@ -1,3474 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -library(testthat) - -context("SparkSQL functions") - -# Utility function for easily checking the values of a StructField -checkStructField <- function(actual, expectedName, expectedType, expectedNullable) { - expect_equal(class(actual), "structField") - expect_equal(actual$name(), expectedName) - expect_equal(actual$dataType.toString(), expectedType) - expect_equal(actual$nullable(), expectedNullable) -} - -markUtf8 <- function(s) { - Encoding(s) <- "UTF-8" - s -} - -setHiveContext <- function(sc) { - if (exists(".testHiveSession", envir = .sparkREnv)) { -hiveSession <- get(".testHiveSession", envir = .sparkREnv) - } else { -# initialize once and reuse -ssc <- callJMethod(sc, "sc") -hiveCtx <- tryCatch({ - newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE) -}, -error = function(err) { - skip("Hive is not build with SparkSQL, skipped") -}) -hiveSession <- callJMethod(hiveCtx, "sparkSession") - } - previousSession <- get(".sparkRsession", envir = .sparkREnv) - assign(".sparkRsession", hiveSession, envir = .sparkREnv) - assign(".prevSparkRsession", previousSession, envir = .sparkREnv) - hiveSession -} - -unsetHiveContext <- function() { - previousSession <- get(".prevSparkRsession", envir = .sparkREnv) - assign(".sparkRsession", previousSession, envir = .sparkREnv) - remove(".prevSparkRsession", envir = .sparkREnv) -} - -# Tests for SparkSQL functions in SparkR - -filesBefore <- list.files(path = sparkRDir, all.files = TRUE) -sparkSession <- if (not_cran_or_windows_with_hadoop()) { -sparkR.session(master = sparkRTestMaster) - } else { -sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) - } -sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) - -mockLines <- c("{\"name\":\"Michael\"}", - "{\"name\":\"Andy\", \"age\":30}", - "{\"name\":\"Justin\", \"age\":19}") -jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -parquetPath <- tempfile(pattern = "sparkr-test", fileext = 
".parquet") -orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc") -writeLines(mockLines, jsonPath) - -# For test nafunctions, like dropna(), fillna(),... -mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", - "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", - "{\"name\":\"David\",\"age\":60,\"height\":null}", - "{\"name\":\"Amy\",\"age\":null,\"height\":null}", - "{\"name\":null,\"age\":null,\"height\":null}") -jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -writeLines(mockLinesNa, jsonPathNa) - -# For test complex types in DataFrame -mockLinesComplexType <- - c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}", -"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}", -"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}") -complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -writeLines(mockLinesComplexType, complexTypeJsonPath) - -# For test map type and struct type in DataFrame -mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", - "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", - "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") -mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") -writeLines(mockLinesMapType, mapTypeJsonPath) - -if (.Platform$OS.type == "windows") { - Sys.setenv(TZ = "GMT") -} - -test_that("calling sparkRSQL.init returns existing SQL context", { - skip_on_cran() - - sqlContext <- suppressWarnings(sparkRSQL.init(sc)) - expect
[7/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
[SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN ## What changes were proposed in this pull request? Move all existing tests to non-installed directory so that it will never run by installing SparkR package For a follow-up PR: - remove all skip_on_cran() calls in tests - clean up test timer - improve or change basic tests that do run on CRAN (if anyone has suggestion) It looks like `R CMD build pkg` will still put pkg\tests (ie. the full tests) into the source package but `R CMD INSTALL` on such source package does not install these tests (and so `R CMD check` does not run them) ## How was this patch tested? - [x] unit tests, Jenkins - [x] AppVeyor - [x] make a source package, install it, `R CMD check` it - verify the full tests are not installed or run Author: Felix Cheung Closes #18264 from felixcheung/rtestset. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc4c3518 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc4c3518 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc4c3518 Branch: refs/heads/master Commit: dc4c351837879dab26ad8fb471dc51c06832a9e4 Parents: 5301a19 Author: Felix Cheung Authored: Sun Jun 11 00:00:33 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 11 00:00:33 2017 -0700 -- R/pkg/inst/tests/testthat/jarTest.R | 32 - R/pkg/inst/tests/testthat/packageInAJarTest.R | 30 - R/pkg/inst/tests/testthat/test_Serde.R | 85 - R/pkg/inst/tests/testthat/test_Windows.R| 32 - R/pkg/inst/tests/testthat/test_basic.R | 90 + R/pkg/inst/tests/testthat/test_binaryFile.R | 100 - .../inst/tests/testthat/test_binary_function.R | 110 - R/pkg/inst/tests/testthat/test_broadcast.R | 55 - R/pkg/inst/tests/testthat/test_client.R | 51 - R/pkg/inst/tests/testthat/test_context.R| 226 -- R/pkg/inst/tests/testthat/test_includePackage.R | 64 - R/pkg/inst/tests/testthat/test_jvm_api.R| 36 - .../tests/testthat/test_mllib_classification.R | 396 -- 
.../inst/tests/testthat/test_mllib_clustering.R | 328 -- R/pkg/inst/tests/testthat/test_mllib_fpm.R | 85 - .../tests/testthat/test_mllib_recommendation.R | 67 - .../inst/tests/testthat/test_mllib_regression.R | 480 --- R/pkg/inst/tests/testthat/test_mllib_stat.R | 53 - R/pkg/inst/tests/testthat/test_mllib_tree.R | 320 -- .../tests/testthat/test_parallelize_collect.R | 120 - R/pkg/inst/tests/testthat/test_rdd.R| 906 - R/pkg/inst/tests/testthat/test_shuffle.R| 248 -- R/pkg/inst/tests/testthat/test_sparkR.R | 48 - R/pkg/inst/tests/testthat/test_sparkSQL.R | 3474 -- R/pkg/inst/tests/testthat/test_streaming.R | 167 - R/pkg/inst/tests/testthat/test_take.R | 71 - R/pkg/inst/tests/testthat/test_textFile.R | 182 - R/pkg/inst/tests/testthat/test_utils.R | 248 -- R/pkg/tests/fulltests/jarTest.R | 32 + R/pkg/tests/fulltests/packageInAJarTest.R | 30 + R/pkg/tests/fulltests/test_Serde.R | 85 + R/pkg/tests/fulltests/test_Windows.R| 32 + R/pkg/tests/fulltests/test_binaryFile.R | 100 + R/pkg/tests/fulltests/test_binary_function.R| 110 + R/pkg/tests/fulltests/test_broadcast.R | 55 + R/pkg/tests/fulltests/test_client.R | 51 + R/pkg/tests/fulltests/test_context.R| 226 ++ R/pkg/tests/fulltests/test_includePackage.R | 64 + R/pkg/tests/fulltests/test_jvm_api.R| 36 + .../tests/fulltests/test_mllib_classification.R | 396 ++ R/pkg/tests/fulltests/test_mllib_clustering.R | 328 ++ R/pkg/tests/fulltests/test_mllib_fpm.R | 85 + .../tests/fulltests/test_mllib_recommendation.R | 67 + R/pkg/tests/fulltests/test_mllib_regression.R | 480 +++ R/pkg/tests/fulltests/test_mllib_stat.R | 53 + R/pkg/tests/fulltests/test_mllib_tree.R | 320 ++ .../tests/fulltests/test_parallelize_collect.R | 120 + R/pkg/tests/fulltests/test_rdd.R| 906 + R/pkg/tests/fulltests/test_shuffle.R| 248 ++ R/pkg/tests/fulltests/test_sparkR.R | 48 + R/pkg/tests/fulltests/test_sparkSQL.R | 3474 ++ R/pkg/tests/fulltests/test_streaming.R | 167 + R/pkg/tests/fulltests/test_take.R | 71 + R/pkg/tests/fulltests/test_textFile.R | 182 + 
R/pkg/tests/fulltests/test_utils.R | 248 ++ R/pkg/tests/run-all.R |8 + 56 files changed, 8112 insertions(+), 8014 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/jarTest.R
[2/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
http://git-wip-us.apache.org/repos/asf/spark/blob/0b0be47e/R/pkg/tests/fulltests/test_sparkSQL.R -- diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R new file mode 100644 index 000..d2d5191 --- /dev/null +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -0,0 +1,3198 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +library(testthat) + +context("SparkSQL functions") + +# Utility function for easily checking the values of a StructField +checkStructField <- function(actual, expectedName, expectedType, expectedNullable) { + expect_equal(class(actual), "structField") + expect_equal(actual$name(), expectedName) + expect_equal(actual$dataType.toString(), expectedType) + expect_equal(actual$nullable(), expectedNullable) +} + +markUtf8 <- function(s) { + Encoding(s) <- "UTF-8" + s +} + +setHiveContext <- function(sc) { + if (exists(".testHiveSession", envir = .sparkREnv)) { +hiveSession <- get(".testHiveSession", envir = .sparkREnv) + } else { +# initialize once and reuse +ssc <- callJMethod(sc, "sc") +hiveCtx <- tryCatch({ + newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE) +}, +error = function(err) { + skip("Hive is not build with SparkSQL, skipped") +}) +hiveSession <- callJMethod(hiveCtx, "sparkSession") + } + previousSession <- get(".sparkRsession", envir = .sparkREnv) + assign(".sparkRsession", hiveSession, envir = .sparkREnv) + assign(".prevSparkRsession", previousSession, envir = .sparkREnv) + hiveSession +} + +unsetHiveContext <- function() { + previousSession <- get(".prevSparkRsession", envir = .sparkREnv) + assign(".sparkRsession", previousSession, envir = .sparkREnv) + remove(".prevSparkRsession", envir = .sparkREnv) +} + +# Tests for SparkSQL functions in SparkR + +filesBefore <- list.files(path = sparkRDir, all.files = TRUE) +sparkSession <- if (not_cran_or_windows_with_hadoop()) { +sparkR.session(master = sparkRTestMaster) + } else { +sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + } +sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) + +mockLines <- c("{\"name\":\"Michael\"}", + "{\"name\":\"Andy\", \"age\":30}", + "{\"name\":\"Justin\", \"age\":19}") +jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +parquetPath <- tempfile(pattern = "sparkr-test", fileext = 
".parquet") +orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc") +writeLines(mockLines, jsonPath) + +# For test nafunctions, like dropna(), fillna(),... +mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", + "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}", + "{\"name\":\"David\",\"age\":60,\"height\":null}", + "{\"name\":\"Amy\",\"age\":null,\"height\":null}", + "{\"name\":null,\"age\":null,\"height\":null}") +jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +writeLines(mockLinesNa, jsonPathNa) + +# For test complex types in DataFrame +mockLinesComplexType <- + c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}", +"{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}", +"{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}") +complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +writeLines(mockLinesComplexType, complexTypeJsonPath) + +# For test map type and struct type in DataFrame +mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", + "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", + "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") +mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +writeLines(mockLinesMapType, mapTypeJsonPath) + +if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") +} + +test_that("calling sparkRSQL.init returns existing SQL context", { + skip_on_cran() + + sqlContext <- suppressWarnings(sparkRSQL.init(sc)) + expect_equal(suppressWarni
[7/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
[SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN ## What changes were proposed in this pull request? Move all existing tests to non-installed directory so that it will never run by installing SparkR package For a follow-up PR: - remove all skip_on_cran() calls in tests - clean up test timer - improve or change basic tests that do run on CRAN (if anyone has suggestion) It looks like `R CMD build pkg` will still put pkg\tests (ie. the full tests) into the source package but `R CMD INSTALL` on such source package does not install these tests (and so `R CMD check` does not run them) ## How was this patch tested? - [x] unit tests, Jenkins - [x] AppVeyor - [x] make a source package, install it, `R CMD check` it - verify the full tests are not installed or run Author: Felix Cheung Closes #18264 from felixcheung/rtestset. (cherry picked from commit dc4c351837879dab26ad8fb471dc51c06832a9e4) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b0be47e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b0be47e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b0be47e Branch: refs/heads/branch-2.2 Commit: 0b0be47e7b742d96810c60b19a9aa920242e5224 Parents: 815a082 Author: Felix Cheung Authored: Sun Jun 11 00:00:33 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 11 00:00:45 2017 -0700 -- R/pkg/inst/tests/testthat/jarTest.R | 32 - R/pkg/inst/tests/testthat/packageInAJarTest.R | 30 - R/pkg/inst/tests/testthat/test_Serde.R | 85 - R/pkg/inst/tests/testthat/test_Windows.R| 32 - R/pkg/inst/tests/testthat/test_basic.R | 90 + R/pkg/inst/tests/testthat/test_binaryFile.R | 100 - .../inst/tests/testthat/test_binary_function.R | 110 - R/pkg/inst/tests/testthat/test_broadcast.R | 55 - R/pkg/inst/tests/testthat/test_client.R | 51 - R/pkg/inst/tests/testthat/test_context.R| 226 -- R/pkg/inst/tests/testthat/test_includePackage.R | 64 - 
R/pkg/inst/tests/testthat/test_jvm_api.R| 36 - .../tests/testthat/test_mllib_classification.R | 396 --- .../inst/tests/testthat/test_mllib_clustering.R | 328 -- R/pkg/inst/tests/testthat/test_mllib_fpm.R | 85 - .../tests/testthat/test_mllib_recommendation.R | 67 - .../inst/tests/testthat/test_mllib_regression.R | 480 --- R/pkg/inst/tests/testthat/test_mllib_stat.R | 53 - R/pkg/inst/tests/testthat/test_mllib_tree.R | 226 -- .../tests/testthat/test_parallelize_collect.R | 120 - R/pkg/inst/tests/testthat/test_rdd.R| 906 - R/pkg/inst/tests/testthat/test_shuffle.R| 248 -- R/pkg/inst/tests/testthat/test_sparkR.R | 48 - R/pkg/inst/tests/testthat/test_sparkSQL.R | 3198 -- R/pkg/inst/tests/testthat/test_streaming.R | 167 - R/pkg/inst/tests/testthat/test_take.R | 71 - R/pkg/inst/tests/testthat/test_textFile.R | 182 - R/pkg/inst/tests/testthat/test_utils.R | 247 -- R/pkg/tests/fulltests/jarTest.R | 32 + R/pkg/tests/fulltests/packageInAJarTest.R | 30 + R/pkg/tests/fulltests/test_Serde.R | 85 + R/pkg/tests/fulltests/test_Windows.R| 32 + R/pkg/tests/fulltests/test_binaryFile.R | 100 + R/pkg/tests/fulltests/test_binary_function.R| 110 + R/pkg/tests/fulltests/test_broadcast.R | 55 + R/pkg/tests/fulltests/test_client.R | 51 + R/pkg/tests/fulltests/test_context.R| 226 ++ R/pkg/tests/fulltests/test_includePackage.R | 64 + R/pkg/tests/fulltests/test_jvm_api.R| 36 + .../tests/fulltests/test_mllib_classification.R | 396 +++ R/pkg/tests/fulltests/test_mllib_clustering.R | 328 ++ R/pkg/tests/fulltests/test_mllib_fpm.R | 85 + .../tests/fulltests/test_mllib_recommendation.R | 67 + R/pkg/tests/fulltests/test_mllib_regression.R | 480 +++ R/pkg/tests/fulltests/test_mllib_stat.R | 53 + R/pkg/tests/fulltests/test_mllib_tree.R | 226 ++ .../tests/fulltests/test_parallelize_collect.R | 120 + R/pkg/tests/fulltests/test_rdd.R| 906 + R/pkg/tests/fulltests/test_shuffle.R| 248 ++ R/pkg/tests/fulltests/test_sparkR.R | 48 + R/pkg/tests/fulltests/test_sparkSQL.R | 3198 ++ 
R/pkg/tests/fulltests/test_streaming.R | 167 + R/pkg/tests/fulltests/test_take.R | 71 + R/pkg/tests/fulltests/test_textFile.R | 182 + R/pkg/tests/fulltests/test_utils.R | 247 ++ R/pkg/tests/run-all.R |8 + 56 files changed, 7741 insertions(+), 7643 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark
spark git commit: [SPARK-20877][SPARKR][FOLLOWUP] clean up after test move
Repository: spark Updated Branches: refs/heads/master 823f1eef5 -> 9f4ff9552 [SPARK-20877][SPARKR][FOLLOWUP] clean up after test move ## What changes were proposed in this pull request? clean up after big test move ## How was this patch tested? unit tests, jenkins Author: Felix Cheung Closes #18267 from felixcheung/rtestset2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f4ff955 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f4ff955 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f4ff955 Branch: refs/heads/master Commit: 9f4ff9552470fb97ca38bb56bbf43be49a9a316c Parents: 823f1ee Author: Felix Cheung Authored: Sun Jun 11 03:00:44 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 11 03:00:44 2017 -0700 -- R/pkg/.Rbuildignore | 1 + R/pkg/R/install.R | 2 +- R/pkg/R/utils.R | 8 +- R/pkg/tests/fulltests/test_Serde.R | 6 -- R/pkg/tests/fulltests/test_Windows.R| 7 +- R/pkg/tests/fulltests/test_binaryFile.R | 8 -- R/pkg/tests/fulltests/test_binary_function.R| 6 -- R/pkg/tests/fulltests/test_broadcast.R | 4 - R/pkg/tests/fulltests/test_client.R | 8 -- R/pkg/tests/fulltests/test_context.R| 16 --- R/pkg/tests/fulltests/test_includePackage.R | 4 - .../tests/fulltests/test_mllib_classification.R | 12 +-- R/pkg/tests/fulltests/test_mllib_clustering.R | 14 +-- R/pkg/tests/fulltests/test_mllib_fpm.R | 2 +- .../tests/fulltests/test_mllib_recommendation.R | 2 +- R/pkg/tests/fulltests/test_mllib_regression.R | 16 +-- R/pkg/tests/fulltests/test_mllib_tree.R | 22 ++-- .../tests/fulltests/test_parallelize_collect.R | 8 -- R/pkg/tests/fulltests/test_rdd.R| 102 --- R/pkg/tests/fulltests/test_shuffle.R| 24 - R/pkg/tests/fulltests/test_sparkR.R | 2 - R/pkg/tests/fulltests/test_sparkSQL.R | 92 ++--- R/pkg/tests/fulltests/test_streaming.R | 14 +-- R/pkg/tests/fulltests/test_take.R | 2 - R/pkg/tests/fulltests/test_textFile.R | 18 R/pkg/tests/fulltests/test_utils.R | 9 -- R/pkg/tests/run-all.R | 2 - 
27 files changed, 35 insertions(+), 376 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/.Rbuildignore -- diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore index f12f8c2..18b2db6 100644 --- a/R/pkg/.Rbuildignore +++ b/R/pkg/.Rbuildignore @@ -6,3 +6,4 @@ ^README\.Rmd$ ^src-native$ ^html$ +^tests/fulltests/* http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 4ca7aa6..ec931be 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -267,7 +267,7 @@ hadoopVersionName <- function(hadoopVersion) { # The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and # adapt to Spark context sparkCachePath <- function() { - if (.Platform$OS.type == "windows") { + if (is_windows()) { winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA) if (is.na(winAppPath)) { stop(paste("%LOCALAPPDATA% not found.", http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index ea45e39..91483a4 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -908,10 +908,6 @@ isAtomicLengthOne <- function(x) { is.atomic(x) && length(x) == 1 } -is_cran <- function() { - !identical(Sys.getenv("NOT_CRAN"), "true") -} - is_windows <- function() { .Platform$OS.type == "windows" } @@ -920,6 +916,6 @@ hadoop_home_set <- function() { !identical(Sys.getenv("HADOOP_HOME"), "") } -not_cran_or_windows_with_hadoop <- function() { - !is_cran() && (!is_windows() || hadoop_home_set()) +windows_with_hadoop <- function() { + !is_windows() || hadoop_home_set() } http://git-wip-us.apache.org/repos/asf/spark/blob/9f4ff955/R/pkg/tests/fulltests/test_Serde.R -- diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index 6e160fa..6bbd201 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -20,8 +20,6 @@ context("SerDe func
spark git commit: [SPARK-20877][SPARKR][FOLLOWUP] clean up after test move
Repository: spark Updated Branches: refs/heads/branch-2.2 0b0be47e7 -> 26003de55 [SPARK-20877][SPARKR][FOLLOWUP] clean up after test move clean up after big test move unit tests, jenkins Author: Felix Cheung Closes #18267 from felixcheung/rtestset2. (cherry picked from commit 9f4ff9552470fb97ca38bb56bbf43be49a9a316c) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26003de5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26003de5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26003de5 Branch: refs/heads/branch-2.2 Commit: 26003de55ba13695649b0d874563a76d71cda88d Parents: 0b0be47 Author: Felix Cheung Authored: Sun Jun 11 03:00:44 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 11 03:13:56 2017 -0700 -- R/pkg/.Rbuildignore | 1 + R/pkg/R/install.R | 2 +- R/pkg/R/utils.R | 8 +- R/pkg/tests/fulltests/test_Serde.R | 6 -- R/pkg/tests/fulltests/test_Windows.R| 7 +- R/pkg/tests/fulltests/test_binaryFile.R | 8 -- R/pkg/tests/fulltests/test_binary_function.R| 6 -- R/pkg/tests/fulltests/test_broadcast.R | 4 - R/pkg/tests/fulltests/test_client.R | 8 -- R/pkg/tests/fulltests/test_context.R| 16 --- R/pkg/tests/fulltests/test_includePackage.R | 4 - .../tests/fulltests/test_mllib_classification.R | 12 +-- R/pkg/tests/fulltests/test_mllib_clustering.R | 14 +-- R/pkg/tests/fulltests/test_mllib_fpm.R | 2 +- .../tests/fulltests/test_mllib_recommendation.R | 2 +- R/pkg/tests/fulltests/test_mllib_regression.R | 16 +-- R/pkg/tests/fulltests/test_mllib_tree.R | 14 ++- .../tests/fulltests/test_parallelize_collect.R | 8 -- R/pkg/tests/fulltests/test_rdd.R| 102 --- R/pkg/tests/fulltests/test_shuffle.R| 24 - R/pkg/tests/fulltests/test_sparkR.R | 2 - R/pkg/tests/fulltests/test_sparkSQL.R | 92 ++--- R/pkg/tests/fulltests/test_streaming.R | 14 +-- R/pkg/tests/fulltests/test_take.R | 2 - R/pkg/tests/fulltests/test_textFile.R | 18 R/pkg/tests/fulltests/test_utils.R | 8 -- 
R/pkg/tests/run-all.R | 2 - 27 files changed, 32 insertions(+), 370 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/.Rbuildignore -- diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore index f12f8c2..18b2db6 100644 --- a/R/pkg/.Rbuildignore +++ b/R/pkg/.Rbuildignore @@ -6,3 +6,4 @@ ^README\.Rmd$ ^src-native$ ^html$ +^tests/fulltests/* http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 4ca7aa6..ec931be 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -267,7 +267,7 @@ hadoopVersionName <- function(hadoopVersion) { # The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and # adapt to Spark context sparkCachePath <- function() { - if (.Platform$OS.type == "windows") { + if (is_windows()) { winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA) if (is.na(winAppPath)) { stop(paste("%LOCALAPPDATA% not found.", http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index b19556a..7225da9 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -900,10 +900,6 @@ isAtomicLengthOne <- function(x) { is.atomic(x) && length(x) == 1 } -is_cran <- function() { - !identical(Sys.getenv("NOT_CRAN"), "true") -} - is_windows <- function() { .Platform$OS.type == "windows" } @@ -912,6 +908,6 @@ hadoop_home_set <- function() { !identical(Sys.getenv("HADOOP_HOME"), "") } -not_cran_or_windows_with_hadoop <- function() { - !is_cran() && (!is_windows() || hadoop_home_set()) +windows_with_hadoop <- function() { + !is_windows() || hadoop_home_set() } http://git-wip-us.apache.org/repos/asf/spark/blob/26003de5/R/pkg/tests/fulltests/test_Serde.R -- diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index 6e160fa..6bbd201 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -20,8
spark git commit: [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite
Repository: spark Updated Branches: refs/heads/master 2639c3ed0 -> 278ba7a2c [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite ## What changes were proposed in this pull request? Fix test file path. This is broken in #18264 and undetected since R-only changes don't build core and subsequent post-commit with the change built fine (again because it wasn't building core) actually appveyor builds everything but it's not running scala suites ... ## How was this patch tested? jenkins srowen gatorsmile Author: Felix Cheung Closes #18283 from felixcheung/rsubmitsuite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/278ba7a2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/278ba7a2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/278ba7a2 Branch: refs/heads/master Commit: 278ba7a2c62b2cbb7bcfe79ce10d35ab57bb1950 Parents: 2639c3e Author: Felix Cheung Authored: Mon Jun 12 22:08:49 2017 -0700 Committer: Felix Cheung Committed: Mon Jun 12 22:08:49 2017 -0700 -- .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/278ba7a2/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index de71999..b089357 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -505,8 +505,8 @@ class SparkSubmitSuite assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val main = MavenCoordinate("my.great.lib", "mylib", "0.1") val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) -val rScriptDir = - Seq(sparkHome, "R", "pkg", "inst", "tests", 
"packageInAJarTest.R").mkString(File.separator) +val rScriptDir = Seq( + sparkHome, "R", "pkg", "tests", "fulltests", "packageInAJarTest.R").mkString(File.separator) assert(new File(rScriptDir).exists) IvyTestUtils.withRepository(main, None, None, withR = true) { repo => val args = Seq( @@ -527,7 +527,7 @@ class SparkSubmitSuite // Check if the SparkR package is installed assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val rScriptDir = - Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", "jarTest.R").mkString(File.separator) + Seq(sparkHome, "R", "pkg", "tests", "fulltests", "jarTest.R").mkString(File.separator) assert(new File(rScriptDir).exists) // compile a small jar containing a class that will be called from R code. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite
Repository: spark Updated Branches: refs/heads/branch-2.2 48a843b56 -> dae1a9875 [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite ## What changes were proposed in this pull request? Fix test file path. This is broken in #18264 and undetected since R-only changes don't build core and subsequent post-commit with the change built fine (again because it wasn't building core) actually appveyor builds everything but it's not running scala suites ... ## How was this patch tested? jenkins srowen gatorsmile Author: Felix Cheung Closes #18283 from felixcheung/rsubmitsuite. (cherry picked from commit 278ba7a2c62b2cbb7bcfe79ce10d35ab57bb1950) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dae1a987 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dae1a987 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dae1a987 Branch: refs/heads/branch-2.2 Commit: dae1a98758d09dde97a8e7863100d2dd52389bf3 Parents: 48a843b Author: Felix Cheung Authored: Mon Jun 12 22:08:49 2017 -0700 Committer: Felix Cheung Committed: Mon Jun 12 22:09:05 2017 -0700 -- .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dae1a987/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 6e9721c..6fa3a09 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -485,8 +485,8 @@ class SparkSubmitSuite assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val main = MavenCoordinate("my.great.lib", "mylib", "0.1") val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) 
-val rScriptDir = - Seq(sparkHome, "R", "pkg", "inst", "tests", "packageInAJarTest.R").mkString(File.separator) +val rScriptDir = Seq( + sparkHome, "R", "pkg", "tests", "fulltests", "packageInAJarTest.R").mkString(File.separator) assert(new File(rScriptDir).exists) IvyTestUtils.withRepository(main, None, None, withR = true) { repo => val args = Seq( @@ -507,7 +507,7 @@ class SparkSubmitSuite // Check if the SparkR package is installed assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val rScriptDir = - Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", "jarTest.R").mkString(File.separator) + Seq(sparkHome, "R", "pkg", "tests", "fulltests", "jarTest.R").mkString(File.separator) assert(new File(rScriptDir).exists) // compile a small jar containing a class that will be called from R code. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20980][DOCS] update doc to reflect multiLine change
Repository: spark Updated Branches: refs/heads/branch-2.2 af4f89c98 -> b5504f6d3 [SPARK-20980][DOCS] update doc to reflect multiLine change ## What changes were proposed in this pull request? doc only change ## How was this patch tested? manually Author: Felix Cheung Closes #18312 from felixcheung/sqljsonwholefiledoc. (cherry picked from commit 1bf55e396c7b995a276df61d9a4eb8e60bcee334) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5504f6d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5504f6d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5504f6d Branch: refs/heads/branch-2.2 Commit: b5504f6d3fc375eecb131460c8b01e0be18f4e9b Parents: af4f89c Author: Felix Cheung Authored: Wed Jun 14 23:08:05 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 14 23:08:18 2017 -0700 -- docs/sql-programming-guide.md | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5504f6d/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 314ff6e..8e722ae 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -998,7 +998,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set the `wholeFile` option to `true`. +For a regular multi-line JSON file, set the `multiLine` option to `true`. {% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} @@ -1012,7 +1012,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. 
For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set the `wholeFile` option to `true`. +For a regular multi-line JSON file, set the `multiLine` option to `true`. {% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} @@ -1025,7 +1025,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set the `wholeFile` parameter to `True`. +For a regular multi-line JSON file, set the `multiLine` parameter to `True`. {% include_example json_dataset python/sql/datasource.py %} @@ -1039,7 +1039,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set a named parameter `wholeFile` to `TRUE`. +For a regular multi-line JSON file, set a named parameter `multiLine` to `TRUE`. {% include_example json_dataset r/RSparkSQLExample.R %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20980][DOCS] update doc to reflect multiLine change
Repository: spark Updated Branches: refs/heads/master b32b2123d -> 1bf55e396 [SPARK-20980][DOCS] update doc to reflect multiLine change ## What changes were proposed in this pull request? doc only change ## How was this patch tested? manually Author: Felix Cheung Closes #18312 from felixcheung/sqljsonwholefiledoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1bf55e39 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1bf55e39 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1bf55e39 Branch: refs/heads/master Commit: 1bf55e396c7b995a276df61d9a4eb8e60bcee334 Parents: b32b212 Author: Felix Cheung Authored: Wed Jun 14 23:08:05 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 14 23:08:05 2017 -0700 -- docs/sql-programming-guide.md | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1bf55e39/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 314ff6e..8e722ae 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -998,7 +998,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set the `wholeFile` option to `true`. +For a regular multi-line JSON file, set the `multiLine` option to `true`. {% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} @@ -1012,7 +1012,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). 
-For a regular multi-line JSON file, set the `wholeFile` option to `true`. +For a regular multi-line JSON file, set the `multiLine` option to `true`. {% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} @@ -1025,7 +1025,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set the `wholeFile` parameter to `True`. +For a regular multi-line JSON file, set the `multiLine` parameter to `True`. {% include_example json_dataset python/sql/datasource.py %} @@ -1039,7 +1039,7 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). -For a regular multi-line JSON file, set a named parameter `wholeFile` to `TRUE`. +For a regular multi-line JSON file, set a named parameter `multiLine` to `TRUE`. {% include_example json_dataset r/RSparkSQLExample.R %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-21128][R] Remove both "spark-warehouse" and "metastore_db" before listing files in R tests
Repository: spark Updated Branches: refs/heads/master 75a6d0585 -> 05f83c532 [SPARK-21128][R] Remove both "spark-warehouse" and "metastore_db" before listing files in R tests ## What changes were proposed in this pull request? This PR proposes to list the files in test _after_ removing both "spark-warehouse" and "metastore_db" so that the next run of R tests pass fine. This is sometimes a bit annoying. ## How was this patch tested? Manually running multiple times R tests via `./R/run-tests.sh`. **Before** Second run: ``` SparkSQL functions: Spark package found in SPARK_HOME: .../spark ... ... ... ... ... 1234... Failed - 1. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3384) length(list1) not equal to length(list2). 1/1 mismatches [1] 25 - 23 == 2 2. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3384) sort(list1, na.last = TRUE) not equal to sort(list2, na.last = TRUE). 10/25 mismatches x[16]: "metastore_db" y[16]: "pkg" x[17]: "pkg" y[17]: "R" x[18]: "R" y[18]: "README.md" x[19]: "README.md" y[19]: "run-tests.sh" x[20]: "run-tests.sh" y[20]: "SparkR_2.2.0.tar.gz" x[21]: "metastore_db" y[21]: "pkg" x[22]: "pkg" y[22]: "R" x[23]: "R" y[23]: "README.md" x[24]: "README.md" y[24]: "run-tests.sh" x[25]: "run-tests.sh" y[25]: "SparkR_2.2.0.tar.gz" 3. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3388) length(list1) not equal to length(list2). 1/1 mismatches [1] 25 - 23 == 2 4. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3388) sort(list1, na.last = TRUE) not equal to sort(list2, na.last = TRUE). 
10/25 mismatches x[16]: "metastore_db" y[16]: "pkg" x[17]: "pkg" y[17]: "R" x[18]: "R" y[18]: "README.md" x[19]: "README.md" y[19]: "run-tests.sh" x[20]: "run-tests.sh" y[20]: "SparkR_2.2.0.tar.gz" x[21]: "metastore_db" y[21]: "pkg" x[22]: "pkg" y[22]: "R" x[23]: "R" y[23]: "README.md" x[24]: "README.md" y[24]: "run-tests.sh" x[25]: "run-tests.sh" y[25]: "SparkR_2.2.0.tar.gz" DONE === ``` **After** Second run: ``` SparkSQL functions: Spark package found in SPARK_HOME: .../spark ... ... ... ... ... ... ``` Author: hyukjinkwon Closes #18335 from HyukjinKwon/SPARK-21128. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/05f83c53 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/05f83c53 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/05f83c53 Branch: refs/heads/master Commit: 05f83c532a96ead8dec1c046f985164b7f7205c0 Parents: 75a6d05 Author: hyukjinkwon Authored: Sun Jun 18 11:26:27 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 18 11:26:27 2017 -0700 -- R/pkg/tests/run-all.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --
spark git commit: [SPARK-20892][SPARKR] Add SQL trunc function to SparkR
Repository: spark Updated Branches: refs/heads/master 05f83c532 -> 110ce1f27 [SPARK-20892][SPARKR] Add SQL trunc function to SparkR ## What changes were proposed in this pull request? Add SQL trunc function ## How was this patch tested? standard test Author: actuaryzhang Closes #18291 from actuaryzhang/sparkRTrunc2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/110ce1f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/110ce1f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/110ce1f2 Branch: refs/heads/master Commit: 110ce1f27b66905afada6b5fd63c34fbf7602739 Parents: 05f83c5 Author: actuaryzhang Authored: Sun Jun 18 18:00:27 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 18 18:00:27 2017 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 29 + R/pkg/tests/fulltests/test_sparkSQL.R | 2 ++ 3 files changed, 32 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/110ce1f2/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 4e3fe00..229de4a 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -357,6 +357,7 @@ exportMethods("%<=>%", "to_utc_timestamp", "translate", "trim", + "trunc", "unbase64", "unhex", "unix_timestamp", http://git-wip-us.apache.org/repos/asf/spark/blob/110ce1f2/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 06a9019..7128c3b 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -4015,3 +4015,32 @@ setMethod("input_file_name", signature("missing"), jc <- callJStatic("org.apache.spark.sql.functions", "input_file_name") column(jc) }) + +#' trunc +#' +#' Returns date truncated to the unit specified by the format. +#' +#' @param x Column to compute on. +#' @param format string used for specify the truncation method. For example, "year", "yyyy", +#' "yy" for truncate by year, or "month", "mon", "mm" for truncate by month. 
+#' +#' @rdname trunc +#' @name trunc +#' @family date time functions +#' @aliases trunc,Column-method +#' @export +#' @examples +#' \dontrun{ +#' trunc(df$c, "year") +#' trunc(df$c, "yy") +#' trunc(df$c, "month") +#' trunc(df$c, "mon") +#' } +#' @note trunc since 2.3.0 +setMethod("trunc", + signature(x = "Column"), + function(x, format) { +jc <- callJStatic("org.apache.spark.sql.functions", "trunc", + x@jc, as.character(format)) +column(jc) + }) http://git-wip-us.apache.org/repos/asf/spark/blob/110ce1f2/R/pkg/tests/fulltests/test_sparkSQL.R -- diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index af52906..911b73b 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1382,6 +1382,8 @@ test_that("column functions", { c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy") c21 <- posexplode_outer(c) + explode_outer(c) c22 <- not(c) + c23 <- trunc(c, "year") + trunc(c, "yyyy") + trunc(c, "yy") + +trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm") # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for AGGREGATE column methods
Repository: spark Updated Branches: refs/heads/master 9b57cd8d5 -> 8965fe764 [SPARK-20889][SPARKR] Grouped documentation for AGGREGATE column methods ## What changes were proposed in this pull request? Grouped documentation for the aggregate functions for Column. Author: actuaryzhang Closes #18025 from actuaryzhang/sparkRDoc4. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8965fe76 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8965fe76 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8965fe76 Branch: refs/heads/master Commit: 8965fe764a4218d944938aa4828072f1ad9dbda7 Parents: 9b57cd8 Author: actuaryzhang Authored: Mon Jun 19 19:41:24 2017 -0700 Committer: Felix Cheung Committed: Mon Jun 19 19:41:24 2017 -0700 -- R/pkg/R/functions.R | 427 +++ R/pkg/R/generics.R | 56 --- R/pkg/R/stats.R | 22 +-- 3 files changed, 219 insertions(+), 286 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8965fe76/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 7128c3b..01ca8b8 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -18,6 +18,22 @@ #' @include generics.R column.R NULL +#' Aggregate functions for Column operations +#' +#' Aggregate functions defined for \code{Column}. +#' +#' @param x Column to compute on. +#' @param y,na.rm,use currently not used. +#' @param ... additional argument(s). For example, it could be used to pass additional Columns. +#' @name column_aggregate_functions +#' @rdname column_aggregate_functions +#' @family aggregate functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. 
@@ -85,17 +101,20 @@ setMethod("acos", column(jc) }) -#' Returns the approximate number of distinct items in a group +#' @details +#' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group. #' -#' Returns the approximate number of distinct items in a group. This is a column -#' aggregate function. -#' -#' @rdname approxCountDistinct -#' @name approxCountDistinct -#' @return the approximate number of distinct items in a group. +#' @rdname column_aggregate_functions #' @export -#' @aliases approxCountDistinct,Column-method -#' @examples \dontrun{approxCountDistinct(df$c)} +#' @aliases approxCountDistinct approxCountDistinct,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, approxCountDistinct(df$gear))) +#' head(select(df, approxCountDistinct(df$gear, 0.02))) +#' head(select(df, countDistinct(df$gear, df$cyl))) +#' head(select(df, n_distinct(df$gear))) +#' head(distinct(select(df, "gear")))} #' @note approxCountDistinct(Column) since 1.4.0 setMethod("approxCountDistinct", signature(x = "Column"), @@ -342,10 +361,13 @@ setMethod("column", #' #' @rdname corr #' @name corr -#' @family math functions +#' @family aggregate functions #' @export #' @aliases corr,Column-method -#' @examples \dontrun{corr(df$c, df$d)} +#' @examples +#' \dontrun{ +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' head(select(df, corr(df$mpg, df$hp)))} #' @note corr since 1.6.0 setMethod("corr", signature(x = "Column"), function(x, col2) { @@ -356,20 +378,22 @@ setMethod("corr", signature(x = "Column"), #' cov #' -#' Compute the sample covariance between two expressions. +#' Compute the covariance between two expressions. +#' +#' @details +#' \code{cov}: Compute the sample covariance between two expressions. 
#' #' @rdname cov #' @name cov -#' @family math functions +#' @family aggregate functions #' @export #' @aliases cov,characterOrColumn-method #' @examples #' \dontrun{ -#' cov(df$c, df$d) -#' cov("c", "d") -#' covar_samp(df$c, df$d) -#' covar_samp("c", "d") -#' } +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' head(select(df, cov(df$mpg, df$hp), cov("mpg", "hp"), +#' covar_samp(df$mpg, df$hp), covar_samp("mpg", "hp"), +#' covar_pop(df$mpg, df$hp), covar_pop("mpg", "hp")))} #' @note cov since 1.6.0 setMethod("cov", signature(x = "characterOrColumn"), function(x, col2) { @@ -377,6 +401,9 @@ setMethod("cov", signature(x = "characterOrColumn"), covar_samp(x, col2) }) +#' @details +#' \code{covar_sample}: Alias for \code{cov}. +#' #' @rdname cov #' #' @param col1 the first Column. @@ -395,23 +422,13 @@ setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterO column(jc)
spark git commit: [SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R
Repository: spark Updated Branches: refs/heads/master cad88f17e -> ad459cfb1 [SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R ## What changes were proposed in this pull request? Add `stringIndexerOrderType` to `spark.glm` and `spark.survreg` to support string encoding that is consistent with default R. ## How was this patch tested? new tests Author: actuaryzhang Closes #18140 from actuaryzhang/sparkRFormula. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad459cfb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad459cfb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad459cfb Branch: refs/heads/master Commit: ad459cfb1d169d8dd7b9e039ca135ba5cafcab83 Parents: cad88f1 Author: actuaryzhang Authored: Wed Jun 21 10:35:16 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 21 10:35:16 2017 -0700 -- R/pkg/R/mllib_regression.R | 52 +--- R/pkg/tests/fulltests/test_mllib_regression.R | 62 .../ml/r/AFTSurvivalRegressionWrapper.scala | 4 +- .../r/GeneralizedLinearRegressionWrapper.scala | 6 +- 4 files changed, 115 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad459cfb/R/pkg/R/mllib_regression.R -- diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index d59c890..9ecd887 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -70,6 +70,12 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' the relationship between the variance and mean of the distribution. Only #' applicable to the Tweedie family. #' @param link.power the index in the power link function. Only applicable to the Tweedie family. +#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to +#' decide the base level of a string feature as the last category after +#' ordering is dropped when encoding strings. 
Supported options are +#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". +#' The default value is "frequencyDesc". When the ordering is set to +#' "alphabetDesc", this drops the same category as R when encoding strings. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -79,7 +85,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @examples #' \dontrun{ #' sparkR.session() -#' t <- as.data.frame(Titanic) +#' t <- as.data.frame(Titanic, stringsAsFactors = FALSE) #' df <- createDataFrame(t) #' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian") #' summary(model) @@ -96,6 +102,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' savedModel <- read.ml(path) #' summary(savedModel) #' +#' # note that the default string encoding is different from R's glm +#' model2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = t) +#' summary(model2) +#' # use stringIndexerOrderType = "alphabetDesc" to force string encoding +#' # to be consistent with R +#' model3 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian", +#'stringIndexerOrderType = "alphabetDesc") +#' summary(model3) +#' #' # fit tweedie model #' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie", #'var.power = 1.2, link.power = 0) @@ -110,8 +125,11 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) { + regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power, + stringIndexerOrderType = c("frequencyDesc", "frequencyAsc", + "alphabetDesc", "alphabetAsc")) { +stringIndexerOrderType <- 
match.arg(stringIndexerOrderType) if (is.character(family)) { # Handle when family = "tweedie" if (tolower(family) == "tweedie") { @@ -145,7 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", "fit", f
spark git commit: [SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR
Repository: spark Updated Branches: refs/heads/master 215281d88 -> 53543374c [SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR ## What changes were proposed in this pull request? PR https://github.com/apache/spark/pull/17715 Added Constrained Logistic Regression for ML. We should add it to SparkR. ## How was this patch tested? Add new unit tests. Author: wangmiao1981 Closes #18128 from wangmiao1981/test. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53543374 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53543374 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53543374 Branch: refs/heads/master Commit: 53543374ce0cf0cec26de2382fbc85b7d5c7e9d6 Parents: 215281d Author: wangmiao1981 Authored: Wed Jun 21 20:42:45 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 21 20:42:45 2017 -0700 -- R/pkg/R/mllib_classification.R | 61 +++- .../tests/fulltests/test_mllib_classification.R | 40 + .../ml/classification/LogisticRegression.scala | 8 +-- .../spark/ml/r/LogisticRegressionWrapper.scala | 34 ++- 4 files changed, 135 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/53543374/R/pkg/R/mllib_classification.R -- diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index bdcc081..82d2428 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) { #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features #' or the number of partitions are large, this param could be adjusted to a larger size. #' This is an expert parameter. Default value should be good for most cases. +#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization. 
+#' The bound matrix must be compatible with the shape (1, number of features) for binomial +#' regression, or (number of classes, number of features) for multinomial regression. +#' It is a R matrix. +#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization. +#' The bound matrix must be compatible with the shape (1, number of features) for binomial +#' regression, or (number of classes, number of features) for multinomial regression. +#' It is a R matrix. +#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization. +#'The bounds vector size must be equal to 1 for binomial regression, or the number +#'of classes for multinomial regression. +#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization. +#'The bound vector size must be equal to 1 for binomial regression, or the number +#'of classes for multinomial regression. #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model. 
#' @rdname spark.logit @@ -241,8 +255,12 @@ function(object, path, overwrite = FALSE) { setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, tol = 1E-6, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) { + thresholds = 0.5, weightCol = NULL, aggregationDepth = 2, + lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL, + lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) { formula <- paste(deparse(formula), collapse = "") +row <- 0 +col <- 0 if (!is.null(weightCol) && weightCol == "") { weightCol <- NULL @@ -250,12 +268,51 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") weightCol <- as.character(weightCol) } +if (!is.null(lowerBoundsOnIntercepts)) { +lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts) +} + +if (!is.null(upperBoundsOnIntercepts)) { +upperBoundsOnIntercep
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for DATETIME column methods
Repository: spark Updated Branches: refs/heads/master 2dadea95c -> 19331b8e4 [SPARK-20889][SPARKR] Grouped documentation for DATETIME column methods ## What changes were proposed in this pull request? Grouped documentation for datetime column methods. Author: actuaryzhang Closes #18114 from actuaryzhang/sparkRDocDate. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/19331b8e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/19331b8e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/19331b8e Branch: refs/heads/master Commit: 19331b8e44ad910550f810b80e2a0caf0ef62cb3 Parents: 2dadea9 Author: actuaryzhang Authored: Thu Jun 22 10:16:51 2017 -0700 Committer: Felix Cheung Committed: Thu Jun 22 10:16:51 2017 -0700 -- R/pkg/R/functions.R | 532 --- R/pkg/R/generics.R | 69 -- 2 files changed, 273 insertions(+), 328 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/19331b8e/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 01ca8b8..3102858 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -34,6 +34,58 @@ NULL #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} NULL +#' Date time functions for Column operations +#' +#' Date time functions defined for \code{Column}. +#' +#' @param x Column to compute on. +#' @param format For \code{to_date} and \code{to_timestamp}, it is the string to use to parse +#' x Column to DateType or TimestampType. For \code{trunc}, it is the string used +#' for specifying the truncation method. For example, "year", "yyyy", "yy" for +#' truncate by year, or "month", "mon", "mm" for truncate by month. +#' @param ... additional argument(s). 
+#' @name column_datetime_functions +#' @rdname column_datetime_functions +#' @family data time functions +#' @examples +#' \dontrun{ +#' dts <- c("2005-01-02 18:47:22", +#' "2005-12-24 16:30:58", +#' "2005-10-28 07:30:05", +#' "2005-12-28 07:01:05", +#' "2006-01-24 00:01:10") +#' y <- c(2.0, 2.2, 3.4, 2.5, 1.8) +#' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))} +NULL + +#' Date time arithmetic functions for Column operations +#' +#' Date time arithmetic functions defined for \code{Column}. +#' +#' @param y Column to compute on. +#' @param x For class \code{Column}, it is the column used to perform arithmetic operations +#' with column \code{y}. For class \code{numeric}, it is the number of months or +#' days to be added to or subtracted from \code{y}. For class \code{character}, it is +#' \itemize{ +#' \item \code{date_format}: date format specification. +#' \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: time zone to use. +#' \item \code{next_day}: day of the week string. +#' } +#' +#' @name column_datetime_diff_functions +#' @rdname column_datetime_diff_functions +#' @family data time functions +#' @examples +#' \dontrun{ +#' dts <- c("2005-01-02 18:47:22", +#' "2005-12-24 16:30:58", +#' "2005-10-28 07:30:05", +#' "2005-12-28 07:01:05", +#' "2006-01-24 00:01:10") +#' y <- c(2.0, 2.2, 3.4, 2.5, 1.8) +#' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. @@ -546,18 +598,20 @@ setMethod("hash", column(jc) }) -#' dayofmonth -#' -#' Extracts the day of the month as an integer from a given date/timestamp/string. -#' -#' @param x Column to compute on. +#' @details +#' \code{dayofmonth}: Extracts the day of the month as an integer from a +#' given date/timestamp/string. 
#' -#' @rdname dayofmonth -#' @name dayofmonth -#' @family date time functions -#' @aliases dayofmonth,Column-method +#' @rdname column_datetime_functions +#' @aliases dayofmonth dayofmonth,Column-method #' @export -#' @examples \dontrun{dayofmonth(df$c)} +#' @examples +#' +#' \dontrun{ +#' head(select(df, df$time, year(df$time), quarter(df$time), month(df$time), +#'dayofmonth(df$time), dayofyear(df$time), weekofyear(df$time))) +#' head(agg(groupBy(df, year(df$time)), count(df$y), avg(df$y))) +#' head(agg(groupBy(df, month(df$time)), avg(df$y)))} #' @note dayofmonth since 1.5.0 setMethod("dayofmonth", signature(x = "Column"), @@ -566,18 +620,13 @@ setMethod("dayofmonth", column(jc) }) -#' dayofyear -#' -#' Extracts the day of the year as an integer from a given date/timestamp/string. -#' -#' @param x Column to compute on. +#' @details +#' \code{dayofyear}: Extracts the day of the year
spark git commit: [SPARK-21149][R] Add job description API for R
Repository: spark Updated Branches: refs/heads/master f3dea6079 -> 07479b3cf [SPARK-21149][R] Add job description API for R ## What changes were proposed in this pull request? Extend `setJobDescription` to SparkR API. ## How was this patch tested? It looks difficult to add a test. Manually tested as below: ```r df <- createDataFrame(iris) count(df) setJobDescription("This is an example job.") count(df) ``` prints ... ![2017-06-22 12 05 49](https://user-images.githubusercontent.com/6477701/27415670-2a649936-5743-11e7-8e95-312f1cd103af.png) Author: hyukjinkwon Closes #18382 from HyukjinKwon/SPARK-21149. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/07479b3c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/07479b3c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/07479b3c Branch: refs/heads/master Commit: 07479b3cfb7a617a18feca14e9e31c208c80630e Parents: f3dea60 Author: hyukjinkwon Authored: Fri Jun 23 09:59:24 2017 -0700 Committer: Felix Cheung Committed: Fri Jun 23 09:59:24 2017 -0700 -- R/pkg/NAMESPACE | 3 ++- R/pkg/R/sparkR.R | 17 + R/pkg/tests/fulltests/test_context.R | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/07479b3c/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 229de4a..b7fdae5 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -75,7 +75,8 @@ exportMethods("glm", # Job group lifecycle management methods export("setJobGroup", "clearJobGroup", - "cancelJobGroup") + "cancelJobGroup", + "setJobDescription") # Export Utility methods export("setLogLevel") http://git-wip-us.apache.org/repos/asf/spark/blob/07479b3c/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d0a12b7..f2d2620 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -535,6 +535,23 @@ cancelJobGroup <- function(sc, groupId) { } } +#' Set a human readable description of the current job. 
+#' +#' Set a description that is shown as a job description in UI. +#' +#' @param value The job description of the current job. +#' @rdname setJobDescription +#' @name setJobDescription +#' @examples +#'\dontrun{ +#' setJobDescription("This is an example job.") +#'} +#' @note setJobDescription since 2.3.0 +setJobDescription <- function(value) { + sc <- getSparkContext() + invisible(callJMethod(sc, "setJobDescription", value)) +} + sparkConfToSubmitOps <- new.env() sparkConfToSubmitOps[["spark.driver.memory"]] <- "--driver-memory" sparkConfToSubmitOps[["spark.driver.extraClassPath"]] <- "--driver-class-path" http://git-wip-us.apache.org/repos/asf/spark/blob/07479b3c/R/pkg/tests/fulltests/test_context.R -- diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index 710485d..77635c5 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -100,6 +100,7 @@ test_that("job group functions can be called", { setJobGroup("groupId", "job description", TRUE) cancelJobGroup("groupId") clearJobGroup() + setJobDescription("job description") suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE)) suppressWarnings(cancelJobGroup(sc, "groupId")) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak
Repository: spark Updated Branches: refs/heads/master 884347e1f -> 6b3d02285 [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak ## What changes were proposed in this pull request? `mcfork` in R looks opening a pipe ahead but the existing logic does not properly close it when it is executed hot. This leads to the failure of more forking due to the limit for number of files open. This hot execution looks particularly for `gapply`/`gapplyCollect`. For unknown reason, this happens more easily in CentOS and could be reproduced in Mac too. All the details are described in https://issues.apache.org/jira/browse/SPARK-21093 This PR proposes simply to terminate R's worker processes in the parent of R's daemon to prevent a leak. ## How was this patch tested? I ran the codes below on both CentOS and Mac with that configuration disabled/enabled. ```r df <- createDataFrame(list(list(1L, 1, "1", 0.1)), c("a", "b", "c", "d")) collect(gapply(df, "a", function(key, x) { x }, schema(df))) collect(gapply(df, "a", function(key, x) { x }, schema(df))) ... # 30 times ``` Also, now it passes R tests on CentOS as below: ``` SparkSQL functions: Spark package found in SPARK_HOME: .../spark .. .. .. .. .. ``` Author: hyukjinkwon Closes #18320 from HyukjinKwon/SPARK-21093. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b3d0228 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b3d0228 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b3d0228 Branch: refs/heads/master Commit: 6b3d02285ee0debc73cbcab01b10398a498fbeb8 Parents: 884347e Author: hyukjinkwon Authored: Sun Jun 25 11:05:57 2017 -0700 Committer: Felix Cheung Committed: Sun Jun 25 11:05:57 2017 -0700 -- R/pkg/inst/worker/daemon.R | 59 ++--- 1 file changed, 55 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6b3d0228/R/pkg/inst/worker/daemon.R -- diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 3a318b7..6e385b2 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -30,8 +30,55 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) +# Waits indefinitely for a socket connecion by default. +selectTimeout <- NULL + +# Exit code that children send to the parent to indicate they exited. +exitCode <- 1 + while (TRUE) { - ready <- socketSelect(list(inputCon)) + ready <- socketSelect(list(inputCon), timeout = selectTimeout) + + # Note that the children should be terminated in the parent. If each child terminates + # itself, it appears that the resource is not released properly, that causes an unexpected + # termination of this daemon due to, for example, running out of file descriptors + # (see SPARK-21093). Therefore, the current implementation tries to retrieve children + # that are exited (but not terminated) and then sends a kill signal to terminate them properly + # in the parent. + # + # There are two paths that it attempts to send a signal to terminate the children in the parent. + # + # 1. Every second if any socket connection is not available and if there are child workers + # running. + # 2. 
Right after a socket connection is available. + # + # In other words, the parent attempts to send the signal to the children every second if + # any worker is running or right before launching other worker children from the following + # new socket connection. + + # Only the process IDs of children sent data to the parent are returned below. The children + # send a custom exit code to the parent after being exited and the parent tries + # to terminate them only if they s
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for MATH column methods
Repository: spark Updated Branches: refs/heads/master 2d686a19e -> e793bf248 [SPARK-20889][SPARKR] Grouped documentation for MATH column methods ## What changes were proposed in this pull request? Grouped documentation for math column methods. Author: actuaryzhang Author: Wayne Zhang Closes #18371 from actuaryzhang/sparkRDocMath. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e793bf24 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e793bf24 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e793bf24 Branch: refs/heads/master Commit: e793bf248bc3c71b9664f26377bce06b0ffa97a7 Parents: 2d686a1 Author: actuaryzhang Authored: Tue Jun 27 23:15:45 2017 -0700 Committer: Felix Cheung Committed: Tue Jun 27 23:15:45 2017 -0700 -- R/pkg/R/functions.R | 619 --- R/pkg/R/generics.R | 48 ++-- 2 files changed, 241 insertions(+), 426 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e793bf24/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 3102858..23ccdf9 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -86,6 +86,31 @@ NULL #' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))} NULL +#' Math functions for Column operations +#' +#' Math functions defined for \code{Column}. +#' +#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and \code{shiftRightUnsigned}, +#' this is the number of bits to shift. +#' @param y Column to compute on. +#' @param ... additional argument(s). 
+#' @name column_math_functions +#' @rdname column_math_functions +#' @family math functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' tmp <- mutate(df, v1 = log(df$mpg), v2 = cbrt(df$disp), +#' v3 = bround(df$wt, 1), v4 = bin(df$cyl), +#' v5 = hex(df$wt), v6 = toDegrees(df$gear), +#' v7 = atan2(df$cyl, df$am), v8 = hypot(df$cyl, df$am), +#' v9 = pmod(df$hp, df$cyl), v10 = shiftLeft(df$disp, 1), +#' v11 = conv(df$hp, 10, 16), v12 = sign(df$vs - 0.5), +#' v13 = sqrt(df$disp), v14 = ceil(df$wt)) +#' head(tmp)} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. @@ -112,18 +137,12 @@ setMethod("lit", signature("ANY"), column(jc) }) -#' abs -#' -#' Computes the absolute value. -#' -#' @param x Column to compute on. +#' @details +#' \code{abs}: Computes the absolute value. #' -#' @rdname abs -#' @name abs -#' @family non-aggregate functions +#' @rdname column_math_functions #' @export -#' @examples \dontrun{abs(df$c)} -#' @aliases abs,Column-method +#' @aliases abs abs,Column-method #' @note abs since 1.5.0 setMethod("abs", signature(x = "Column"), @@ -132,19 +151,13 @@ setMethod("abs", column(jc) }) -#' acos -#' -#' Computes the cosine inverse of the given value; the returned angle is in the range -#' 0.0 through pi. -#' -#' @param x Column to compute on. +#' @details +#' \code{acos}: Computes the cosine inverse of the given value; the returned angle is in +#' the range 0.0 through pi. #' -#' @rdname acos -#' @name acos -#' @family math functions +#' @rdname column_math_functions #' @export -#' @examples \dontrun{acos(df$c)} -#' @aliases acos,Column-method +#' @aliases acos acos,Column-method #' @note acos since 1.5.0 setMethod("acos", signature(x = "Column"), @@ -196,19 +209,13 @@ setMethod("ascii", column(jc) }) -#' asin -#' -#' Computes the sine inverse of the given value; the returned angle is in the range -#' -pi/2 through pi/2. 
-#' -#' @param x Column to compute on. +#' @details +#' \code{asin}: Computes the sine inverse of the given value; the returned angle is in +#' the range -pi/2 through pi/2. #' -#' @rdname asin -#' @name asin -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases asin,Column-method -#' @examples \dontrun{asin(df$c)} +#' @aliases asin asin,Column-method #' @note asin since 1.5.0 setMethod("asin", signature(x = "Column"), @@ -217,18 +224,12 @@ setMethod("asin", column(jc) }) -#' atan -#' -#' Computes the tangent inverse of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{atan}: Computes the tangent inverse of the given value. #' -#' @rdname atan -#' @name atan -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases atan,Column-method -#' @examples \dontrun{atan(df$c)} +#' @aliases atan atan,
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for STRING column methods
Repository: spark Updated Branches: refs/heads/master b72b8521d -> 376d90d55 [SPARK-20889][SPARKR] Grouped documentation for STRING column methods ## What changes were proposed in this pull request? Grouped documentation for string column methods. Author: actuaryzhang Author: Wayne Zhang Closes #18366 from actuaryzhang/sparkRDocString. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/376d90d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/376d90d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/376d90d5 Branch: refs/heads/master Commit: 376d90d556fcd4fd84f70ee42a1323e1f48f829d Parents: b72b852 Author: actuaryzhang Authored: Wed Jun 28 19:31:54 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 28 19:31:54 2017 -0700 -- R/pkg/R/functions.R | 573 --- R/pkg/R/generics.R | 84 --- 2 files changed, 300 insertions(+), 357 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/376d90d5/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 23ccdf9..70ea620 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -111,6 +111,27 @@ NULL #' head(tmp)} NULL +#' String functions for Column operations +#' +#' String functions defined for \code{Column}. +#' +#' @param x Column to compute on except in the following methods: +#' \itemize{ +#' \item \code{instr}: \code{character}, the substring to check. See 'Details'. +#' \item \code{format_number}: \code{numeric}, the number of decimal place to +#' format to. See 'Details'. +#' } +#' @param y Column to compute on. +#' @param ... additional columns. +#' @name column_string_functions +#' @rdname column_string_functions +#' @family string functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. 
@@ -188,19 +209,17 @@ setMethod("approxCountDistinct", column(jc) }) -#' ascii -#' -#' Computes the numeric value of the first character of the string column, and returns the -#' result as a int column. -#' -#' @param x Column to compute on. +#' @details +#' \code{ascii}: Computes the numeric value of the first character of the string column, +#' and returns the result as an int column. #' -#' @rdname ascii -#' @name ascii -#' @family string functions +#' @rdname column_string_functions #' @export -#' @aliases ascii,Column-method -#' @examples \dontrun{\dontrun{ascii(df$c)}} +#' @aliases ascii ascii,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, ascii(df$Class), ascii(df$Sex)))} #' @note ascii since 1.5.0 setMethod("ascii", signature(x = "Column"), @@ -256,19 +275,22 @@ setMethod("avg", column(jc) }) -#' base64 -#' -#' Computes the BASE64 encoding of a binary column and returns it as a string column. -#' This is the reverse of unbase64. -#' -#' @param x Column to compute on. +#' @details +#' \code{base64}: Computes the BASE64 encoding of a binary column and returns it as +#' a string column. This is the reverse of unbase64. #' -#' @rdname base64 -#' @name base64 -#' @family string functions +#' @rdname column_string_functions #' @export -#' @aliases base64,Column-method -#' @examples \dontrun{base64(df$c)} +#' @aliases base64 base64,Column-method +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, s1 = encode(df$Class, "UTF-8")) +#' str(tmp) +#' tmp2 <- mutate(tmp, s2 = base64(tmp$s1), s3 = decode(tmp$s1, "UTF-8"), +#' s4 = soundex(tmp$Sex)) +#' head(tmp2) +#' head(select(tmp2, unbase64(tmp2$s2)))} #' @note base64 since 1.5.0 setMethod("base64", signature(x = "Column"), @@ -620,20 +642,16 @@ setMethod("dayofyear", column(jc) }) -#' decode -#' -#' Computes the first argument into a string from a binary using the provided character set -#' (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). 
+#' @details +#' \code{decode}: Computes the first argument into a string from a binary using the provided +#' character set. #' -#' @param x Column to compute on. -#' @param charset Character set to use +#' @param charset Character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", +#'"UTF-16LE", "UTF-16"). #' -#' @rdname decode -#' @name decode -#' @family string functions -#' @aliases decode,Column,character-method +#' @rdname column_string_functions +#' @aliases decode decode,Column,character-method #' @export -#' @examples \dontrun{decode(df$c, "UTF-8")}
spark git commit: [SPARK-21224][R] Specify a schema by using a DDL-formatted string when reading in R
Repository: spark Updated Branches: refs/heads/master 0c8444cf6 -> db44f5f3e [SPARK-21224][R] Specify a schema by using a DDL-formatted string when reading in R ## What changes were proposed in this pull request? This PR proposes to support a DDL-formetted string as schema as below: ```r mockLines <- c("{\"name\":\"Michael\"}", "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Justin\", \"age\":19}") jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLines, jsonPath) df <- read.df(jsonPath, "json", "name STRING, age DOUBLE") collect(df) ``` ## How was this patch tested? Tests added in `test_streaming.R` and `test_sparkSQL.R` and manual tests. Author: hyukjinkwon Closes #18431 from HyukjinKwon/r-ddl-schema. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db44f5f3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db44f5f3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db44f5f3 Branch: refs/heads/master Commit: db44f5f3e8b5bc28c33b154319539d51c05a089c Parents: 0c8444c Author: hyukjinkwon Authored: Wed Jun 28 19:36:00 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 28 19:36:00 2017 -0700 -- R/pkg/R/SQLContext.R| 38 +--- R/pkg/tests/fulltests/test_sparkSQL.R | 20 +-- R/pkg/tests/fulltests/test_streaming.R | 23 .../org/apache/spark/sql/api/r/SQLUtils.scala | 15 4 files changed, 67 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db44f5f3/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index e3528bc..3b7f71b 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -584,7 +584,7 @@ tableToDF <- function(tableName) { #' #' @param path The path of files to load #' @param source The name of external data source -#' @param schema The data schema defined in structType +#' @param schema The data schema defined in structType or a DDL-formatted string. 
#' @param na.strings Default string value for NA when source is "csv" #' @param ... additional external data source specific named properties. #' @return SparkDataFrame @@ -600,6 +600,8 @@ tableToDF <- function(tableName) { #' structField("info", "map")) #' df2 <- read.df(mapTypeJsonPath, "json", schema, multiLine = TRUE) #' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true") +#' stringSchema <- "name STRING, info MAP" +#' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE) #' } #' @name read.df #' @method read.df default @@ -623,14 +625,19 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.string if (source == "csv" && is.null(options[["nullValue"]])) { options[["nullValue"]] <- na.strings } + read <- callJMethod(sparkSession, "read") + read <- callJMethod(read, "format", source) if (!is.null(schema)) { -stopifnot(class(schema) == "structType") -sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, - source, schema$jobj, options) - } else { -sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, - source, options) +if (class(schema) == "structType") { + read <- callJMethod(read, "schema", schema$jobj) +} else if (is.character(schema)) { + read <- callJMethod(read, "schema", schema) +} else { + stop("schema should be structType or character.") +} } + read <- callJMethod(read, "options", options) + sdf <- handledCallJMethod(read, "load") dataFrame(sdf) } @@ -717,8 +724,8 @@ read.jdbc <- function(url, tableName, #' "spark.sql.sources.default" will be used. #' #' @param source The name of external data source -#' @param schema The data schema defined in structType, this is required for file-based streaming -#' data source +#' @param schema The data schema defined in structType or a DDL-formatted string, this is +#' required for file-based streaming data source #' @param ... 
additional external data source specific named options, for instance \code{path} for #'file-based streaming data source #' @return SparkDataFrame @@ -733,6 +740,8 @@ read.jdbc <- function(url, tableName, #' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp") #' #' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) +#' stringSchema <- "name STRING, info MAP" +#' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, maxFilesPerTrigger = 1) #' } #'
spark git commit: Revert "[SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak"
Repository: spark Updated Branches: refs/heads/master db44f5f3e -> fc92d25f2 Revert "[SPARK-21094][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak" This reverts commit 6b3d02285ee0debc73cbcab01b10398a498fbeb8. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc92d25f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc92d25f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc92d25f Branch: refs/heads/master Commit: fc92d25f2a27e81ef2d5031dcf856af1cc1d8c31 Parents: db44f5f Author: Felix Cheung Authored: Wed Jun 28 20:06:29 2017 -0700 Committer: Felix Cheung Committed: Wed Jun 28 20:06:29 2017 -0700 -- R/pkg/inst/worker/daemon.R | 59 +++-- 1 file changed, 4 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc92d25f/R/pkg/inst/worker/daemon.R -- diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 6e385b2..3a318b7 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -30,55 +30,8 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) -# Waits indefinitely for a socket connecion by default. -selectTimeout <- NULL - -# Exit code that children send to the parent to indicate they exited. -exitCode <- 1 - while (TRUE) { - ready <- socketSelect(list(inputCon), timeout = selectTimeout) - - # Note that the children should be terminated in the parent. If each child terminates - # itself, it appears that the resource is not released properly, that causes an unexpected - # termination of this daemon due to, for example, running out of file descriptors - # (see SPARK-21093). Therefore, the current implementation tries to retrieve children - # that are exited (but not terminated) and then sends a kill signal to terminate them properly - # in the parent. 
- # - # There are two paths that it attempts to send a signal to terminate the children in the parent. - # - # 1. Every second if any socket connection is not available and if there are child workers - # running. - # 2. Right after a socket connection is available. - # - # In other words, the parent attempts to send the signal to the children every second if - # any worker is running or right before launching other worker children from the following - # new socket connection. - - # Only the process IDs of children sent data to the parent are returned below. The children - # send a custom exit code to the parent after being exited and the parent tries - # to terminate them only if they sent the exit code. - children <- parallel:::selectChildren(timeout = 0) - - if (is.integer(children)) { -lapply(children, function(child) { - # This data should be raw bytes if any data was sent from this child. - # Otherwise, this returns the PID. - data <- parallel:::readChild(child) - if (is.raw(data)) { -# This checks if the data from this child is the exit code that indicates an exited child. -if (unserialize(data) == exitCode) { - # If so, we terminate this child. - tools::pskill(child, tools::SIGUSR1) -} - } -}) - } else if (is.null(children)) { -# If it is NULL, there are no children. Waits indefinitely for a socket connecion. -selectTimeout <- NULL - } - + ready <- socketSelect(list(inputCon)) if (ready) { port <- SparkR:::readInt(inputCon) # There is a small chance that it could be interrupted by signal, retry one time @@ -91,16 +44,12 @@ while (TRUE) { } p <- parallel:::mcfork() if (inherits(p, "masterProcess")) { - # Reach here because this is a child process. close(inputCon) Sys.setenv(SPARKR_WORKER_PORT = port) try(source(script)) - # Note that this mcexit does not fully terminate this child. So, this writes back - # a custom exit code so that the parent can read and terminate this child. 
- parallel:::mcexit(0L, send = exitCode) -} else { - # Forking succeeded and we need to check if they finished their jobs every second. - selectTimeout <- 1 + # Set SIGUSR1 so that child can exit + tools::pskill(Sys.getpid(), tools::SIGUSR1) + parallel:::mcexit(0L) } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for NONAGGREGATE column methods
Repository: spark Updated Branches: refs/heads/master 9f6b3e65c -> a2d562354 [SPARK-20889][SPARKR] Grouped documentation for NONAGGREGATE column methods ## What changes were proposed in this pull request? Grouped documentation for nonaggregate column methods. Author: actuaryzhang Author: Wayne Zhang Closes #18422 from actuaryzhang/sparkRDocNonAgg. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2d56235 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2d56235 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2d56235 Branch: refs/heads/master Commit: a2d5623548194f15989e7b68118d744673e33819 Parents: 9f6b3e6 Author: actuaryzhang Authored: Thu Jun 29 01:23:13 2017 -0700 Committer: Felix Cheung Committed: Thu Jun 29 01:23:13 2017 -0700 -- R/pkg/R/functions.R | 360 +++ R/pkg/R/generics.R | 55 +--- 2 files changed, 182 insertions(+), 233 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2d56235/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 70ea620..cb09e84 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -132,23 +132,39 @@ NULL #' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))} NULL -#' lit +#' Non-aggregate functions for Column operations #' -#' A new \linkS4class{Column} is created to represent the literal value. -#' If the parameter is a \linkS4class{Column}, it is returned unchanged. +#' Non-aggregate functions defined for \code{Column}. #' -#' @param x a literal value or a Column. +#' @param x Column to compute on. In \code{lit}, it is a literal value or a Column. +#' In \code{expr}, it contains an expression character object to be parsed. +#' @param y Column to compute on. +#' @param ... additional Columns. 
+#' @name column_nonaggregate_functions +#' @rdname column_nonaggregate_functions +#' @seealso coalesce,SparkDataFrame-method #' @family non-aggregate functions -#' @rdname lit -#' @name lit +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} +NULL + +#' @details +#' \code{lit}: A new Column is created to represent the literal value. +#' If the parameter is a Column, it is returned unchanged. +#' +#' @rdname column_nonaggregate_functions #' @export -#' @aliases lit,ANY-method +#' @aliases lit lit,ANY-method #' @examples +#' #' \dontrun{ -#' lit(df$name) -#' select(df, lit("x")) -#' select(df, lit("2015-01-01")) -#'} +#' tmp <- mutate(df, v1 = lit(df$mpg), v2 = lit("x"), v3 = lit("2015-01-01"), +#' v4 = negate(df$mpg), v5 = expr('length(model)'), +#' v6 = greatest(df$vs, df$am), v7 = least(df$vs, df$am), +#' v8 = column("mpg")) +#' head(tmp)} #' @note lit since 1.5.0 setMethod("lit", signature("ANY"), function(x) { @@ -314,18 +330,16 @@ setMethod("bin", column(jc) }) -#' bitwiseNOT -#' -#' Computes bitwise NOT. -#' -#' @param x Column to compute on. +#' @details +#' \code{bitwiseNOT}: Computes bitwise NOT. #' -#' @rdname bitwiseNOT -#' @name bitwiseNOT -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export -#' @aliases bitwiseNOT,Column-method -#' @examples \dontrun{bitwiseNOT(df$c)} +#' @aliases bitwiseNOT bitwiseNOT,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, bitwiseNOT(cast(df$vs, "int"} #' @note bitwiseNOT since 1.5.0 setMethod("bitwiseNOT", signature(x = "Column"), @@ -375,16 +389,12 @@ setMethod("ceiling", ceil(x) }) -#' Returns the first column that is not NA -#' -#' Returns the first column that is not NA, or NA if all inputs are. +#' @details +#' \code{coalesce}: Returns the first column that is not NA, or NA if all inputs are. 
#' -#' @rdname coalesce -#' @name coalesce -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export #' @aliases coalesce,Column-method -#' @examples \dontrun{coalesce(df$c, df$d, df$e)} #' @note coalesce(Column) since 2.1.1 setMethod("coalesce", signature(x = "Column"), @@ -824,22 +834,24 @@ setMethod("initcap", column(jc) }) -#' is.nan -#' -#' Return true if the column is NaN, alias for \link{isnan} -#' -#' @param x Column to compute on. +#' @details +#' \code{isnan}: Returns true if the column is NaN. +#' @rdname column_nonaggregate_functions +#' @aliases isnan isnan,Column-method +#' @note isnan since 2.0.0 +setMethod("isnan", + signature(x = "Column"), + function(x) { +jc <- callJStatic("org.apache.spark.sql.functions", "isna
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for MISC column methods
Repository: spark Updated Branches: refs/heads/master e2f32ee45 -> fddb63f46 [SPARK-20889][SPARKR] Grouped documentation for MISC column methods ## What changes were proposed in this pull request? Grouped documentation for column misc methods. Author: actuaryzhang Author: Wayne Zhang Closes #18448 from actuaryzhang/sparkRDocMisc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fddb63f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fddb63f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fddb63f4 Branch: refs/heads/master Commit: fddb63f46345be36c40d9a7f3660920af6502bbd Parents: e2f32ee Author: actuaryzhang Authored: Thu Jun 29 21:35:01 2017 -0700 Committer: Felix Cheung Committed: Thu Jun 29 21:35:01 2017 -0700 -- R/pkg/R/functions.R | 98 ++-- R/pkg/R/generics.R | 15 +--- 2 files changed, 55 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fddb63f4/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index cb09e84..67cb7a7 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -150,6 +150,27 @@ NULL #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} NULL +#' Miscellaneous functions for Column operations +#' +#' Miscellaneous functions defined for \code{Column}. +#' +#' @param x Column to compute on. In \code{sha2}, it is one of 224, 256, 384, or 512. +#' @param y Column to compute on. +#' @param ... additional Columns. 
+#' @name column_misc_functions +#' @rdname column_misc_functions +#' @family misc functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)[, 1:2]) +#' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model), +#' v3 = hash(df$model, df$mpg), v4 = md5(df$model), +#' v5 = sha1(df$model), v6 = sha2(df$model, 256)) +#' head(tmp) +#' } +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -569,19 +590,13 @@ setMethod("count", column(jc) }) -#' crc32 -#' -#' Calculates the cyclic redundancy check value (CRC32) of a binary column and -#' returns the value as a bigint. -#' -#' @param x Column to compute on. +#' @details +#' \code{crc32}: Calculates the cyclic redundancy check value (CRC32) of a binary column +#' and returns the value as a bigint. #' -#' @rdname crc32 -#' @name crc32 -#' @family misc functions -#' @aliases crc32,Column-method +#' @rdname column_misc_functions +#' @aliases crc32 crc32,Column-method #' @export -#' @examples \dontrun{crc32(df$c)} #' @note crc32 since 1.5.0 setMethod("crc32", signature(x = "Column"), @@ -590,19 +605,13 @@ setMethod("crc32", column(jc) }) -#' hash -#' -#' Calculates the hash code of given columns, and returns the result as a int column. -#' -#' @param x Column to compute on. -#' @param ... additional Column(s) to be included. +#' @details +#' \code{hash}: Calculates the hash code of given columns, and returns the result +#' as an int column. 
#' -#' @rdname hash -#' @name hash -#' @family misc functions -#' @aliases hash,Column-method +#' @rdname column_misc_functions +#' @aliases hash hash,Column-method #' @export -#' @examples \dontrun{hash(df$c)} #' @note hash since 2.0.0 setMethod("hash", signature(x = "Column"), @@ -1055,19 +1064,13 @@ setMethod("max", column(jc) }) -#' md5 -#' -#' Calculates the MD5 digest of a binary column and returns the value +#' @details +#' \code{md5}: Calculates the MD5 digest of a binary column and returns the value #' as a 32 character hex string. #' -#' @param x Column to compute on. -#' -#' @rdname md5 -#' @name md5 -#' @family misc functions -#' @aliases md5,Column-method +#' @rdname column_misc_functions +#' @aliases md5 md5,Column-method #' @export -#' @examples \dontrun{md5(df$c)} #' @note md5 since 1.5.0 setMethod("md5", signature(x = "Column"), @@ -1307,19 +1310,13 @@ setMethod("second", column(jc) }) -#' sha1 -#' -#' Calculates the SHA-1 digest of a binary column and returns the value +#' @details +#' \code{sha1}: Calculates the SHA-1 digest of a binary column and returns the value #' as a 40 character hex string. #' -#' @param x Column to compute on. -#' -#' @rdname sha1 -#' @name sha1 -#' @family misc functions -#' @aliases sha1,Column-method +#' @rdname column_misc_functions +#' @aliases sha1 sha1,Column-method #' @export -#' @examples \dontrun{sha1(df$c)} #' @note sha1 since 1.5.0 setMe
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for COLLECTION column methods
Repository: spark Updated Branches: refs/heads/master fddb63f46 -> 52981715b [SPARK-20889][SPARKR] Grouped documentation for COLLECTION column methods ## What changes were proposed in this pull request? Grouped documentation for column collection methods. Author: actuaryzhang Author: Wayne Zhang Closes #18458 from actuaryzhang/sparkRDocCollection. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52981715 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52981715 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52981715 Branch: refs/heads/master Commit: 52981715bb8d653a1141f55b36da804412eb783a Parents: fddb63f Author: actuaryzhang Authored: Thu Jun 29 23:00:50 2017 -0700 Committer: Felix Cheung Committed: Thu Jun 29 23:00:50 2017 -0700 -- R/pkg/R/functions.R | 204 +-- R/pkg/R/generics.R | 27 --- 2 files changed, 108 insertions(+), 123 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/52981715/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 67cb7a7..a1f5c4f 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -171,6 +171,35 @@ NULL #' } NULL +#' Collection functions for Column operations +#' +#' Collection functions defined for \code{Column}. +#' +#' @param x Column to compute on. Note the difference in the following methods: +#' \itemize{ +#' \item \code{to_json}: it is the column containing the struct or array of the structs. +#' \item \code{from_json}: it is the column containing the JSON string. +#' } +#' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains +#'additional named properties to control how it is converted, accepts the same +#'options as the JSON data source. 
+#' @name column_collection_functions +#' @rdname column_collection_functions +#' @family collection functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp)) +#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1))) +#' tmp2 <- mutate(tmp, v2 = explode(tmp$v1)) +#' head(tmp2) +#' head(select(tmp, posexplode(tmp$v1))) +#' head(select(tmp, sort_array(tmp$v1))) +#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))} +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -1642,30 +1671,23 @@ setMethod("to_date", column(jc) }) -#' to_json -#' -#' Converts a column containing a \code{structType} or array of \code{structType} into a Column -#' of JSON string. Resolving the Column can fail if an unsupported type is encountered. -#' -#' @param x Column containing the struct or array of the structs -#' @param ... additional named properties to control how it is converted, accepts the same options -#'as the JSON data source. +#' @details +#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType} +#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered. 
#' -#' @family non-aggregate functions -#' @rdname to_json -#' @name to_json -#' @aliases to_json,Column-method +#' @rdname column_collection_functions +#' @aliases to_json to_json,Column-method #' @export #' @examples +#' #' \dontrun{ #' # Converts a struct into a JSON object -#' df <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") -#' select(df, to_json(df$d, dateFormat = 'dd/MM/yyyy')) +#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") +#' select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy')) #' #' # Converts an array of structs into a JSON array -#' df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") -#' select(df, to_json(df$people)) -#'} +#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") +#' df2 <- mutate(df2, people_json = to_json(df2$people))} #' @note to_json since 2.2.0 setMethod("to_json", signature(x = "Column"), function(x, ...) { @@ -2120,28 +2142,28 @@ setMethod("date_format", signature(y = "Column", x = "character"), column(jc) }) -#' from_json -#' -#' Parses a column containing a JSON string into a Column of \code{structType} with the specified -#' \code{schema} or array of \code{structType} if \code{as.json.array} is set to \code{TR
spark git commit: [SPARK-20889][SPARKR] Grouped documentation for WINDOW column methods
Repository: spark Updated Branches: refs/heads/master 4d6d8192c -> cec392150 [SPARK-20889][SPARKR] Grouped documentation for WINDOW column methods ## What changes were proposed in this pull request? Grouped documentation for column window methods. Author: actuaryzhang Closes #18481 from actuaryzhang/sparkRDocWindow. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cec39215 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cec39215 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cec39215 Branch: refs/heads/master Commit: cec392150451a64c9c2902b7f8f4b3b38f25cbea Parents: 4d6d819 Author: actuaryzhang Authored: Tue Jul 4 12:18:51 2017 -0700 Committer: Felix Cheung Committed: Tue Jul 4 12:18:51 2017 -0700 -- R/pkg/R/functions.R | 225 +++ R/pkg/R/generics.R | 28 +++--- 2 files changed, 88 insertions(+), 165 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cec39215/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a1f5c4f..8c12308 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -200,6 +200,34 @@ NULL #' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))} NULL +#' Window functions for Column operations +#' +#' Window functions defined for \code{Column}. +#' +#' @param x In \code{lag} and \code{lead}, it is the column as a character string or a Column +#' to compute on. In \code{ntile}, it is the number of ntile groups. +#' @param offset In \code{lag}, the number of rows back from the current row from which to obtain +#' a value. In \code{lead}, the number of rows after the current row from which to +#' obtain a value. If not specified, the default is 1. +#' @param defaultValue (optional) default to use when the offset row does not exist. +#' @param ... additional argument(s). 
+#' @name column_window_functions +#' @rdname column_window_functions +#' @family window functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' tmp <- mutate(df, dist = over(cume_dist(), ws), dense_rank = over(dense_rank(), ws), +#' lag = over(lag(df$mpg), ws), lead = over(lead(df$mpg, 1), ws), +#' percent_rank = over(percent_rank(), ws), +#' rank = over(rank(), ws), row_number = over(row_number(), ws)) +#' # Get ntile group id (1-4) for hp +#' tmp <- mutate(tmp, ntile = over(ntile(4), ws)) +#' head(tmp)} +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -2844,27 +2872,16 @@ setMethod("ifelse", ## Window functions## -#' cume_dist -#' -#' Window function: returns the cumulative distribution of values within a window partition, -#' i.e. the fraction of rows that are below the current row. -#' -#' N = total number of rows in the partition -#' cume_dist(x) = number of values before (and including) x / N -#' +#' @details +#' \code{cume_dist}: Returns the cumulative distribution of values within a window partition, +#' i.e. the fraction of rows that are below the current row: +#' (number of values before and including x) / (total number of rows in the partition). #' This is equivalent to the \code{CUME_DIST} function in SQL. +#' The method should be used with no argument. 
#' -#' @rdname cume_dist -#' @name cume_dist -#' @family window functions -#' @aliases cume_dist,missing-method +#' @rdname column_window_functions +#' @aliases cume_dist cume_dist,missing-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' out <- select(df, over(cume_dist(), ws), df$hp, df$am) -#' } #' @note cume_dist since 1.6.0 setMethod("cume_dist", signature("missing"), @@ -2873,28 +2890,19 @@ setMethod("cume_dist", column(jc) }) -#' dense_rank -#' -#' Window function: returns the rank of rows within a window partition, without any gaps. +#' @details +#' \code{dense_rank}: Returns the rank of rows within a window partition, without any gaps. #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second #' place and that the next person came in third. Rank would give me sequential numbers, making #' the person that came in third place (af
spark git commit: [MINOR][SPARKR] ignore Rplots.pdf test output after running R tests
Repository: spark Updated Branches: refs/heads/master cec392150 -> daabf425e [MINOR][SPARKR] ignore Rplots.pdf test output after running R tests ## What changes were proposed in this pull request? After running R tests in local build, it outputs Rplots.pdf. This one should be ignored in the git repository. Author: wangmiao1981 Closes #18518 from wangmiao1981/ignore. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/daabf425 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/daabf425 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/daabf425 Branch: refs/heads/master Commit: daabf425ec0272951b11f286e4bec7a48f42cc0d Parents: cec3921 Author: wangmiao1981 Authored: Tue Jul 4 12:37:29 2017 -0700 Committer: Felix Cheung Committed: Tue Jul 4 12:37:29 2017 -0700 -- .gitignore | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/daabf425/.gitignore -- diff --git a/.gitignore b/.gitignore index 1d91b43..cf9780d 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ R-unit-tests.log R/unit-tests.out R/cran-check.out R/pkg/vignettes/sparkr-vignettes.html +R/pkg/tests/fulltests/Rplots.pdf build/*.jar build/apache-maven* build/scala* - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-20889][SPARKR][FOLLOWUP] Clean up grouped doc for column methods
Repository: spark Updated Branches: refs/heads/master ce10545d3 -> e9a93f814 [SPARK-20889][SPARKR][FOLLOWUP] Clean up grouped doc for column methods ## What changes were proposed in this pull request? Add doc for methods that were left out, and fix various style and consistency issues. Author: actuaryzhang Closes #18493 from actuaryzhang/sparkRDocCleanup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9a93f81 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9a93f81 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9a93f81 Branch: refs/heads/master Commit: e9a93f8140c913b91781b35e0e1b051c30244882 Parents: ce10545 Author: actuaryzhang Authored: Tue Jul 4 21:05:05 2017 -0700 Committer: Felix Cheung Committed: Tue Jul 4 21:05:05 2017 -0700 -- R/pkg/R/functions.R | 100 +-- R/pkg/R/generics.R | 7 ++-- 2 files changed, 49 insertions(+), 58 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9a93f81/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 8c12308..c529d83 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -38,10 +38,10 @@ NULL #' #' Date time functions defined for \code{Column}. #' -#' @param x Column to compute on. In \code{window}, it must be a time Column of \code{TimestampType}. #' @param format For \code{to_date} and \code{to_timestamp}, it is the string to use to parse -#' x Column to DateType or TimestampType. For \code{trunc}, it is the string used -#' for specifying the truncation method. For example, "year", "yyyy", "yy" for +#' Column \code{x} to DateType or TimestampType. For \code{trunc}, it is the string +#' to use to specify the truncation method. For example, "year", "yyyy", "yy" for #' truncate by year, or "month", "mon", "mm" for truncate by month. #' @param ... additional argument(s). #' @name column_datetime_functions #' @rdname column_datetime_functions @@ -122,7 +122,7 @@ NULL #' format to. See 'Details'. 
#' } #' @param y Column to compute on. -#' @param ... additional columns. +#' @param ... additional Columns. #' @name column_string_functions #' @rdname column_string_functions #' @family string functions @@ -167,8 +167,7 @@ NULL #' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model), #' v3 = hash(df$model, df$mpg), v4 = md5(df$model), #' v5 = sha1(df$model), v6 = sha2(df$model, 256)) -#' head(tmp) -#' } +#' head(tmp)} NULL #' Collection functions for Column operations @@ -190,7 +189,6 @@ NULL #' \dontrun{ #' # Dataframe used throughout this doc #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) -#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp)) #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1))) #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1)) @@ -394,7 +392,7 @@ setMethod("base64", }) #' @details -#' \code{bin}: An expression that returns the string representation of the binary value +#' \code{bin}: Returns the string representation of the binary value #' of the given long column. For example, bin("12") returns "1100". #' #' @rdname column_math_functions @@ -722,7 +720,7 @@ setMethod("dayofyear", #' \code{decode}: Computes the first argument into a string from a binary using the provided #' character set. #' -#' @param charset Character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", +#' @param charset character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", #'"UTF-16LE", "UTF-16"). #' #' @rdname column_string_functions @@ -855,7 +853,7 @@ setMethod("hex", }) #' @details -#' \code{hour}: Extracts the hours as an integer from a given date/timestamp/string. +#' \code{hour}: Extracts the hour as an integer from a given date/timestamp/string. 
#' #' @rdname column_datetime_functions #' @aliases hour hour,Column-method @@ -1177,7 +1175,7 @@ setMethod("min", }) #' @details -#' \code{minute}: Extracts the minutes as an integer from a given date/timestamp/string. +#' \code{minute}: Extracts the minute as an integer from a given date/timestamp/string. #' #' @rdname column_datetime_functions #' @aliases minute minute,Column-method @@ -1354,7 +1352,7 @@ setMethod("sd", }) #' @details -#' \code{second}: Extracts the seconds as an integer from a given date/timestamp/string. +#' \code{second}: Extracts the second as an integer from a given date/timestamp/string. #' #' @rdname column_datetime_fu
spark git commit: [SPARK-20307][SPARKR] SparkR: pass on setHandleInvalid to spark.mllib functions that use StringIndexer
Repository: spark Updated Branches: refs/heads/master d0bfc6733 -> a7b46c627 [SPARK-20307][SPARKR] SparkR: pass on setHandleInvalid to spark.mllib functions that use StringIndexer ## What changes were proposed in this pull request? For randomForest classifier, if test data contains unseen labels, it will throw an error. The StringIndexer already has the handleInvalid logic. The patch add a new method to set the underlying StringIndexer handleInvalid logic. This patch should also apply to other classifiers. This PR focuses on the main logic and randomForest classifier. I will do follow-up PR for other classifiers. ## How was this patch tested? Add a new unit test based on the error case in the JIRA. Author: wangmiao1981 Closes #18496 from wangmiao1981/handle. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a7b46c62 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a7b46c62 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a7b46c62 Branch: refs/heads/master Commit: a7b46c627b5d2461257f337139a29f23350e0c77 Parents: d0bfc67 Author: wangmiao1981 Authored: Fri Jul 7 23:51:32 2017 -0700 Committer: Felix Cheung Committed: Fri Jul 7 23:51:32 2017 -0700 -- R/pkg/R/mllib_tree.R| 11 +++-- R/pkg/tests/fulltests/test_mllib_tree.R | 17 + .../org/apache/spark/ml/feature/RFormula.scala | 25 .../r/RandomForestClassificationWrapper.scala | 4 +++- .../spark/ml/feature/StringIndexerSuite.scala | 2 +- 5 files changed, 55 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a7b46c62/R/pkg/R/mllib_tree.R -- diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 2f1220a..75b1a74 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -374,6 +374,10 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. 
Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in classification model. +#'Supported options: "skip" (filter out rows with invalid data), +#' "error" (throw an error), "keep" (put invalid data in a special additional +#' bucket, at index numLabels). Default is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.randomForest,SparkDataFrame,formula-method #' @return \code{spark.randomForest} returns a fitted Random Forest model. @@ -409,7 +413,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL, featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, - maxMemoryInMB = 256, cacheNodeIds = FALSE) { + maxMemoryInMB = 256, cacheNodeIds = FALSE, + handleInvalid = c("error", "keep", "skip")) { type <- match.arg(type) formula <- paste(deparse(formula), collapse = "") if (!is.null(seed)) { @@ -430,6 +435,7 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo new("RandomForestRegressionModel", jobj = jobj) }, classification = { + handleInvalid <- match.arg(handleInvalid) if (is.null(impurity)) impurity <- "gini" impurity <- match.arg(impurity, c("gini", "entropy")) jobj <- callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper", @@ -439,7 +445,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo as.numeric(minInfoGain), as.integer(checkpointInterval), as.character(featureSubsetStrategy), seed, as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + as.integer(maxMemoryInMB), as.logical(cacheNodeIds), + handleInvalid) new("RandomForestClassificationModel", jobj = jobj) } ) http://git-wip-u
spark git commit: [SPARK-20456][DOCS] Add examples for functions collection for pyspark
Repository: spark Updated Branches: refs/heads/master a7b46c627 -> f5f02d213 [SPARK-20456][DOCS] Add examples for functions collection for pyspark ## What changes were proposed in this pull request? This adds documentation to many functions in pyspark.sql.functions.py: `upper`, `lower`, `reverse`, `unix_timestamp`, `from_unixtime`, `rand`, `randn`, `collect_list`, `collect_set`, `lit` Add units to the trigonometry functions. Renames columns in datetime examples to be more informative. Adds links between some functions. ## How was this patch tested? `./dev/lint-python` `python python/pyspark/sql/functions.py` `./python/run-tests.py --module pyspark-sql` Author: Michael Patterson Closes #17865 from map222/spark-20456. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5f02d21 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5f02d21 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5f02d21 Branch: refs/heads/master Commit: f5f02d213d3151f58070e113d64fcded4f5d401e Parents: a7b46c6 Author: Michael Patterson Authored: Fri Jul 7 23:59:34 2017 -0700 Committer: Felix Cheung Committed: Fri Jul 7 23:59:34 2017 -0700 -- R/pkg/R/functions.R | 11 +- python/pyspark/sql/functions.py | 166 --- .../scala/org/apache/spark/sql/functions.scala | 14 +- 3 files changed, 119 insertions(+), 72 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f5f02d21/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index c529d83..f28d26a 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -336,7 +336,8 @@ setMethod("asin", }) #' @details -#' \code{atan}: Computes the tangent inverse of the given value. +#' \code{atan}: Computes the tangent inverse of the given value; the returned angle is in the range +#' -pi/2 through pi/2. 
#' #' @rdname column_math_functions #' @export @@ -599,7 +600,7 @@ setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOr }) #' @details -#' \code{cos}: Computes the cosine of the given value. +#' \code{cos}: Computes the cosine of the given value. Units in radians. #' #' @rdname column_math_functions #' @aliases cos cos,Column-method @@ -1407,7 +1408,7 @@ setMethod("sign", signature(x = "Column"), }) #' @details -#' \code{sin}: Computes the sine of the given value. +#' \code{sin}: Computes the sine of the given value. Units in radians. #' #' @rdname column_math_functions #' @aliases sin sin,Column-method @@ -1597,7 +1598,7 @@ setMethod("sumDistinct", }) #' @details -#' \code{tan}: Computes the tangent of the given value. +#' \code{tan}: Computes the tangent of the given value. Units in radians. #' #' @rdname column_math_functions #' @aliases tan tan,Column-method @@ -1896,7 +1897,7 @@ setMethod("year", #' @details #' \code{atan2}: Returns the angle theta from the conversion of rectangular coordinates -#' (x, y) to polar coordinates (r, theta). +#' (x, y) to polar coordinates (r, theta). Units in radians. #' #' @rdname column_math_functions #' @aliases atan2 atan2,Column-method http://git-wip-us.apache.org/repos/asf/spark/blob/f5f02d21/python/pyspark/sql/functions.py -- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 3416c4b..5d8ded8 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -67,9 +67,14 @@ def _create_window_function(name, doc=''): _.__doc__ = 'Window function: ' + doc return _ +_lit_doc = """ +Creates a :class:`Column` of literal value. 
+>>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1) +[Row(height=5, spark_user=True)] +""" _functions = { -'lit': 'Creates a :class:`Column` of literal value.', +'lit': _lit_doc, 'col': 'Returns a :class:`Column` based on the given column name.', 'column': 'Returns a :class:`Column` based on the given column name.', 'asc': 'Returns a sort expression based on the ascending order of the given column name.', @@ -95,10 +100,13 @@ _functions_1_4 = { '0.0 through pi.', 'asin': 'Computes the sine inverse of the given value; the returned angle is in the range' + '-pi/2 through pi/2.', -'atan': 'Computes the tangent inverse of the given value.', +'atan': 'Computes the tangent inverse of the given value; the returned angle is in the range' + +'-pi/2 through pi/2', 'cbrt': 'Computes the cube-root of the given value.', 'ceil': 'Computes the ceiling of the given value.', -'cos': 'Co
spark git commit: [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak
Repository: spark Updated Branches: refs/heads/master c3712b77a -> 08e0d033b [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak ## What changes were proposed in this pull request? This is a retry for #18320. This PR was reverted due to unexpected test failures with -10 error code. I was unable to reproduce in MacOS, CentOS and Ubuntu but only in Jenkins. So, the tests proceeded to verify this and revert the past try here - https://github.com/apache/spark/pull/18456 This new approach was tested in https://github.com/apache/spark/pull/18463. **Test results**: - With the part of suspicious change in the past try (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189) Tests ran 4 times and 2 times passed and 2 time failed. - Without the part of suspicious change in the past try (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189) Tests ran 5 times and they all passed. - With this new approach (https://github.com/apache/spark/pull/18463/commits/0a7589c09f53dfc2094497d8d3e59d6407569417) Tests ran 5 times and they all passed. It looks the cause is as below (see https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189): ```diff + exitCode <- 1 ... + data <- parallel:::readChild(child) + if (is.raw(data)) { + if (unserialize(data) == exitCode) { ... + } + } ... - parallel:::mcexit(0L) + parallel:::mcexit(0L, send = exitCode) ``` Two possibilities I think - `parallel:::mcexit(.. , send = exitCode)` https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/mcfork.html > It sends send to the master (unless NULL) and then shuts down the child process. However, it looks possible that the parent attemps to terminate the child right after getting our custom exit code. So, the child gets terminated between "send" and "shuts down", failing to exit properly. 
- A bug between `parallel:::mcexit(..., send = ...)` and `parallel:::readChild`. **Proposal**: To resolve this, I simply decided to avoid both possibilities with this new approach here (https://github.com/apache/spark/pull/18465/commits/9ff89a7859cb9f427fc774f33c3521c7d962b723). To support this idea, I explained with some quotation of the documentation as below: https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/mcfork.html > `readChild` and `readChildren` return a raw vector with a "pid" attribute if > data were available, an integer vector of length one with the process ID if a > child terminated or `NULL` if the child no longer exists (no children at all > for `readChildren`). `readChild` returns "an integer vector of length one with the process ID if a child terminated" so we can check if it is `integer` and the same selected "process ID". I believe this makes sure that the children are exited. In case that children happen to send any data manually to parent (which is why we introduced the suspicious part of the change (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189)), this should be raw bytes and will be discarded (and then will try to read the next and check if it is `integer` in the next loop). ## How was this patch tested? Manual tests and Jenkins tests. Author: hyukjinkwon Closes #18465 from HyukjinKwon/SPARK-21093-retry-1. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08e0d033 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08e0d033 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08e0d033 Branch: refs/heads/master Commit: 08e0d033b40946b4ef5741a7aa1e7ba0bd48c6fb Parents: c3712b7 Author: hyukjinkwon Authored: Sat Jul 8 14:24:37 2017 -0700 Committer: Felix Cheung Committed: Sat Jul 8 14:24:37 2017 -0700 -- R/pkg/inst/worker/daemon.R | 51 ++--- 1 file changed, 48 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/08e0d033/R/pkg/inst/worker/daemon.R -- diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 3a318b7..2e31dc5 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -30,8 +30,50 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) +# Waits indefinitely for a socket connecion by default. +selectTimeout <- NULL + while (TRUE) { - ready <- socketSelect(list(inputCon)) + ready <- socketSelect(list(inputCon), timeout = selectTimeout) + + # Note that the children should be terminated in the parent. If each child terminates + # itself,
spark git commit: [MINOR][SPARKR] R API documentation for "coltypes" is confusing
Repository: spark Updated Branches: refs/heads/master 9dc3e602d -> 1203c8415 [MINOR][SPARKR] R API documentation for "coltypes" is confusing ## What changes were proposed in this pull request? R API documentation for "coltypes" is confusing, found when working on another ticket. Current version http://spark.apache.org/docs/2.0.0/api/R/coltypes.html, where parameters have 2 "x" which is a duplicate, and also the example is not very clear ![current](https://cloud.githubusercontent.com/assets/3925641/17386808/effb98ce-59a2-11e6-9657-d477d258a80c.png) ![screen shot 2016-08-03 at 5 56 00 pm](https://cloud.githubusercontent.com/assets/3925641/17386884/91831096-59a3-11e6-84af-39890b3d45d8.png) ## How was this patch tested? Tested manually on local machine. And the screenshots are like below: ![screen shot 2016-08-07 at 11 29 20 pm](https://cloud.githubusercontent.com/assets/3925641/17471144/df36633c-5cf6-11e6-8238-4e32ead0e529.png) ![screen shot 2016-08-03 at 5 56 22 pm](https://cloud.githubusercontent.com/assets/3925641/17386896/9d36cb26-59a3-11e6-9619-6dae29f7ab17.png) Author: Xin Ren Closes #14489 from keypointt/rExample. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1203c841 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1203c841 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1203c841 Branch: refs/heads/master Commit: 1203c8415cd11540f79a235e66a2f241ca6c71e4 Parents: 9dc3e60 Author: Xin Ren Authored: Wed Aug 10 00:49:06 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 10 00:49:06 2016 -0700 -- R/pkg/R/DataFrame.R | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1203c841/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a473331..0ce4696 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -223,7 +223,7 @@ setMethod("showDF", #' sparkR.session() #' path <- "path/to/file.json" #' df <- read.json(path) -#' df +#' show(df) #'} #' @note show(SparkDataFrame) since 1.4.0 setMethod("show", "SparkDataFrame", @@ -368,7 +368,7 @@ setMethod("colnames<-", #' @examples #'\dontrun{ #' irisDF <- createDataFrame(iris) -#' coltypes(irisDF) +#' coltypes(irisDF) # get column types #'} #' @note coltypes since 1.6.0 setMethod("coltypes", @@ -411,7 +411,6 @@ setMethod("coltypes", #' #' Set the column types of a SparkDataFrame. #' -#' @param x A SparkDataFrame #' @param value A character vector with the target column types for the given #'SparkDataFrame. Column types can be one of integer, numeric/double, character, logical, or NA #'to keep that column as-is. 
@@ -424,8 +423,8 @@ setMethod("coltypes", #' sparkR.session() #' path <- "path/to/file.json" #' df <- read.json(path) -#' coltypes(df) <- c("character", "integer") -#' coltypes(df) <- c(NA, "numeric") +#' coltypes(df) <- c("character", "integer") # set column types +#' coltypes(df) <- c(NA, "numeric") # set column types #'} #' @note coltypes<- since 1.6.0 setMethod("coltypes<-", - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16444][SPARKR] Isotonic Regression wrapper in SparkR
Repository: spark Updated Branches: refs/heads/master 4d0cc84af -> 363793f2b [SPARK-16444][SPARKR] Isotonic Regression wrapper in SparkR ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Add Isotonic Regression wrapper in SparkR Wrappers in R and Scala are added. Unit tests Documentation ## How was this patch tested? Manually tested with sudo ./R/run-tests.sh (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Author: wm...@hotmail.com Closes #14182 from wangmiao1981/isoR. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/363793f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/363793f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/363793f2 Branch: refs/heads/master Commit: 363793f2bf57205f1d753d4705583aaf441849b5 Parents: 4d0cc84 Author: wm...@hotmail.com Authored: Wed Aug 17 06:15:04 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 17 06:15:04 2016 -0700 -- R/pkg/NAMESPACE | 3 +- R/pkg/R/generics.R | 4 + R/pkg/R/mllib.R | 118 ++ R/pkg/inst/tests/testthat/test_mllib.R | 32 + .../spark/ml/r/IsotonicRegressionWrapper.scala | 119 +++ .../scala/org/apache/spark/ml/r/RWrappers.scala | 2 + 6 files changed, 277 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/363793f2/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index aaab92f..1e23b23 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -24,7 +24,8 @@ exportMethods("glm", "spark.kmeans", "fitted", "spark.naiveBayes", - "spark.survreg") + "spark.survreg", + "spark.isoreg") # Job group lifecycle management methods export("setJobGroup", http://git-wip-us.apache.org/repos/asf/spark/blob/363793f2/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 52ab730..ebacc11 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1304,6 +1304,10 @@ 
setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s #' @export setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") }) +#' @rdname spark.isoreg +#' @export +setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") }) + #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/363793f2/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 6f6e2fc..0dcc54d 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,6 +53,13 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) +#' S4 class that represents an IsotonicRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel +#' @export +#' @note IsotonicRegressionModel since 2.1.0 +setClass("IsotonicRegressionModel", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. For more information, see the specific @@ -62,6 +69,7 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' @export #' @seealso \link{spark.glm}, \link{glm} #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{spark.isoreg} #' @seealso \link{read.ml} NULL @@ -74,6 +82,7 @@ NULL #' @export #' @seealso \link{spark.glm}, \link{glm} #' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{spark.isoreg} NULL #' Generalized Linear Models @@ -299,6 +308,94 @@ setMethod("summary", signature(object = "NaiveBayesModel"), return(list(apriori = apriori, tables = tables)) }) +#' Isotonic Regression Model +#' +#' Fits an Isotonic Regression model against a Spark DataFrame, similarly to R's isoreg(). 
+#' Users can print, make predictions on the produced model and save the model to the input path. +#' +#' @param data SparkDataFrame for training +#' @param formula A symbolic description of the model to be fitted. Currently only a few formula +#'operators are supported, including '~', '.', ':', '+', and '-'. +#' @param isotonic Whether the output sequence should be isoton
spark git commit: [SPARK-16447][ML][SPARKR] LDA wrapper in SparkR
Repository: spark Updated Branches: refs/heads/master 68f5087d2 -> b72bb62d4 [SPARK-16447][ML][SPARKR] LDA wrapper in SparkR ## What changes were proposed in this pull request? Add LDA Wrapper in SparkR with the following interfaces: - spark.lda(data, ...) - spark.posterior(object, newData, ...) - spark.perplexity(object, ...) - summary(object) - write.ml(object) - read.ml(path) ## How was this patch tested? Test with SparkR unit test. Author: Xusen Yin Closes #14229 from yinxusen/SPARK-16447. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b72bb62d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b72bb62d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b72bb62d Branch: refs/heads/master Commit: b72bb62d421840f82d663c6b8e3922bd14383fbb Parents: 68f5087 Author: Xusen Yin Authored: Thu Aug 18 05:33:52 2016 -0700 Committer: Felix Cheung Committed: Thu Aug 18 05:33:52 2016 -0700 -- R/pkg/NAMESPACE | 3 + R/pkg/R/generics.R | 14 ++ R/pkg/R/mllib.R | 166 +- R/pkg/inst/tests/testthat/test_mllib.R | 87 .../org/apache/spark/ml/clustering/LDA.scala| 4 + .../org/apache/spark/ml/r/LDAWrapper.scala | 216 +++ .../scala/org/apache/spark/ml/r/RWrappers.scala | 2 + 7 files changed, 490 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b72bb62d/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index c71eec5..4404cff 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -25,6 +25,9 @@ exportMethods("glm", "fitted", "spark.naiveBayes", "spark.survreg", + "spark.lda", + "spark.posterior", + "spark.perplexity", "spark.isoreg", "spark.gaussianMixture") http://git-wip-us.apache.org/repos/asf/spark/blob/b72bb62d/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 06bb25d..fe04bcf 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1304,6 +1304,19 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) 
{ standardGeneric("s #' @export setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") }) +#' @rdname spark.lda +#' @param ... Additional parameters to tune LDA. +#' @export +setGeneric("spark.lda", function(data, ...) { standardGeneric("spark.lda") }) + +#' @rdname spark.lda +#' @export +setGeneric("spark.posterior", function(object, newData) { standardGeneric("spark.posterior") }) + +#' @rdname spark.lda +#' @export +setGeneric("spark.perplexity", function(object, data) { standardGeneric("spark.perplexity") }) + #' @rdname spark.isoreg #' @export setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") }) @@ -1315,6 +1328,7 @@ setGeneric("spark.gaussianMixture", standardGeneric("spark.gaussianMixture") }) +#' write.ml #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/b72bb62d/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index db74046..b952741 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -39,6 +39,13 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) #' @note NaiveBayesModel since 2.0.0 setClass("NaiveBayesModel", representation(jobj = "jobj")) +#' S4 class that represents an LDAModel +#' +#' @param jobj a Java object reference to the backing Scala LDAWrapper +#' @export +#' @note LDAModel since 2.1.0 +setClass("LDAModel", representation(jobj = "jobj")) + #' S4 class that represents a AFTSurvivalRegressionModel #' #' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper @@ -75,7 +82,7 @@ setClass("GaussianMixtureModel", representation(jobj = "jobj")) #' @name write.ml #' @export #' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} -#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, 
\link{spark.survreg}, \link{spark.lda} #' @seealso \link{spark.isoreg} #' @seealso \link{read.ml} NULL @@ -315,6 +322,94 @@ setMethod("summary", signature(object = "NaiveBayesModel"), return(list(apriori = apriori, tables = tables)) }) +# Returns posterior probabilities from a Latent Dir
spark git commit: [SPARK-16508][SPARKR] Fix CRAN undocumented/duplicated arguments warnings.
Repository: spark Updated Branches: refs/heads/master 39f328ba3 -> 01401e965 [SPARK-16508][SPARKR] Fix CRAN undocumented/duplicated arguments warnings. ## What changes were proposed in this pull request? This PR tries to fix all the remaining "undocumented/duplicated arguments" warnings given by CRAN-check. One left is doc for R `stats::glm` exported in SparkR. To mute that warning, we have to also provide document for all arguments of that non-SparkR function. Some previous conversation is in #14558. ## How was this patch tested? R unit test and `check-cran.sh` script (with no-test). Author: Junyang Qian Closes #14705 from junyangq/SPARK-16508-master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01401e96 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01401e96 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01401e96 Branch: refs/heads/master Commit: 01401e965b58f7e8ab615764a452d7d18f1d4bf0 Parents: 39f328b Author: Junyang Qian Authored: Sat Aug 20 06:59:23 2016 -0700 Committer: Felix Cheung Committed: Sat Aug 20 06:59:23 2016 -0700 -- R/pkg/R/DataFrame.R | 221 +++--- R/pkg/R/SQLContext.R | 30 --- R/pkg/R/WindowSpec.R | 11 ++- R/pkg/R/column.R | 18 +++- R/pkg/R/functions.R | 173 R/pkg/R/generics.R | 62 ++--- R/pkg/R/group.R | 7 +- R/pkg/R/mllib.R | 113 +--- R/pkg/R/schema.R | 5 +- R/pkg/R/sparkR.R | 21 ++--- R/pkg/R/stats.R | 25 +++--- 11 files changed, 419 insertions(+), 267 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/01401e96/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 09be06d..540dc31 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -120,8 +120,9 @@ setMethod("schema", #' #' Print the logical and physical Catalyst plans to the console for debugging. #' -#' @param x A SparkDataFrame +#' @param x a SparkDataFrame. #' @param extended Logical. 
If extended is FALSE, explain() only prints the physical plan. +#' @param ... further arguments to be passed to or from other methods. #' @family SparkDataFrame functions #' @aliases explain,SparkDataFrame-method #' @rdname explain @@ -177,11 +178,13 @@ setMethod("isLocal", #' #' Print the first numRows rows of a SparkDataFrame #' -#' @param x A SparkDataFrame -#' @param numRows The number of rows to print. Defaults to 20. -#' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be -#'truncated. However, if set greater than zero, truncates strings longer than `truncate` -#'characters and all cells will be aligned right. +#' @param x a SparkDataFrame. +#' @param numRows the number of rows to print. Defaults to 20. +#' @param truncate whether truncate long strings. If \code{TRUE}, strings more than +#' 20 characters will be truncated. However, if set greater than zero, +#' truncates strings longer than `truncate` characters and all cells +#' will be aligned right. +#' @param ... further arguments to be passed to or from other methods. #' @family SparkDataFrame functions #' @aliases showDF,SparkDataFrame-method #' @rdname showDF @@ -211,7 +214,7 @@ setMethod("showDF", #' #' Print the SparkDataFrame column names and types #' -#' @param x A SparkDataFrame +#' @param object a SparkDataFrame. #' #' @family SparkDataFrame functions #' @rdname show @@ -262,11 +265,11 @@ setMethod("dtypes", }) }) -#' Column names +#' Column Names of SparkDataFrame #' -#' Return all column names as a list +#' Return all column names as a list. #' -#' @param x A SparkDataFrame +#' @param x a SparkDataFrame. #' #' @family SparkDataFrame functions #' @rdname columns @@ -323,6 +326,8 @@ setMethod("colnames", columns(x) }) +#' @param value a character vector. Must have the same length as the number +#' of columns in the SparkDataFrame. 
#' @rdname columns #' @aliases colnames<-,SparkDataFrame-method #' @name colnames<- @@ -514,9 +519,10 @@ setMethod("registerTempTable", #' #' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession. #' -#' @param x A SparkDataFrame -#' @param tableName A character vector containing the name of the table -#' @param overwrite A logical argument indicating whether or not to overwrite +#' @param x a SparkDataFrame. +#' @param tableName a character vector containing the name of the table. +#' @param overwrite a logical argument indicating whether or not to overwrite. +#' @param ... further arguments t
spark git commit: [SPARKR][EXAMPLE] change example APP name
Repository: spark Updated Branches: refs/heads/master 01401e965 -> 3e5fdeb3f [SPARKR][EXAMPLE] change example APP name ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) For R SQL example, appname is "MyApp". While examples in scala, Java and python, the appName is "x Spark SQL basic example". I made the R example consistent with other examples. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual test (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Author: wm...@hotmail.com Closes #14703 from wangmiao1981/example. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e5fdeb3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e5fdeb3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e5fdeb3 Branch: refs/heads/master Commit: 3e5fdeb3fb084cc9d25ce2f3f8cbf07a0aa2c573 Parents: 01401e9 Author: wm...@hotmail.com Authored: Sat Aug 20 07:00:51 2016 -0700 Committer: Felix Cheung Committed: Sat Aug 20 07:00:51 2016 -0700 -- examples/src/main/r/RSparkSQLExample.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e5fdeb3/examples/src/main/r/RSparkSQLExample.R -- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index de489e1..4e0267a 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -18,7 +18,7 @@ library(SparkR) # $example on:init_session$ -sparkR.session(appName = "MyApp", sparkConfig = list(spark.some.config.option = "some-value")) +sparkR.session(appName = "R Spark SQL basic example", sparkConfig = list(spark.some.config.option = "some-value")) # $example off:init_session$ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-h...@spark.apache.org
spark git commit: [SPARK-17173][SPARKR] R MLlib refactor, cleanup, reformat, fix deprecation in test
Repository: spark Updated Branches: refs/heads/master 342278c09 -> 0583ecda1 [SPARK-17173][SPARKR] R MLlib refactor, cleanup, reformat, fix deprecation in test ## What changes were proposed in this pull request? refactor, cleanup, reformat, fix deprecation in test ## How was this patch tested? unit tests, manual tests Author: Felix Cheung Closes #14735 from felixcheung/rmllibutil. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0583ecda Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0583ecda Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0583ecda Branch: refs/heads/master Commit: 0583ecda1b63a7e3f126c3276059e4f99548a741 Parents: 342278c Author: Felix Cheung Authored: Mon Aug 22 12:27:33 2016 -0700 Committer: Felix Cheung Committed: Mon Aug 22 12:27:33 2016 -0700 -- R/pkg/R/mllib.R| 205 R/pkg/inst/tests/testthat/test_mllib.R | 10 +- 2 files changed, 98 insertions(+), 117 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0583ecda/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 9a53c80..b36fbce 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -88,9 +88,9 @@ setClass("ALSModel", representation(jobj = "jobj")) #' @rdname write.ml #' @name write.ml #' @export -#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} -#' @seealso \link{spark.als}, \link{spark.kmeans}, \link{spark.lda}, \link{spark.naiveBayes} -#' @seealso \link{spark.survreg}, \link{spark.isoreg} +#' @seealso \link{spark.glm}, \link{glm}, +#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, +#' @seealso \link{spark.lda}, \link{spark.naiveBayes}, \link{spark.survreg}, #' @seealso \link{read.ml} NULL @@ -101,11 +101,22 @@ NULL #' @rdname predict #' @name predict #' @export -#' @seealso \link{spark.glm}, \link{glm}, \link{spark.gaussianMixture} -#' @seealso \link{spark.als}, \link{spark.kmeans}, 
\link{spark.naiveBayes}, \link{spark.survreg} -#' @seealso \link{spark.isoreg} +#' @seealso \link{spark.glm}, \link{glm}, +#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, +#' @seealso \link{spark.naiveBayes}, \link{spark.survreg}, NULL +write_internal <- function(object, path, overwrite = FALSE) { + writer <- callJMethod(object@jobj, "write") + if (overwrite) { +writer <- callJMethod(writer, "overwrite") + } + invisible(callJMethod(writer, "save", path)) +} + +predict_internal <- function(object, newData) { + dataFrame(callJMethod(object@jobj, "transform", newData@sdf)) +} #' Generalized Linear Models #' @@ -173,7 +184,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", "fit", formula, data@sdf, family$family, family$link, tol, as.integer(maxIter), as.character(weightCol)) -return(new("GeneralizedLinearRegressionModel", jobj = jobj)) +new("GeneralizedLinearRegressionModel", jobj = jobj) }) #' Generalized Linear Models (R-compliant) @@ -219,7 +230,7 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat #' @export #' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), - function(object, ...) { + function(object) { jobj <- object@jobj is.loaded <- callJMethod(jobj, "isLoaded") features <- callJMethod(jobj, "rFeatures") @@ -245,7 +256,7 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), deviance = deviance, df.null = df.null, df.residual = df.residual, aic = aic, iter = iter, family = family, is.loaded = is.loaded) class(ans) <- "summary.GeneralizedLinearRegressionModel" -return(ans) +ans }) # Prints the summary of GeneralizedLinearRegressionModel @@ -275,8 +286,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) 
{ " on", format(unlist(x[c("df.null
spark git commit: [SPARK-16508][SPARKR] doc updates and more CRAN check fixes
Repository: spark Updated Branches: refs/heads/master 84770b59f -> 71afeeea4 [SPARK-16508][SPARKR] doc updates and more CRAN check fixes ## What changes were proposed in this pull request? replace ``` ` ``` in code doc with `\code{thing}` remove added `...` for drop(DataFrame) fix remaining CRAN check warnings ## How was this patch tested? create doc with knitr junyangq Author: Felix Cheung Closes #14734 from felixcheung/rdoccleanup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71afeeea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71afeeea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71afeeea Branch: refs/heads/master Commit: 71afeeea4ec8e67edc95b5d504c557c88a2598b9 Parents: 84770b5 Author: Felix Cheung Authored: Mon Aug 22 15:53:10 2016 -0700 Committer: Felix Cheung Committed: Mon Aug 22 15:53:10 2016 -0700 -- R/pkg/NAMESPACE | 6 +++- R/pkg/R/DataFrame.R | 71 +++ R/pkg/R/RDD.R| 10 +++ R/pkg/R/SQLContext.R | 30 ++-- R/pkg/R/WindowSpec.R | 23 +++ R/pkg/R/column.R | 2 +- R/pkg/R/functions.R | 36 R/pkg/R/generics.R | 15 +- R/pkg/R/group.R | 1 + R/pkg/R/mllib.R | 19 +++-- R/pkg/R/pairRDD.R| 6 ++-- R/pkg/R/stats.R | 14 +- 12 files changed, 119 insertions(+), 114 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71afeeea/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index e1b87b2..7090576 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -1,5 +1,9 @@ # Imports from base R -importFrom(methods, setGeneric, setMethod, setOldClass) +# Do not include stats:: "rpois", "runif" - causes error at runtime +importFrom("methods", "setGeneric", "setMethod", "setOldClass") +importFrom("methods", "is", "new", "signature", "show") +importFrom("stats", "gaussian", "setNames") +importFrom("utils", "download.file", "packageVersion", "untar") # Disable native libraries till we figure out how to package it # See SPARKR-7839 
http://git-wip-us.apache.org/repos/asf/spark/blob/71afeeea/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 540dc31..52a6628 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -150,7 +150,7 @@ setMethod("explain", #' isLocal #' -#' Returns True if the `collect` and `take` methods can be run locally +#' Returns True if the \code{collect} and \code{take} methods can be run locally #' (without any Spark executors). #' #' @param x A SparkDataFrame @@ -182,7 +182,7 @@ setMethod("isLocal", #' @param numRows the number of rows to print. Defaults to 20. #' @param truncate whether truncate long strings. If \code{TRUE}, strings more than #' 20 characters will be truncated. However, if set greater than zero, -#' truncates strings longer than `truncate` characters and all cells +#' truncates strings longer than \code{truncate} characters and all cells #' will be aligned right. #' @param ... further arguments to be passed to or from other methods. #' @family SparkDataFrame functions @@ -642,10 +642,10 @@ setMethod("unpersist", #' The following options for repartition are possible: #' \itemize{ #' \item{1.} {Return a new SparkDataFrame partitioned by -#' the given columns into `numPartitions`.} -#' \item{2.} {Return a new SparkDataFrame that has exactly `numPartitions`.} +#' the given columns into \code{numPartitions}.} +#' \item{2.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.} #' \item{3.} {Return a new SparkDataFrame partitioned by the given column(s), -#' using `spark.sql.shuffle.partitions` as number of partitions.} +#' using \code{spark.sql.shuffle.partitions} as number of partitions.} #'} #' @param x a SparkDataFrame. #' @param numPartitions the number of partitions to use. @@ -1132,9 +1132,8 @@ setMethod("take", #' Head #' -#' Return the first NUM rows of a SparkDataFrame as a R data.frame. If NUM is NULL, -#' then head() returns the first 6 rows in keeping with the current data.frame -#&
spark git commit: [SPARKR][MINOR] Remove reference link for common Windows environment variables
Repository: spark Updated Branches: refs/heads/master 9afdfc94f -> 8fd63e808 [SPARKR][MINOR] Remove reference link for common Windows environment variables ## What changes were proposed in this pull request? The PR removes reference link in the doc for environment variables for common Windows folders. The cran check gave code 503: service unavailable on the original link. ## How was this patch tested? Manual check. Author: Junyang Qian Closes #14767 from junyangq/SPARKR-RemoveLink. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8fd63e80 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8fd63e80 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8fd63e80 Branch: refs/heads/master Commit: 8fd63e808e15c8a7e78fef847183c86f332daa91 Parents: 9afdfc9 Author: Junyang Qian Authored: Tue Aug 23 11:22:32 2016 -0700 Committer: Felix Cheung Committed: Tue Aug 23 11:22:32 2016 -0700 -- R/pkg/R/install.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8fd63e80/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ff81e86..c6ed88e 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -50,9 +50,7 @@ #' \itemize{ #' \item Mac OS X: \file{~/Library/Caches/spark} #' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark} -#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. See -#' \href{https://www.microsoft.com/security/portal/mmpc/shared/variables.aspx}{ -#' Windows Common Folder Variables} about \%LOCALAPPDATA\% +#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. #' } #' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir #' and force re-install Spark (in case the local directory or file is corrupted) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][MINOR] Remove reference link for common Windows environment variables
Repository: spark Updated Branches: refs/heads/branch-2.0 811a2cef0 -> cc4018996 [SPARKR][MINOR] Remove reference link for common Windows environment variables ## What changes were proposed in this pull request? The PR removes reference link in the doc for environment variables for common Windows folders. The cran check gave code 503: service unavailable on the original link. ## How was this patch tested? Manual check. Author: Junyang Qian Closes #14767 from junyangq/SPARKR-RemoveLink. (cherry picked from commit 8fd63e808e15c8a7e78fef847183c86f332daa91) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc401899 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc401899 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc401899 Branch: refs/heads/branch-2.0 Commit: cc4018996740b3a68d4a557615c59c67b8996ebb Parents: 811a2ce Author: Junyang Qian Authored: Tue Aug 23 11:22:32 2016 -0700 Committer: Felix Cheung Committed: Tue Aug 23 11:22:46 2016 -0700 -- R/pkg/R/install.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cc401899/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ff81e86..c6ed88e 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -50,9 +50,7 @@ #' \itemize{ #' \item Mac OS X: \file{~/Library/Caches/spark} #' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark} -#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. See -#' \href{https://www.microsoft.com/security/portal/mmpc/shared/variables.aspx}{ -#' Windows Common Folder Variables} about \%LOCALAPPDATA\% +#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. 
#' } #' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir #' and force re-install Spark (in case the local directory or file is corrupted) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][MINOR] Fix doc for show method
Repository: spark Updated Branches: refs/heads/master 45b786aca -> d2932a0e9 [SPARKR][MINOR] Fix doc for show method ## What changes were proposed in this pull request? The original doc of `show` put methods for multiple classes together but the text only talks about `SparkDataFrame`. This PR tries to fix this problem. ## How was this patch tested? Manual test. Author: Junyang Qian Closes #14776 from junyangq/SPARK-FixShowDoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d2932a0e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d2932a0e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d2932a0e Branch: refs/heads/master Commit: d2932a0e987132c694ed59515b7c77adaad052e6 Parents: 45b786a Author: Junyang Qian Authored: Wed Aug 24 10:40:09 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 10:40:09 2016 -0700 -- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d2932a0e/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 52a6628..e12b58e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -212,9 +212,9 @@ setMethod("showDF", #' show #' -#' Print the SparkDataFrame column names and types +#' Print class and type information of a Spark object. #' -#' @param object a SparkDataFrame. +#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec. #' #' @family SparkDataFrame functions #' @rdname show - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][MINOR] Fix doc for show method
Repository: spark Updated Branches: refs/heads/branch-2.0 33d79b587 -> 29091d7cd [SPARKR][MINOR] Fix doc for show method ## What changes were proposed in this pull request? The original doc of `show` put methods for multiple classes together but the text only talks about `SparkDataFrame`. This PR tries to fix this problem. ## How was this patch tested? Manual test. Author: Junyang Qian Closes #14776 from junyangq/SPARK-FixShowDoc. (cherry picked from commit d2932a0e987132c694ed59515b7c77adaad052e6) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29091d7c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29091d7c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29091d7c Branch: refs/heads/branch-2.0 Commit: 29091d7cd60c20bf019dc9c1625a22e80ea50928 Parents: 33d79b5 Author: Junyang Qian Authored: Wed Aug 24 10:40:09 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 10:40:26 2016 -0700 -- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29091d7c/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index f8a05c6..ab45d2c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -205,9 +205,9 @@ setMethod("showDF", #' show #' -#' Print the SparkDataFrame column names and types +#' Print class and type information of a Spark object. #' -#' @param object a SparkDataFrame. +#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec. #' #' @family SparkDataFrame functions #' @rdname show - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16445][MLLIB][SPARKR] Multilayer Perceptron Classifier wrapper in SparkR
Repository: spark Updated Branches: refs/heads/master d2932a0e9 -> 2fbdb6063 [SPARK-16445][MLLIB][SPARKR] Multilayer Perceptron Classifier wrapper in SparkR https://issues.apache.org/jira/browse/SPARK-16445 ## What changes were proposed in this pull request? Create Multilayer Perceptron Classifier wrapper in SparkR ## How was this patch tested? Tested manually on local machine Author: Xin Ren Closes #14447 from keypointt/SPARK-16445. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2fbdb606 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2fbdb606 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2fbdb606 Branch: refs/heads/master Commit: 2fbdb606392631b1dff88ec86f388cc2559c28f5 Parents: d2932a0 Author: Xin Ren Authored: Wed Aug 24 11:18:10 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 11:18:10 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/generics.R | 4 + R/pkg/R/mllib.R | 125 - R/pkg/inst/tests/testthat/test_mllib.R | 32 + .../MultilayerPerceptronClassifierWrapper.scala | 134 +++ .../scala/org/apache/spark/ml/r/RWrappers.scala | 2 + 6 files changed, 293 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2fbdb606/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 7090576..ad587a6 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -27,6 +27,7 @@ exportMethods("glm", "summary", "spark.kmeans", "fitted", + "spark.mlp", "spark.naiveBayes", "spark.survreg", "spark.lda", http://git-wip-us.apache.org/repos/asf/spark/blob/2fbdb606/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 4e6..7e626be 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1330,6 +1330,10 @@ setGeneric("spark.kmeans", function(data, formula, ...) { standardGeneric("spark #' @export setGeneric("fitted") +#' @rdname spark.mlp +#' @export +setGeneric("spark.mlp", function(data, ...) 
{ standardGeneric("spark.mlp") }) + #' @rdname spark.naiveBayes #' @export setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("spark.naiveBayes") }) http://git-wip-us.apache.org/repos/asf/spark/blob/2fbdb606/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index a40310d..a670600 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -60,6 +60,13 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) +#' S4 class that represents a MultilayerPerceptronClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper +#' @export +#' @note MultilayerPerceptronClassificationModel since 2.1.0 +setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj")) + #' S4 class that represents an IsotonicRegressionModel #' #' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel @@ -90,7 +97,7 @@ setClass("ALSModel", representation(jobj = "jobj")) #' @export #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, -#' @seealso \link{spark.lda}, \link{spark.naiveBayes}, \link{spark.survreg}, +#' @seealso \link{spark.lda}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} #' @seealso \link{read.ml} NULL @@ -103,7 +110,7 @@ NULL #' @export #' @seealso \link{spark.glm}, \link{glm}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, -#' @seealso \link{spark.naiveBayes}, \link{spark.survreg}, +#' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} NULL write_internal <- function(object, path, overwrite = FALSE) { @@ -631,6 +638,95 @@ setMethod("predict", signature(object = "KMeansModel"), predict_internal(object, newData) }) +#' Multilayer Perceptron Classification 
Model +#' +#' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' Only categorical data is supported. +#' For more det
spark git commit: [MINOR][SPARKR] fix R MLlib parameter documentation
Repository: spark Updated Branches: refs/heads/master 29952ed09 -> 945c04bcd [MINOR][SPARKR] fix R MLlib parameter documentation ## What changes were proposed in this pull request? Fixed several misplaced param tag - they should be on the spark.* method generics ## How was this patch tested? run knitr junyangq Author: Felix Cheung Closes #14792 from felixcheung/rdocmllib. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/945c04bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/945c04bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/945c04bc Branch: refs/heads/master Commit: 945c04bcd439e0624232c040df529f12bcc05e13 Parents: 29952ed Author: Felix Cheung Authored: Wed Aug 24 15:59:09 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 15:59:09 2016 -0700 -- R/pkg/R/mllib.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/945c04bc/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index a670600..dfc5a1c 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -444,6 +444,7 @@ setMethod("write.ml", signature(object = "LDAModel", path = "character"), #' @param featureIndex The index of the feature if \code{featuresCol} is a vector column #' (default: 0), no effect otherwise #' @param weightCol The weight column name. +#' @param ... additional arguments passed to the method. #' @return \code{spark.isoreg} returns a fitted Isotonic Regression model #' @rdname spark.isoreg #' @aliases spark.isoreg,SparkDataFrame,formula-method @@ -504,7 +505,6 @@ setMethod("predict", signature(object = "IsotonicRegressionModel"), # Get the summary of an IsotonicRegressionModel model -#' @param ... 
Other optional arguments to summary of an IsotonicRegressionModel #' @return \code{summary} returns the model's boundaries and prediction as lists #' @rdname spark.isoreg #' @aliases summary,IsotonicRegressionModel-method @@ -1074,6 +1074,7 @@ setMethod("predict", signature(object = "AFTSurvivalRegressionModel"), #' @param k number of independent Gaussians in the mixture model. #' @param maxIter maximum iteration number. #' @param tol the convergence tolerance. +#' @param ... additional arguments passed to the method. #' @aliases spark.gaussianMixture,SparkDataFrame,formula-method #' @return \code{spark.gaussianMixture} returns a fitted multivariate gaussian mixture model. #' @rdname spark.gaussianMixture @@ -1117,7 +1118,6 @@ setMethod("spark.gaussianMixture", signature(data = "SparkDataFrame", formula = # Get the summary of a multivariate gaussian mixture model #' @param object a fitted gaussian mixture model. -#' @param ... currently not used argument(s) passed to the method. #' @return \code{summary} returns the model's lambda, mu, sigma and posterior. #' @aliases spark.gaussianMixture,SparkDataFrame,formula-method #' @rdname spark.gaussianMixture - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][MINOR] Add more examples to window function docs
Repository: spark Updated Branches: refs/heads/master 945c04bcd -> 18708f76c [SPARKR][MINOR] Add more examples to window function docs ## What changes were proposed in this pull request? This PR adds more examples to window function docs to make them more accessible to the users. It also fixes default value issues for `lag` and `lead`. ## How was this patch tested? Manual test, R unit test. Author: Junyang Qian Closes #14779 from junyangq/SPARKR-FixWindowFunctionDocs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18708f76 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18708f76 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18708f76 Branch: refs/heads/master Commit: 18708f76c366c6e01b5865981666e40d8642ac20 Parents: 945c04b Author: Junyang Qian Authored: Wed Aug 24 16:00:04 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 16:00:04 2016 -0700 -- R/pkg/R/WindowSpec.R | 12 R/pkg/R/functions.R | 78 --- 2 files changed, 72 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18708f76/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index ddd2ef2..4ac83c2 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -203,6 +203,18 @@ setMethod("rangeBetween", #' @aliases over,Column,WindowSpec-method #' @family colum_func #' @export +#' @examples \dontrun{ +#' df <- createDataFrame(mtcars) +#' +#' # Partition by am (transmission) and order by hp (horsepower) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' +#' # Rank on hp within each partition +#' out <- select(df, over(rank(), ws), df$hp, df$am) +#' +#' # Lag mpg values by 1 row on the partition-and-ordered table +#' out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am) +#' } #' @note over since 2.0.0 setMethod("over", signature(x = "Column", window = "WindowSpec"), http://git-wip-us.apache.org/repos/asf/spark/blob/18708f76/R/pkg/R/functions.R -- 
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f042add..dbf8dd8 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3121,9 +3121,9 @@ setMethod("ifelse", #' @aliases cume_dist,missing-method #' @export #' @examples \dontrun{ -#' df <- createDataFrame(iris) -#' ws <- orderBy(windowPartitionBy("Species"), "Sepal_Length") -#' out <- select(df, over(cume_dist(), ws), df$Sepal_Length, df$Species) +#' df <- createDataFrame(mtcars) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' out <- select(df, over(cume_dist(), ws), df$hp, df$am) #' } #' @note cume_dist since 1.6.0 setMethod("cume_dist", @@ -3148,7 +3148,11 @@ setMethod("cume_dist", #' @family window_funcs #' @aliases dense_rank,missing-method #' @export -#' @examples \dontrun{dense_rank()} +#' @examples \dontrun{ +#' df <- createDataFrame(mtcars) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' out <- select(df, over(dense_rank(), ws), df$hp, df$am) +#' } #' @note dense_rank since 1.6.0 setMethod("dense_rank", signature("missing"), @@ -3168,18 +3172,26 @@ setMethod("dense_rank", #' @param x the column as a character string or a Column to compute on. #' @param offset the number of rows back from the current row from which to obtain a value. #' If not specified, the default is 1. -#' @param defaultValue default to use when the offset row does not exist. +#' @param defaultValue (optional) default to use when the offset row does not exist. #' @param ... further arguments to be passed to or from other methods. 
#' @rdname lag #' @name lag #' @aliases lag,characterOrColumn-method #' @family window_funcs #' @export -#' @examples \dontrun{lag(df$c)} +#' @examples \dontrun{ +#' df <- createDataFrame(mtcars) +#' +#' # Partition by am (transmission) and order by hp (horsepower) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' +#' # Lag mpg values by 1 row on the partition-and-ordered table +#' out <- select(df, over(lag(df$mpg), ws), df$mpg, df$hp, df$am) +#' } #' @note lag since 1.6.0 setMethod("lag", signature(x = "characterOrColumn"), - function(x, offset, defaultValue = NULL) { + function(x, offset = 1, defaultValue = NULL) { col <- if (class(x) == "Column") { x@jc } else { @@ -3194,25 +3206,35 @@ setMethod("lag", #' lead #' #' Window function: returns the value that is \code{offset} rows after the current row, and -#' NULL if there is less than \code{offset} rows after the current row. For example, -#' an \code{offset} of one will return the nex
spark git commit: [SPARKR][MINOR] Add installation message for remote master mode and improve other messages
Repository: spark Updated Branches: refs/heads/branch-2.0 43273377a -> 9f363a690 [SPARKR][MINOR] Add installation message for remote master mode and improve other messages ## What changes were proposed in this pull request? This PR gives informative message to users when they try to connect to a remote master but don't have Spark package in their local machine. As a clarification, for now, automatic installation will only happen if they start SparkR in R console (rather than from sparkr-shell) and connect to local master. In the remote master mode, local Spark package is still needed, but we will not trigger the install.spark function because the versions have to match those on the cluster, which involves more user input. Instead, we here try to provide detailed message that may help the users. Some of the other messages have also been slightly changed. ## How was this patch tested? Manual test. Author: Junyang Qian Closes #14761 from junyangq/SPARK-16579-V1. (cherry picked from commit 3a60be4b15a5ab9b6e0c4839df99dac7738aa7fe) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f363a69 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f363a69 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f363a69 Branch: refs/heads/branch-2.0 Commit: 9f363a690102f04a2a486853c1b89134455518bc Parents: 4327337 Author: Junyang Qian Authored: Wed Aug 24 16:04:14 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 16:04:26 2016 -0700 -- R/pkg/R/install.R | 64 ++ R/pkg/R/sparkR.R | 51 ++-- R/pkg/R/utils.R | 4 ++-- 3 files changed, 80 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f363a69/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index c6ed88e..69b0a52 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -70,9 +70,9 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, localDir = NULL, 
overwrite = FALSE) { version <- paste0("spark-", packageVersion("SparkR")) hadoopVersion <- tolower(hadoopVersion) - hadoopVersionName <- hadoop_version_name(hadoopVersion) + hadoopVersionName <- hadoopVersionName(hadoopVersion) packageName <- paste(version, "bin", hadoopVersionName, sep = "-") - localDir <- ifelse(is.null(localDir), spark_cache_path(), + localDir <- ifelse(is.null(localDir), sparkCachePath(), normalizePath(localDir, mustWork = FALSE)) if (is.na(file.info(localDir)$isdir)) { @@ -88,12 +88,14 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { -fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s" +fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s" msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageLocalDir) message(msg) Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) + } else { +message("Spark not found in the cache directory. 
Installation will start.") } packageLocalPath <- paste0(packageLocalDir, ".tgz") @@ -102,7 +104,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found.") } else { -robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) +robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } message(sprintf("Installing to %s", localDir)) @@ -116,33 +118,37 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, invisible(packageLocalDir) } -robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { +robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { # step 1: use user-provided url if (!is.null(mirrorUrl)) { msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) message(msg) -success <- direct_download_tar(mirrorUrl, version, hadoopVersion, +success <- directDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) -if (success) return() +if (success) { + return() +} else { + message(paste0("Unable to download from mirrorUrl: ", mirrorUrl)) +} } else { -message("Mirror site not provided.") +message("MirrorUrl not provided.") } # step
spark git commit: [SPARKR][MINOR] Add installation message for remote master mode and improve other messages
Repository: spark Updated Branches: refs/heads/master 18708f76c -> 3a60be4b1 [SPARKR][MINOR] Add installation message for remote master mode and improve other messages ## What changes were proposed in this pull request? This PR gives informative message to users when they try to connect to a remote master but don't have Spark package in their local machine. As a clarification, for now, automatic installation will only happen if they start SparkR in R console (rather than from sparkr-shell) and connect to local master. In the remote master mode, local Spark package is still needed, but we will not trigger the install.spark function because the versions have to match those on the cluster, which involves more user input. Instead, we here try to provide detailed message that may help the users. Some of the other messages have also been slightly changed. ## How was this patch tested? Manual test. Author: Junyang Qian Closes #14761 from junyangq/SPARK-16579-V1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3a60be4b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3a60be4b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3a60be4b Branch: refs/heads/master Commit: 3a60be4b15a5ab9b6e0c4839df99dac7738aa7fe Parents: 18708f7 Author: Junyang Qian Authored: Wed Aug 24 16:04:14 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 16:04:14 2016 -0700 -- R/pkg/R/install.R | 64 ++ R/pkg/R/sparkR.R | 51 ++-- R/pkg/R/utils.R | 4 ++-- 3 files changed, 80 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3a60be4b/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index c6ed88e..69b0a52 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -70,9 +70,9 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, localDir = NULL, overwrite = FALSE) { version <- paste0("spark-", packageVersion("SparkR")) hadoopVersion <- 
tolower(hadoopVersion) - hadoopVersionName <- hadoop_version_name(hadoopVersion) + hadoopVersionName <- hadoopVersionName(hadoopVersion) packageName <- paste(version, "bin", hadoopVersionName, sep = "-") - localDir <- ifelse(is.null(localDir), spark_cache_path(), + localDir <- ifelse(is.null(localDir), sparkCachePath(), normalizePath(localDir, mustWork = FALSE)) if (is.na(file.info(localDir)$isdir)) { @@ -88,12 +88,14 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { -fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s" +fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s" msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageLocalDir) message(msg) Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) + } else { +message("Spark not found in the cache directory. 
Installation will start.") } packageLocalPath <- paste0(packageLocalDir, ".tgz") @@ -102,7 +104,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found.") } else { -robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) +robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } message(sprintf("Installing to %s", localDir)) @@ -116,33 +118,37 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, invisible(packageLocalDir) } -robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { +robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { # step 1: use user-provided url if (!is.null(mirrorUrl)) { msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) message(msg) -success <- direct_download_tar(mirrorUrl, version, hadoopVersion, +success <- directDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) -if (success) return() +if (success) { + return() +} else { + message(paste0("Unable to download from mirrorUrl: ", mirrorUrl)) +} } else { -message("Mirror site not provided.") +message("MirrorUrl not provided.") } # step 2: use url suggested from apache website - message("Looking for site suggested from apache website...") -
spark git commit: [SPARKR][MINOR] Add more examples to window function docs
Repository: spark Updated Branches: refs/heads/branch-2.0 9f924a01b -> 43273377a [SPARKR][MINOR] Add more examples to window function docs ## What changes were proposed in this pull request? This PR adds more examples to window function docs to make them more accessible to the users. It also fixes default value issues for `lag` and `lead`. ## How was this patch tested? Manual test, R unit test. Author: Junyang Qian Closes #14779 from junyangq/SPARKR-FixWindowFunctionDocs. (cherry picked from commit 18708f76c366c6e01b5865981666e40d8642ac20) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43273377 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43273377 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43273377 Branch: refs/heads/branch-2.0 Commit: 43273377a38a9136ff5e56929630930f076af5af Parents: 9f924a0 Author: Junyang Qian Authored: Wed Aug 24 16:00:04 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 24 16:00:18 2016 -0700 -- R/pkg/R/WindowSpec.R | 12 R/pkg/R/functions.R | 78 --- 2 files changed, 72 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/43273377/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index ddd2ef2..4ac83c2 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -203,6 +203,18 @@ setMethod("rangeBetween", #' @aliases over,Column,WindowSpec-method #' @family colum_func #' @export +#' @examples \dontrun{ +#' df <- createDataFrame(mtcars) +#' +#' # Partition by am (transmission) and order by hp (horsepower) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' +#' # Rank on hp within each partition +#' out <- select(df, over(rank(), ws), df$hp, df$am) +#' +#' # Lag mpg values by 1 row on the partition-and-ordered table +#' out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am) +#' } #' @note over since 2.0.0 setMethod("over", signature(x = "Column", 
window = "WindowSpec"), http://git-wip-us.apache.org/repos/asf/spark/blob/43273377/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f042add..dbf8dd8 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3121,9 +3121,9 @@ setMethod("ifelse", #' @aliases cume_dist,missing-method #' @export #' @examples \dontrun{ -#' df <- createDataFrame(iris) -#' ws <- orderBy(windowPartitionBy("Species"), "Sepal_Length") -#' out <- select(df, over(cume_dist(), ws), df$Sepal_Length, df$Species) +#' df <- createDataFrame(mtcars) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' out <- select(df, over(cume_dist(), ws), df$hp, df$am) #' } #' @note cume_dist since 1.6.0 setMethod("cume_dist", @@ -3148,7 +3148,11 @@ setMethod("cume_dist", #' @family window_funcs #' @aliases dense_rank,missing-method #' @export -#' @examples \dontrun{dense_rank()} +#' @examples \dontrun{ +#' df <- createDataFrame(mtcars) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' out <- select(df, over(dense_rank(), ws), df$hp, df$am) +#' } #' @note dense_rank since 1.6.0 setMethod("dense_rank", signature("missing"), @@ -3168,18 +3172,26 @@ setMethod("dense_rank", #' @param x the column as a character string or a Column to compute on. #' @param offset the number of rows back from the current row from which to obtain a value. #' If not specified, the default is 1. -#' @param defaultValue default to use when the offset row does not exist. +#' @param defaultValue (optional) default to use when the offset row does not exist. #' @param ... further arguments to be passed to or from other methods. 
#' @rdname lag #' @name lag #' @aliases lag,characterOrColumn-method #' @family window_funcs #' @export -#' @examples \dontrun{lag(df$c)} +#' @examples \dontrun{ +#' df <- createDataFrame(mtcars) +#' +#' # Partition by am (transmission) and order by hp (horsepower) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' +#' # Lag mpg values by 1 row on the partition-and-ordered table +#' out <- select(df, over(lag(df$mpg), ws), df$mpg, df$hp, df$am) +#' } #' @note lag since 1.6.0 setMethod("lag", signature(x = "characterOrColumn"), - function(x, offset, defaultValue = NULL) { + function(x, offset = 1, defaultValue = NULL) { col <- if (class(x) == "Column") { x@jc } else { @@ -3194,25 +3206,35 @@ setMethod("lag", #' lead #' #' Window function: returns the value that is \code{offset} rows after the current row, and -#' NULL if there is less t
spark git commit: [SPARKR][MINOR] Fix example of spark.naiveBayes
Repository: spark Updated Branches: refs/heads/master 970ab8f6d -> 188321623 [SPARKR][MINOR] Fix example of spark.naiveBayes ## What changes were proposed in this pull request? The original example doesn't work because the features are not categorical. This PR fixes this by changing to another dataset. ## How was this patch tested? Manual test. Author: Junyang Qian Closes #14820 from junyangq/SPARK-FixNaiveBayes. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18832162 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18832162 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18832162 Branch: refs/heads/master Commit: 18832162357282ec81515b5b2ba93747be3ad18b Parents: 970ab8f Author: Junyang Qian Authored: Fri Aug 26 11:01:48 2016 -0700 Committer: Felix Cheung Committed: Fri Aug 26 11:01:48 2016 -0700 -- R/pkg/R/mllib.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18832162/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index dfc5a1c..6808aae 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -747,10 +747,11 @@ setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel #' @export #' @examples #' \dontrun{ -#' df <- createDataFrame(infert) +#' data <- as.data.frame(UCBAdmissions) +#' df <- createDataFrame(data) #' #' # fit a Bernoulli naive Bayes model -#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0) +#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0) #' #' # get the summary of the model #' summary(model) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARKR][MINOR] Fix example of spark.naiveBayes
Repository: spark Updated Branches: refs/heads/branch-2.0 27ed6d5dc -> 6f82d2da3 [SPARKR][MINOR] Fix example of spark.naiveBayes ## What changes were proposed in this pull request? The original example doesn't work because the features are not categorical. This PR fixes this by changing to another dataset. ## How was this patch tested? Manual test. Author: Junyang Qian Closes #14820 from junyangq/SPARK-FixNaiveBayes. (cherry picked from commit 18832162357282ec81515b5b2ba93747be3ad18b) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f82d2da Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f82d2da Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f82d2da Branch: refs/heads/branch-2.0 Commit: 6f82d2da382cee2950a0797436e5d48805cbba5f Parents: 27ed6d5 Author: Junyang Qian Authored: Fri Aug 26 11:01:48 2016 -0700 Committer: Felix Cheung Committed: Fri Aug 26 11:02:04 2016 -0700 -- R/pkg/R/mllib.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f82d2da/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 25dcb3a..b33a16a 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -433,10 +433,11 @@ setMethod("predict", signature(object = "KMeansModel"), #' @export #' @examples #' \dontrun{ -#' df <- createDataFrame(infert) +#' data <- as.data.frame(UCBAdmissions) +#' df <- createDataFrame(data) #' #' # fit a Bernoulli naive Bayes model -#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0) +#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0) #' #' # get the summary of the model #' summary(model) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17178][SPARKR][SPARKSUBMIT] Allow to set sparkr shell command through --conf
Repository: spark Updated Branches: refs/heads/master d92cd227c -> fa6347938 [SPARK-17178][SPARKR][SPARKSUBMIT] Allow to set sparkr shell command through --conf ## What changes were proposed in this pull request? Allow user to set sparkr shell command through --conf spark.r.shell.command ## How was this patch tested? Unit test is added and also verify it manually through ``` bin/sparkr --master yarn-client --conf spark.r.shell.command=/usr/local/bin/R ``` Author: Jeff Zhang Closes #14744 from zjffdu/SPARK-17178. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa634793 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa634793 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa634793 Branch: refs/heads/master Commit: fa6347938fc1c72ddc03a5f3cd2e929b5694f0a6 Parents: d92cd22 Author: Jeff Zhang Authored: Wed Aug 31 00:20:41 2016 -0700 Committer: Felix Cheung Committed: Wed Aug 31 00:20:41 2016 -0700 -- docs/configuration.md | 11 ++- .../org/apache/spark/launcher/SparkLauncher.java | 2 ++ .../spark/launcher/SparkSubmitCommandBuilder.java | 3 ++- .../launcher/SparkSubmitCommandBuilderSuite.java | 18 ++ 4 files changed, 32 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index d0c76aa..6e98f67 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1786,6 +1786,14 @@ showDF(properties, numRows = 200, truncate = FALSE) Executable for executing R scripts in client modes for driver. Ignored in cluster modes. + + spark.r.shell.command + R + +Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as environment variable SPARKR_DRIVER_R, but take precedence over it. +spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script. 
+ + Deploy @@ -1852,7 +1860,8 @@ The following variables can be set in `spark-env.sh`: SPARKR_DRIVER_R -R binary executable to use for SparkR shell (default is R). +R binary executable to use for SparkR shell (default is R). +Property spark.r.shell.command take precedence if it is set SPARK_LOCAL_IP http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java -- diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java index 7b7a7bf..ea56214 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java @@ -68,6 +68,8 @@ public class SparkLauncher { static final String PYSPARK_PYTHON = "spark.pyspark.python"; + static final String SPARKR_R_SHELL = "spark.r.shell.command"; + /** Logger name to use when launching a child process. */ public static final String CHILD_PROCESS_LOGGER_NAME = "spark.launcher.childProcLoggerName"; http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java -- diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java index f6da644..29c6d82 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java @@ -336,7 +336,8 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder { join(File.separator, sparkHome, "R", "lib", "SparkR", "profile", "shell.R")); List args = new ArrayList<>(); -args.add(firstNonEmpty(System.getenv("SPARKR_DRIVER_R"), "R")); +args.add(firstNonEmpty(conf.get(SparkLauncher.SPARKR_R_SHELL), + System.getenv("SPARKR_DRIVER_R"), "R")); return args; } 
http://git-wip-us.apache.org/repos/asf/spark/blob/fa634793/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java -- diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java index 16e5a22..ad2e7a7 100644 --- a/launcher/src/test/java/org/apache/spark
spark git commit: [SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when collecting SparkDataFrame
Repository: spark Updated Branches: refs/heads/master 2ab8dbdda -> 0f30cdedb [SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when collecting SparkDataFrame ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) registerTempTable(createDataFrame(iris), "iris") str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y from iris limit 5"))) 'data.frame': 5 obs. of 2 variables: $ x: num 1 1 1 1 1 $ y:List of 5 ..$ : num 2 ..$ : num 2 ..$ : num 2 ..$ : num 2 ..$ : num 2 The problem is that spark returns `decimal(10, 0)` col type, instead of `decimal`. Thus, `decimal(10, 0)` is not handled correctly. It should be handled as "double". As discussed in JIRA thread, we can have two potential fixes: 1). Scala side fix to add a new case when writing the object back; However, I can't use spark.sql.types._ in Spark core due to dependency issues. I don't find a way of doing type case match; 2). SparkR side fix: Add a helper function to check special type like `"decimal(10, 0)"` and replace it with `double`, which is PRIMITIVE type. This special helper is generic for adding new types handling in the future. I open this PR to discuss pros and cons of both approaches. If we want to do Scala side fix, we need to find a way to match the case of DecimalType and StructType in Spark Core. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual test: > str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y > from iris limit 5"))) 'data.frame': 5 obs. of 2 variables: $ x: num 1 1 1 1 1 $ y: num 2 2 2 2 2 R Unit tests Author: wm...@hotmail.com Closes #14613 from wangmiao1981/type. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f30cded Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f30cded Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f30cded Branch: refs/heads/master Commit: 0f30cdedbdb0d38e8c479efab6bb1c6c376206ff Parents: 2ab8dbd Author: wm...@hotmail.com Authored: Fri Sep 2 01:47:17 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 2 01:47:17 2016 -0700 -- R/pkg/R/DataFrame.R | 13 - R/pkg/R/types.R | 16 R/pkg/inst/tests/testthat/test_sparkSQL.R | 22 ++ 3 files changed, 50 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f30cded/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index e12b58e..a924502 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -397,7 +397,11 @@ setMethod("coltypes", } if (is.null(type)) { - stop(paste("Unsupported data type: ", x)) + specialtype <- specialtypeshandle(x) + if (is.null(specialtype)) { +stop(paste("Unsupported data type: ", x)) + } + type <- PRIMITIVE_TYPES[[specialtype]] } } type @@ -1063,6 +1067,13 @@ setMethod("collect", df[[colIndex]] <- col } else { colType <- dtypes[[colIndex]][[2]] + if (is.null(PRIMITIVE_TYPES[[colType]])) { +specialtype <- specialtypeshandle(colType) +if (!is.null(specialtype)) { + colType <- specialtype +} + } + # Note that "binary" columns behave like complex types. if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") { vec <- do.call(c, col) http://git-wip-us.apache.org/repos/asf/spark/blob/0f30cded/R/pkg/R/types.R -- diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index ad048b1..abca703 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -67,3 +67,19 @@ rToSQLTypes <- as.environment(list( "double" = "double", "character" = "string", "logical" = "boolean")) + +# Helper function of coverting decimal type. 
When backend returns column type in the +# format of decimal(,) (e.g., decimal(10, 0)), this function converts the column type +# as double type. This function converts backend returned types that are not the key +# of PRIMITIVE_TYPES, but should be treated as PRIMITIVE_TYPES. +# @param A type returned from the JVM backend. +# @return A type that is a key of PRIMITIVE_TYPES. +specialtypeshandle <- function(type) { + returntype <- NULL + m <- regexec("^decim
spark git commit: [SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when collecting SparkDataFrame
Repository: spark Updated Branches: refs/heads/branch-2.0 f9463238d -> 171bdfd96 [SPARK-16883][SPARKR] SQL decimal type is not properly cast to number when collecting SparkDataFrame ## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) registerTempTable(createDataFrame(iris), "iris") str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y from iris limit 5"))) 'data.frame': 5 obs. of 2 variables: $ x: num 1 1 1 1 1 $ y:List of 5 ..$ : num 2 ..$ : num 2 ..$ : num 2 ..$ : num 2 ..$ : num 2 The problem is that spark returns `decimal(10, 0)` col type, instead of `decimal`. Thus, `decimal(10, 0)` is not handled correctly. It should be handled as "double". As discussed in JIRA thread, we can have two potential fixes: 1). Scala side fix to add a new case when writing the object back; However, I can't use spark.sql.types._ in Spark core due to dependency issues. I don't find a way of doing type case match; 2). SparkR side fix: Add a helper function to check special type like `"decimal(10, 0)"` and replace it with `double`, which is PRIMITIVE type. This special helper is generic for adding new types handling in the future. I open this PR to discuss pros and cons of both approaches. If we want to do Scala side fix, we need to find a way to match the case of DecimalType and StructType in Spark Core. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual test: > str(collect(sql("select cast('1' as double) as x, cast('2' as decimal) as y > from iris limit 5"))) 'data.frame': 5 obs. of 2 variables: $ x: num 1 1 1 1 1 $ y: num 2 2 2 2 2 R Unit tests Author: wm...@hotmail.com Closes #14613 from wangmiao1981/type. 
(cherry picked from commit 0f30cdedbdb0d38e8c479efab6bb1c6c376206ff) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/171bdfd9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/171bdfd9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/171bdfd9 Branch: refs/heads/branch-2.0 Commit: 171bdfd963b5dda85ddf5e72b72471fdaaaf2fe3 Parents: f946323 Author: wm...@hotmail.com Authored: Fri Sep 2 01:47:17 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 2 01:48:11 2016 -0700 -- R/pkg/R/DataFrame.R | 13 - R/pkg/R/types.R | 16 R/pkg/inst/tests/testthat/test_sparkSQL.R | 22 ++ 3 files changed, 50 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/171bdfd9/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ab45d2c..8aea228 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -390,7 +390,11 @@ setMethod("coltypes", } if (is.null(type)) { - stop(paste("Unsupported data type: ", x)) + specialtype <- specialtypeshandle(x) + if (is.null(specialtype)) { +stop(paste("Unsupported data type: ", x)) + } + type <- PRIMITIVE_TYPES[[specialtype]] } } type @@ -1056,6 +1060,13 @@ setMethod("collect", df[[colIndex]] <- col } else { colType <- dtypes[[colIndex]][[2]] + if (is.null(PRIMITIVE_TYPES[[colType]])) { +specialtype <- specialtypeshandle(colType) +if (!is.null(specialtype)) { + colType <- specialtype +} + } + # Note that "binary" columns behave like complex types. 
if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") { vec <- do.call(c, col) http://git-wip-us.apache.org/repos/asf/spark/blob/171bdfd9/R/pkg/R/types.R -- diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index ad048b1..abca703 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -67,3 +67,19 @@ rToSQLTypes <- as.environment(list( "double" = "double", "character" = "string", "logical" = "boolean")) + +# Helper function of converting decimal type. When backend returns column type in the +# format of decimal(,) (e.g., decimal(10, 0)), this function converts the column type +# as double type. This function converts backend returned types that are not the key +# of PRIMITIVE_TYPES, but should be treated as PRIMITIVE_TYPES. +# @param A type returned from the JVM backend. +# @return A type is the key of
spark git commit: [SPARK-15509][ML][SPARKR] R MLlib algorithms should support input columns "features" and "label"
Repository: spark Updated Branches: refs/heads/master 0f30cdedb -> 6969dcc79 [SPARK-15509][ML][SPARKR] R MLlib algorithms should support input columns "features" and "label" https://issues.apache.org/jira/browse/SPARK-15509 ## What changes were proposed in this pull request? Currently in SparkR, when you load a LibSVM dataset using the sqlContext and then pass it to an MLlib algorithm, the ML wrappers will fail since they will try to create a "features" column, which conflicts with the existing "features" column from the LibSVM loader. E.g., using the "mnist" dataset from LibSVM: `training <- loadDF(sqlContext, ".../mnist", "libsvm")` `model <- naiveBayes(label ~ features, training)` This fails with: ``` 16/05/24 11:52:41 ERROR RBackendHandler: fit on org.apache.spark.ml.r.NaiveBayesWrapper failed Error in invokeJava(isStatic = TRUE, className, methodName, ...) : java.lang.IllegalArgumentException: Output column features already exists. at org.apache.spark.ml.feature.VectorAssembler.transformSchema(VectorAssembler.scala:120) at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:179) at org.apache.spark.ml.Pipeline$$anonfun$transformSchema$4.apply(Pipeline.scala:179) at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57) at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66) at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186) at org.apache.spark.ml.Pipeline.transformSchema(Pipeline.scala:179) at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:67) at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:131) at org.apache.spark.ml.feature.RFormula.fit(RFormula.scala:169) at org.apache.spark.ml.r.NaiveBayesWrapper$.fit(NaiveBayesWrapper.scala:62) at org.apache.spark.ml.r.NaiveBayesWrapper.fit(NaiveBayesWrapper.sca The same issue appears for the "label" column once you rename the "features" column. 
``` The cause is, when using `loadDF()` to generate dataframes, sometimes it's with default column names `"label"` and `"features"`, and these two names will conflict with default column names `setDefault(labelCol, "label")` and ` setDefault(featuresCol, "features")` of `SharedParams.scala` ## How was this patch tested? Test on my local machine. Author: Xin Ren Closes #13584 from keypointt/SPARK-15509. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6969dcc7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6969dcc7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6969dcc7 Branch: refs/heads/master Commit: 6969dcc79a33d715250958b24361f2d43552d840 Parents: 0f30cde Author: Xin Ren Authored: Fri Sep 2 01:54:28 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 2 01:54:28 2016 -0700 -- .../ml/r/AFTSurvivalRegressionWrapper.scala | 1 + .../spark/ml/r/GaussianMixtureWrapper.scala | 5 +- .../r/GeneralizedLinearRegressionWrapper.scala | 1 + .../spark/ml/r/IsotonicRegressionWrapper.scala | 5 +- .../org/apache/spark/ml/r/KMeansWrapper.scala | 5 +- .../apache/spark/ml/r/NaiveBayesWrapper.scala | 11 +-- .../org/apache/spark/ml/r/RWrapperUtils.scala | 71 .../apache/spark/ml/feature/RFormulaSuite.scala | 3 - .../apache/spark/ml/r/RWrapperUtilsSuite.scala | 56 +++ 9 files changed, 144 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6969dcc7/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala index 5462f80..67d037e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala @@ -87,6 +87,7 @@ private[r] object AFTSurvivalRegressionWrapper extends
MLReadable[AFTSurvivalReg val (rewritedFormula, censorCol) = formulaRewrite(formula) val rFormula = new RFormula().setFormula(rewritedFormula) +RWrapperUtils.checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) // get feature names from output schema http://git-wip-us.apache.org/repos/asf/spark/blob/6969dcc7/mllib/src/main/scala/org/apache/spark/ml/r/GaussianMixtureWrapper.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GaussianMixtureWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GaussianMixtureW
spark git commit: [SPARK-17376][SPARKR] followup - change since version
Repository: spark Updated Branches: refs/heads/branch-2.0 d4ae35d02 -> 03d9af604 [SPARK-17376][SPARKR] followup - change since version ## What changes were proposed in this pull request? change since version in doc ## How was this patch tested? manual Author: Felix Cheung Closes #14939 from felixcheung/rsparkversion2. (cherry picked from commit eac1d0e921345b5d15aa35d8c565140292ab2af3) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/03d9af60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/03d9af60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/03d9af60 Branch: refs/heads/branch-2.0 Commit: 03d9af6043ae443ced004383c996fa8eebf3a1d1 Parents: d4ae35d Author: Felix Cheung Authored: Fri Sep 2 11:08:25 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 2 11:08:38 2016 -0700 -- R/pkg/R/SQLContext.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/03d9af60/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index a140454..783df53 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -169,7 +169,7 @@ sparkR.conf <- function(key, defaultValue) { #' sparkR.session() #' version <- sparkR.version() #' } -#' @note sparkR.version since 2.1.0 +#' @note sparkR.version since 2.0.1 sparkR.version <- function() { sparkSession <- getSparkSession() callJMethod(sparkSession, "version") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17376][SPARKR] followup - change since version
Repository: spark Updated Branches: refs/heads/master e79962f2f -> eac1d0e92 [SPARK-17376][SPARKR] followup - change since version ## What changes were proposed in this pull request? change since version in doc ## How was this patch tested? manual Author: Felix Cheung Closes #14939 from felixcheung/rsparkversion2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eac1d0e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eac1d0e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eac1d0e9 Branch: refs/heads/master Commit: eac1d0e921345b5d15aa35d8c565140292ab2af3 Parents: e79962f Author: Felix Cheung Authored: Fri Sep 2 11:08:25 2016 -0700 Committer: Felix Cheung Committed: Fri Sep 2 11:08:25 2016 -0700 -- R/pkg/R/SQLContext.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eac1d0e9/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index a140454..783df53 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -169,7 +169,7 @@ sparkR.conf <- function(key, defaultValue) { #' sparkR.session() #' version <- sparkR.version() #' } -#' @note sparkR.version since 2.1.0 +#' @note sparkR.version since 2.0.1 sparkR.version <- function() { sparkSession <- getSparkSession() callJMethod(sparkSession, "version") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17315][SPARKR] Kolmogorov-Smirnov test SparkR wrapper
Repository: spark Updated Branches: refs/heads/master c2a1576c2 -> abb2f9210 [SPARK-17315][SPARKR] Kolmogorov-Smirnov test SparkR wrapper ## What changes were proposed in this pull request? This PR tries to add Kolmogorov-Smirnov Test wrapper to SparkR. This wrapper implementation only supports one sample test against normal distribution. ## How was this patch tested? R unit test. Author: Junyang Qian Closes #14881 from junyangq/SPARK-17315. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/abb2f921 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/abb2f921 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/abb2f921 Branch: refs/heads/master Commit: abb2f921036d97d8cab033838ae559eb731bf0fd Parents: c2a1576 Author: Junyang Qian Authored: Sat Sep 3 12:26:30 2016 -0700 Committer: Felix Cheung Committed: Sat Sep 3 12:26:30 2016 -0700 -- R/pkg/NAMESPACE | 7 +- R/pkg/R/generics.R | 4 + R/pkg/R/mllib.R | 105 +++ R/pkg/inst/tests/testthat/test_mllib.R | 34 ++ .../org/apache/spark/ml/r/KSTestWrapper.scala | 57 ++ 5 files changed, 205 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/abb2f921/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ce41b51..a5e9cbd 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -42,7 +42,8 @@ exportMethods("glm", "spark.perplexity", "spark.isoreg", "spark.gaussianMixture", - "spark.als") + "spark.als", + "spark.kstest") # Job group lifecycle management methods export("setJobGroup", @@ -342,7 +343,8 @@ export("as.DataFrame", "tables", "uncacheTable", "print.summary.GeneralizedLinearRegressionModel", - "read.ml") + "read.ml", + "print.summary.KSTest") export("structField", "structField.jobj", @@ -366,6 +368,7 @@ S3method(print, jobj) S3method(print, structField) S3method(print, structType) S3method(print, summary.GeneralizedLinearRegressionModel) +S3method(print, summary.KSTest) S3method(structField, character) 
S3method(structField, jobj) S3method(structType, jobj) http://git-wip-us.apache.org/repos/asf/spark/blob/abb2f921/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 7e626be..67a999d 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1375,3 +1375,7 @@ setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") #' @rdname spark.als #' @export setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") }) + +#' @rdname spark.kstest +#' @export +setGeneric("spark.kstest", function(data, ...) { standardGeneric("spark.kstest") }) http://git-wip-us.apache.org/repos/asf/spark/blob/abb2f921/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 9a53f75..f321fd1 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -88,6 +88,13 @@ setClass("GaussianMixtureModel", representation(jobj = "jobj")) #' @note ALSModel since 2.1.0 setClass("ALSModel", representation(jobj = "jobj")) +#' S4 class that represents a KSTest +#' +#' @param jobj a Java object reference to the backing Scala KSTestWrapper +#' @export +#' @note KSTest since 2.1.0 +setClass("KSTest", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. For more information, see the specific @@ -1310,3 +1317,101 @@ setMethod("write.ml", signature(object = "ALSModel", path = "character"), function(object, path, overwrite = FALSE) { write_internal(object, path, overwrite) }) + +#' (One-Sample) Kolmogorov-Smirnov Test +#' +#' @description +#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a +#' continuous distribution. +#' +#' By comparing the largest difference between the empirical cumulative +#' distribution of the sample data and the theoretical distribution we can provide a test for +#' the null hypothesis that the sample data comes from that theoretical distribution.
+#' +#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest} +#' to print out a summary result. +#' +#' @param data a SparkDataFrame of user data. +#' @param testCol column name where the test data is from. It should be a column of double type. +#' @param nullHypothesis name of the theoretical dis
spark git commit: [SPARK-19133][SPARKR][ML][BACKPORT-2.0] fix glm for Gamma, clarify glm family supported
Repository: spark Updated Branches: refs/heads/branch-2.0 6fe676c09 -> ec2fe925c [SPARK-19133][SPARKR][ML][BACKPORT-2.0] fix glm for Gamma, clarify glm family supported ## What changes were proposed in this pull request? Backport to 2.0 (cherry picking from 2.1 didn't work) ## How was this patch tested? unit test Author: Felix Cheung Closes #16543 from felixcheung/rgammabackport20. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec2fe925 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec2fe925 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec2fe925 Branch: refs/heads/branch-2.0 Commit: ec2fe925cd359ca5c132372d4b18ff791b70605a Parents: 6fe676c Author: Felix Cheung Authored: Wed Jan 11 20:01:11 2017 -0800 Committer: Felix Cheung Committed: Wed Jan 11 20:01:11 2017 -0800 -- R/pkg/R/mllib.R| 7 ++- R/pkg/inst/tests/testthat/test_mllib.R | 8 2 files changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec2fe925/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b33a16a..cd07f27 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -89,6 +89,8 @@ NULL #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. +#' Currently these families are supported: \code{binomial}, \code{gaussian}, +#' \code{Gamma}, and \code{poisson}. #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param ... additional arguments passed to the method. 
@@ -134,8 +136,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), formula <- paste(deparse(formula), collapse = "") +# For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", -"fit", formula, data@sdf, family$family, family$link, +"fit", formula, data@sdf, tolower(family$family), family$link, tol, as.integer(maxIter)) return(new("GeneralizedLinearRegressionModel", jobj = jobj)) }) @@ -150,6 +153,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. +#' Currently these families are supported: \code{binomial}, \code{gaussian}, +#' \code{Gamma}, and \code{poisson}. #' @param epsilon positive convergence tolerance of iterations. #' @param maxit integer giving the maximal number of IRLS iterations. #' @return \code{glm} returns a fitted generalized linear model. 
http://git-wip-us.apache.org/repos/asf/spark/blob/ec2fe925/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 753da81..e0d2e53 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -69,6 +69,14 @@ test_that("spark.glm and predict", { data = iris, family = poisson(link = identity)), iris)) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + # Gamma family + x <- runif(100, -1, 1) + y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10) + df <- as.DataFrame(as.data.frame(list(x = x, y = y))) + model <- glm(y ~ x, family = Gamma, df) + out <- capture.output(print(summary(model))) + expect_true(any(grepl("Dispersion parameter for gamma family", out))) + # Test stats::predict is working x <- rnorm(15) y <- x + rnorm(15) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-12757][CORE] lower "block locks were not released" log to info level
Repository: spark Updated Branches: refs/heads/master c6c37b8af -> 2bc4d4e28 [SPARK-12757][CORE] lower "block locks were not released" log to info level ## What changes were proposed in this pull request? lower "block locks were not released" log to info level, as it is generating a lot of warnings in running ML, graph calls, as pointed out in the JIRA. Author: Felix Cheung Closes #16513 from felixcheung/blocklockswarn. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2bc4d4e2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2bc4d4e2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2bc4d4e2 Branch: refs/heads/master Commit: 2bc4d4e286e65f8b4e9ee21bccd913b62e6061f2 Parents: c6c37b8 Author: Felix Cheung Authored: Thu Jan 12 09:45:16 2017 -0800 Committer: Felix Cheung Committed: Thu Jan 12 09:45:16 2017 -0800 -- core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2bc4d4e2/core/src/main/scala/org/apache/spark/executor/Executor.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 789198f..b6c0f0c 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -336,7 +336,7 @@ private[spark] class Executor( if (conf.getBoolean("spark.storage.exceptionOnPinLeak", false)) { throw new SparkException(errMsg) } else { - logWarning(errMsg) + logInfo(errMsg) } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-18828][SPARKR] Refactor scripts for R
Repository: spark Updated Branches: refs/heads/master a115a5439 -> c84f7d3e1 [SPARK-18828][SPARKR] Refactor scripts for R ## What changes were proposed in this pull request? Refactored script to remove duplications and clearer purpose for each script ## How was this patch tested? manually Author: Felix Cheung Closes #16249 from felixcheung/rscripts. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c84f7d3e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c84f7d3e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c84f7d3e Branch: refs/heads/master Commit: c84f7d3e1b845bc1e595ce9a6e2de663c2d218f4 Parents: a115a54 Author: Felix Cheung Authored: Mon Jan 16 13:49:12 2017 -0800 Committer: Felix Cheung Committed: Mon Jan 16 13:49:12 2017 -0800 -- R/check-cran.sh | 32 +++--- R/create-docs.sh| 11 R/create-rd.sh | 37 ++ R/find-r.sh | 34 R/install-dev.sh| 20 +++--- R/install-source-package.sh | 57 dev/make-distribution.sh| 7 +++-- 7 files changed, 146 insertions(+), 52 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c84f7d3e/R/check-cran.sh -- diff --git a/R/check-cran.sh b/R/check-cran.sh index 1288e7f..a188b14 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -20,25 +20,14 @@ set -o pipefail set -e -FWDIR="$(cd `dirname $0`; pwd)" +FWDIR="$(cd `dirname "${BASH_SOURCE[0]}"`; pwd)" pushd $FWDIR > /dev/null -if [ ! -z "$R_HOME" ] - then -R_SCRIPT_PATH="$R_HOME/bin" - else -# if system wide R_HOME is not found, then exit -if [ ! `command -v R` ]; then - echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed." - exit 1 -fi -R_SCRIPT_PATH="$(dirname $(which R))" -fi -echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}" +. $FWDIR/find-r.sh # Install the package (this is required for code in vignettes to run when building it later) # Build the latest docs, but not vignettes, which is built with the package next -$FWDIR/create-docs.sh +. 
$FWDIR/install-dev.sh # Build source package with vignettes SPARK_HOME="$(cd "${FWDIR}"/..; pwd)" @@ -84,19 +73,4 @@ else SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz fi -# Install source package to get it to generate vignettes rds files, etc. -if [ -n "$CLEAN_INSTALL" ] -then - echo "Removing lib path and installing from source package" - LIB_DIR="$FWDIR/lib" - rm -rf $LIB_DIR - mkdir -p $LIB_DIR - "$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR - - # Zip the SparkR package so that it can be distributed to worker nodes on YARN - pushd $LIB_DIR > /dev/null - jar cfM "$LIB_DIR/sparkr.zip" SparkR - popd > /dev/null -fi - popd > /dev/null http://git-wip-us.apache.org/repos/asf/spark/blob/c84f7d3e/R/create-docs.sh -- diff --git a/R/create-docs.sh b/R/create-docs.sh index 84e6aa9..6bef7e7 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -29,18 +29,19 @@ set -o pipefail set -e # Figure out where the script is -export FWDIR="$(cd "`dirname "$0"`"; pwd)" -export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +export FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" +export SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)" # Required for setting SPARK_SCALA_VERSION . "${SPARK_HOME}"/bin/load-spark-env.sh echo "Using Scala $SPARK_SCALA_VERSION" -pushd $FWDIR +pushd $FWDIR > /dev/null +. $FWDIR/find-r.sh # Install the package (this will also generate the Rd files) -./install-dev.sh +. $FWDIR/install-dev.sh # Now create HTML files @@ -48,7 +49,7 @@ pushd $FWDIR mkdir -p pkg/html pushd pkg/html -Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))' +"$R_SCRIPT_PATH/"Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep=&quo
spark git commit: [SPARK-19066][SPARKR][BACKPORT-2.1] LDA doesn't set optimizer correctly
Repository: spark Updated Branches: refs/heads/branch-2.1 3ec3e3f2e -> 29b954bba [SPARK-19066][SPARKR][BACKPORT-2.1] LDA doesn't set optimizer correctly ## What changes were proposed in this pull request? Back port the fix to SPARK-19066 to 2.1 branch. ## How was this patch tested? Unit tests Author: wm...@hotmail.com Closes #16623 from wangmiao1981/bugport. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29b954bb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29b954bb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29b954bb Branch: refs/heads/branch-2.1 Commit: 29b954bba1a9fa6e3bd823fa36ea7df4c2461381 Parents: 3ec3e3f Author: wm...@hotmail.com Authored: Tue Jan 17 21:24:33 2017 -0800 Committer: Felix Cheung Committed: Tue Jan 17 21:24:33 2017 -0800 -- R/pkg/inst/tests/testthat/test_mllib.R | 4 ++-- mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29b954bb/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 1f2fae9..3891f00 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -860,7 +860,7 @@ test_that("spark.lda with libsvm", { weights <- stats$topicTopTermsWeights vocabulary <- stats$vocabulary - expect_false(isDistributed) + expect_true(isDistributed) expect_true(logLikelihood <= 0 & is.finite(logLikelihood)) expect_true(logPerplexity >= 0 & is.finite(logPerplexity)) expect_equal(vocabSize, 11) @@ -874,7 +874,7 @@ test_that("spark.lda with libsvm", { model2 <- read.ml(modelPath) stats2 <- summary(model2) - expect_false(stats2$isDistributed) + expect_true(stats2$isDistributed) expect_equal(logLikelihood, stats2$logLikelihood) expect_equal(logPerplexity, stats2$logPerplexity) expect_equal(vocabSize, stats2$vocabSize) 
http://git-wip-us.apache.org/repos/asf/spark/blob/29b954bb/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala index cbe6a70..e7851e1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LDAWrapper.scala @@ -122,6 +122,7 @@ private[r] object LDAWrapper extends MLReadable[LDAWrapper] { .setK(k) .setMaxIter(maxIter) .setSubsamplingRate(subsamplingRate) + .setOptimizer(optimizer) val featureSchema = data.schema(features) val stages = featureSchema.dataType match { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19231][SPARKR] add error handling for download and untar for Spark release
Repository: spark Updated Branches: refs/heads/branch-2.1 29b954bba -> 77202a6c5 [SPARK-19231][SPARKR] add error handling for download and untar for Spark release ## What changes were proposed in this pull request? When R is starting as a package and it needs to download the Spark release distribution we need to handle error for download and untar, and clean up, otherwise it will get stuck. ## How was this patch tested? manually Author: Felix Cheung Closes #16589 from felixcheung/rtarreturncode. (cherry picked from commit 278fa1eb305220a85c816c948932d6af8fa619aa) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77202a6c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77202a6c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77202a6c Branch: refs/heads/branch-2.1 Commit: 77202a6c57e6ac2438cdb6bd232a187b6734fa2b Parents: 29b954b Author: Felix Cheung Authored: Wed Jan 18 09:53:14 2017 -0800 Committer: Felix Cheung Committed: Wed Jan 18 09:53:31 2017 -0800 -- R/pkg/R/install.R | 55 -- 1 file changed, 40 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/77202a6c/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index cb6bbe5..72386e6 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -54,7 +54,7 @@ #' } #' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir #' and force re-install Spark (in case the local directory or file is corrupted) -#' @return \code{install.spark} returns the local directory where Spark is found or installed +#' @return the (invisible) local directory where Spark is found or installed #' @rdname install.spark #' @name install.spark #' @aliases install.spark @@ -115,17 +115,35 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } else { if (releaseUrl != "") { message("Downloading from alternate URL:\n- ", releaseUrl) - 
downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", releaseUrl)) + success <- downloadUrl(releaseUrl, packageLocalPath) + if (!success) { +unlink(packageLocalPath) +stop(paste0("Fetch failed from ", releaseUrl)) + } } else { robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } } message(sprintf("Installing to %s", localDir)) - untar(tarfile = packageLocalPath, exdir = localDir) - if (!tarExists || overwrite) { + # There are two ways untar can fail - untar could stop() on errors like incomplete block on file + # or, tar command can return failure code + success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0, + error = function(e) { + message(e) + message() + FALSE + }, + warning = function(w) { + # Treat warning as error, add an empty line with message() + message(w) + message() + FALSE + }) + if (!tarExists || overwrite || !success) { unlink(packageLocalPath) } + if (!success) stop("Extract archive failed.") message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) message(paste("SPARK_HOME set to", packageLocalDir)) @@ -135,8 +153,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { # step 1: use user-provided url if (!is.null(mirrorUrl)) { -msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) -message(msg) +message("Use user-provided mirror site: ", mirrorUrl) success <- directDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) { @@ -156,7 +173,7 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa packageName, packageLocalPath) if (success) return() } else { -message("Unable to find preferred mirror site.") +message("Unable to download from preferred mirror site: ", mirrorUrl) } # step 3: use backup option @@ -165,8 +182,11 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersio
spark git commit: [SPARK-19231][SPARKR] add error handling for download and untar for Spark release
Repository: spark Updated Branches: refs/heads/master d06172b88 -> 278fa1eb3 [SPARK-19231][SPARKR] add error handling for download and untar for Spark release ## What changes were proposed in this pull request? When R is starting as a package and it needs to download the Spark release distribution we need to handle error for download and untar, and clean up, otherwise it will get stuck. ## How was this patch tested? manually Author: Felix Cheung Closes #16589 from felixcheung/rtarreturncode. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/278fa1eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/278fa1eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/278fa1eb Branch: refs/heads/master Commit: 278fa1eb305220a85c816c948932d6af8fa619aa Parents: d06172b Author: Felix Cheung Authored: Wed Jan 18 09:53:14 2017 -0800 Committer: Felix Cheung Committed: Wed Jan 18 09:53:14 2017 -0800 -- R/pkg/R/install.R | 55 -- 1 file changed, 40 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/278fa1eb/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index cb6bbe5..72386e6 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -54,7 +54,7 @@ #' } #' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir #' and force re-install Spark (in case the local directory or file is corrupted) -#' @return \code{install.spark} returns the local directory where Spark is found or installed +#' @return the (invisible) local directory where Spark is found or installed #' @rdname install.spark #' @name install.spark #' @aliases install.spark @@ -115,17 +115,35 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } else { if (releaseUrl != "") { message("Downloading from alternate URL:\n- ", releaseUrl) - downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", releaseUrl)) + success <- 
downloadUrl(releaseUrl, packageLocalPath) + if (!success) { +unlink(packageLocalPath) +stop(paste0("Fetch failed from ", releaseUrl)) + } } else { robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } } message(sprintf("Installing to %s", localDir)) - untar(tarfile = packageLocalPath, exdir = localDir) - if (!tarExists || overwrite) { + # There are two ways untar can fail - untar could stop() on errors like incomplete block on file + # or, tar command can return failure code + success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0, + error = function(e) { + message(e) + message() + FALSE + }, + warning = function(w) { + # Treat warning as error, add an empty line with message() + message(w) + message() + FALSE + }) + if (!tarExists || overwrite || !success) { unlink(packageLocalPath) } + if (!success) stop("Extract archive failed.") message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) message(paste("SPARK_HOME set to", packageLocalDir)) @@ -135,8 +153,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { # step 1: use user-provided url if (!is.null(mirrorUrl)) { -msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) -message(msg) +message("Use user-provided mirror site: ", mirrorUrl) success <- directDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) { @@ -156,7 +173,7 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa packageName, packageLocalPath) if (success) return() } else { -message("Unable to find preferred mirror site.") +message("Unable to download from preferred mirror site: ", mirrorUrl) } # step 3: use backup option @@ -165,8 +182,11 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
spark git commit: [SPARK-18823][SPARKR] add support for assigning to column
Repository: spark Updated Branches: refs/heads/branch-2.1 570e5e11d -> 9c04e427d [SPARK-18823][SPARKR] add support for assigning to column ## What changes were proposed in this pull request? Support for ``` df[[myname]] <- 1 df[[2]] <- df$eruptions ``` ## How was this patch tested? manual tests, unit tests Author: Felix Cheung Closes #16663 from felixcheung/rcolset. (cherry picked from commit f27e024768e328b96704a9ef35b77381da480328) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c04e427 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c04e427 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c04e427 Branch: refs/heads/branch-2.1 Commit: 9c04e427d0a4b99bfdb6af1ea1bc8c4bdaee724e Parents: 570e5e1 Author: Felix Cheung Authored: Tue Jan 24 00:23:23 2017 -0800 Committer: Felix Cheung Committed: Tue Jan 24 00:23:35 2017 -0800 -- R/pkg/R/DataFrame.R | 48 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 20 +++ 2 files changed, 55 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9c04e427/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index c79b1d3..48ac307 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1711,6 +1711,23 @@ getColumn <- function(x, c) { column(callJMethod(x@sdf, "col", c)) } +setColumn <- function(x, c, value) { + if (class(value) != "Column" && !is.null(value)) { +if (isAtomicLengthOne(value)) { + value <- lit(value) +} else { + stop("value must be a Column, literal value as atomic in length of 1, or NULL") +} + } + + if (is.null(value)) { +nx <- drop(x, c) + } else { +nx <- withColumn(x, c, value) + } + nx +} + #' @param name name of a Column (without being wrapped by \code{""}). 
#' @rdname select #' @name $ @@ -1729,19 +1746,7 @@ setMethod("$", signature(x = "SparkDataFrame"), #' @note $<- since 1.4.0 setMethod("$<-", signature(x = "SparkDataFrame"), function(x, name, value) { -if (class(value) != "Column" && !is.null(value)) { - if (isAtomicLengthOne(value)) { -value <- lit(value) - } else { -stop("value must be a Column, literal value as atomic in length of 1, or NULL") - } -} - -if (is.null(value)) { - nx <- drop(x, name) -} else { - nx <- withColumn(x, name, value) -} +nx <- setColumn(x, name, value) x@sdf <- nx@sdf x }) @@ -1762,6 +1767,21 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"), }) #' @rdname subset +#' @name [[<- +#' @aliases [[<-,SparkDataFrame,numericOrcharacter-method +#' @note [[<- since 2.1.1 +setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"), + function(x, i, value) { +if (is.numeric(i)) { + cols <- columns(x) + i <- cols[[i]] +} +nx <- setColumn(x, i, value) +x@sdf <- nx@sdf +x + }) + +#' @rdname subset #' @name [ #' @aliases [,SparkDataFrame-method #' @note [ since 1.4.0 @@ -1808,6 +1828,8 @@ setMethod("[", signature(x = "SparkDataFrame"), #' @param j,select expression for the single Column or a list of columns to select from the SparkDataFrame. #' @param drop if TRUE, a Column will be returned if the resulting dataset has only one column. #' Otherwise, a SparkDataFrame will always be returned. +#' @param value a Column or an atomic vector in the length of 1 as literal value, or \code{NULL}. +#' If \code{NULL}, the specified Column is dropped. #' @param ... currently not used. #' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns. 
#' @export http://git-wip-us.apache.org/repos/asf/spark/blob/9c04e427/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 7f27ba6..1f9daf5 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_spa
spark git commit: [SPARK-18823][SPARKR] add support for assigning to column
Repository: spark Updated Branches: refs/heads/master ec9493b44 -> f27e02476 [SPARK-18823][SPARKR] add support for assigning to column ## What changes were proposed in this pull request? Support for ``` df[[myname]] <- 1 df[[2]] <- df$eruptions ``` ## How was this patch tested? manual tests, unit tests Author: Felix Cheung Closes #16663 from felixcheung/rcolset. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f27e0247 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f27e0247 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f27e0247 Branch: refs/heads/master Commit: f27e024768e328b96704a9ef35b77381da480328 Parents: ec9493b Author: Felix Cheung Authored: Tue Jan 24 00:23:23 2017 -0800 Committer: Felix Cheung Committed: Tue Jan 24 00:23:23 2017 -0800 -- R/pkg/R/DataFrame.R | 48 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 20 +++ 2 files changed, 55 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f27e0247/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 3d912c9..0a10122 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1717,6 +1717,23 @@ getColumn <- function(x, c) { column(callJMethod(x@sdf, "col", c)) } +setColumn <- function(x, c, value) { + if (class(value) != "Column" && !is.null(value)) { +if (isAtomicLengthOne(value)) { + value <- lit(value) +} else { + stop("value must be a Column, literal value as atomic in length of 1, or NULL") +} + } + + if (is.null(value)) { +nx <- drop(x, c) + } else { +nx <- withColumn(x, c, value) + } + nx +} + #' @param name name of a Column (without being wrapped by \code{""}). 
#' @rdname select #' @name $ @@ -1735,19 +1752,7 @@ setMethod("$", signature(x = "SparkDataFrame"), #' @note $<- since 1.4.0 setMethod("$<-", signature(x = "SparkDataFrame"), function(x, name, value) { -if (class(value) != "Column" && !is.null(value)) { - if (isAtomicLengthOne(value)) { -value <- lit(value) - } else { -stop("value must be a Column, literal value as atomic in length of 1, or NULL") - } -} - -if (is.null(value)) { - nx <- drop(x, name) -} else { - nx <- withColumn(x, name, value) -} +nx <- setColumn(x, name, value) x@sdf <- nx@sdf x }) @@ -1768,6 +1773,21 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"), }) #' @rdname subset +#' @name [[<- +#' @aliases [[<-,SparkDataFrame,numericOrcharacter-method +#' @note [[<- since 2.1.1 +setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"), + function(x, i, value) { +if (is.numeric(i)) { + cols <- columns(x) + i <- cols[[i]] +} +nx <- setColumn(x, i, value) +x@sdf <- nx@sdf +x + }) + +#' @rdname subset #' @name [ #' @aliases [,SparkDataFrame-method #' @note [ since 1.4.0 @@ -1814,6 +1834,8 @@ setMethod("[", signature(x = "SparkDataFrame"), #' @param j,select expression for the single Column or a list of columns to select from the SparkDataFrame. #' @param drop if TRUE, a Column will be returned if the resulting dataset has only one column. #' Otherwise, a SparkDataFrame will always be returned. +#' @param value a Column or an atomic vector in the length of 1 as literal value, or \code{NULL}. +#' If \code{NULL}, the specified Column is dropped. #' @param ... currently not used. #' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns. 
#' @export http://git-wip-us.apache.org/repos/asf/spark/blob/f27e0247/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 2601742..aaa8fb4 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1021,6 +1021,9 @@ test_that("select operators", { df$age2 <- df$age * 2 exp
spark git commit: [SPARK-18821][SPARKR] Bisecting k-means wrapper in SparkR
Repository: spark Updated Branches: refs/heads/master 1191fe267 -> c0ba28430 [SPARK-18821][SPARKR] Bisecting k-means wrapper in SparkR ## What changes were proposed in this pull request? Add R wrapper for bisecting Kmeans. As JIRA is down, I will update title to link with corresponding JIRA later. ## How was this patch tested? Add new unit tests. Author: wm...@hotmail.com Closes #16566 from wangmiao1981/bk. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0ba2843 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0ba2843 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0ba2843 Branch: refs/heads/master Commit: c0ba284300e494354f5bb205a10a12ac7daa2b5e Parents: 1191fe2 Author: wm...@hotmail.com Authored: Thu Jan 26 21:01:59 2017 -0800 Committer: Felix Cheung Committed: Thu Jan 26 21:01:59 2017 -0800 -- R/pkg/NAMESPACE | 3 +- R/pkg/R/generics.R | 5 + R/pkg/R/mllib_clustering.R | 149 +++ R/pkg/R/mllib_utils.R | 10 +- .../inst/tests/testthat/test_mllib_clustering.R | 40 + .../spark/ml/r/BisectingKMeansWrapper.scala | 143 ++ .../scala/org/apache/spark/ml/r/RWrappers.scala | 2 + 7 files changed, 347 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0ba2843/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 0cd9cb8..caa1c3b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -47,7 +47,8 @@ exportMethods("glm", "spark.kstest", "spark.logit", "spark.randomForest", - "spark.gbt") + "spark.gbt", + "spark.bisectingKmeans") # Job group lifecycle management methods export("setJobGroup", http://git-wip-us.apache.org/repos/asf/spark/blob/c0ba2843/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 499c7b2..433c166 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1338,6 +1338,11 @@ setGeneric("rbind", signature = "...") #' @export setGeneric("spark.als", function(data, ...) 
{ standardGeneric("spark.als") }) +#' @rdname spark.bisectingKmeans +#' @export +setGeneric("spark.bisectingKmeans", + function(data, formula, ...) { standardGeneric("spark.bisectingKmeans") }) + #' @rdname spark.gaussianMixture #' @export setGeneric("spark.gaussianMixture", http://git-wip-us.apache.org/repos/asf/spark/blob/c0ba2843/R/pkg/R/mllib_clustering.R -- diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R index fa40f9d..05bbab6 100644 --- a/R/pkg/R/mllib_clustering.R +++ b/R/pkg/R/mllib_clustering.R @@ -17,6 +17,13 @@ # mllib_clustering.R: Provides methods for MLlib clustering algorithms integration +#' S4 class that represents a BisectingKMeansModel +#' +#' @param jobj a Java object reference to the backing Scala BisectingKMeansModel +#' @export +#' @note BisectingKMeansModel since 2.2.0 +setClass("BisectingKMeansModel", representation(jobj = "jobj")) + #' S4 class that represents a GaussianMixtureModel #' #' @param jobj a Java object reference to the backing Scala GaussianMixtureModel @@ -38,6 +45,148 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' @note LDAModel since 2.1.0 setClass("LDAModel", representation(jobj = "jobj")) +#' Bisecting K-Means Clustering Model +#' +#' Fits a bisecting k-means clustering model against a Spark DataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#'operators are supported, including '~', '.', ':', '+', and '-'. +#'Note that the response variable of formula is empty in spark.bisectingKmeans. +#' @param k the desired number of leaf clusters. Must be > 1. +#' The actual number could be smaller if there are no divisible leaf clusters. +#' @param maxIter maximum iteration number. 
+#' @param seed the random seed. +#' @param minDivisibleClusterSize The minimum number of points (if greater than or equal to 1.0) +#'or the minimum proportion of points (if less than 1.0) of a divisible cluster. +#'Note that it is an expert parameter. The default value should be good enough +
spark git commit: [SPARK-18788][SPARKR] Add API for getNumPartitions
Repository: spark Updated Branches: refs/heads/master c0ba28430 -> 90817a6cd [SPARK-18788][SPARKR] Add API for getNumPartitions ## What changes were proposed in this pull request? With doc to say this would convert DF into RDD ## How was this patch tested? unit tests, manual tests Author: Felix Cheung Closes #16668 from felixcheung/rgetnumpartitions. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/90817a6c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/90817a6c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/90817a6c Branch: refs/heads/master Commit: 90817a6cd06068fa9f9ff77384a1fcba73b43006 Parents: c0ba284 Author: Felix Cheung Authored: Thu Jan 26 21:06:39 2017 -0800 Committer: Felix Cheung Committed: Thu Jan 26 21:06:39 2017 -0800 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 23 R/pkg/R/RDD.R | 30 +- R/pkg/R/generics.R| 8 +-- R/pkg/R/pairRDD.R | 4 ++-- R/pkg/inst/tests/testthat/test_rdd.R | 10 - R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 ++-- 7 files changed, 59 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/90817a6c/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index caa1c3b..7ff6e9a 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -95,6 +95,7 @@ exportMethods("arrange", "freqItems", "gapply", "gapplyCollect", + "getNumPartitions", "group_by", "groupBy", "head", http://git-wip-us.apache.org/repos/asf/spark/blob/90817a6c/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0a10122..523343e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3428,3 +3428,26 @@ setMethod("randomSplit", } sapply(sdfs, dataFrame) }) + +#' getNumPartitions +#' +#' Return the number of partitions +#' +#' @param x A SparkDataFrame +#' @family SparkDataFrame functions +#' @aliases getNumPartitions,SparkDataFrame-method +#' @rdname getNumPartitions +#' @name getNumPartitions +#' @export +#' 
@examples +#'\dontrun{ +#' sparkR.session() +#' df <- createDataFrame(cars, numPartitions = 2) +#' getNumPartitions(df) +#' } +#' @note getNumPartitions since 2.1.1 +setMethod("getNumPartitions", + signature(x = "SparkDataFrame"), + function(x) { +callJMethod(callJMethod(x@sdf, "rdd"), "getNumPartitions") + }) http://git-wip-us.apache.org/repos/asf/spark/blob/90817a6c/R/pkg/R/RDD.R -- diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 0f1162f..91bab33 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -313,7 +313,7 @@ setMethod("checkpoint", #' @rdname getNumPartitions #' @aliases getNumPartitions,RDD-method #' @noRd -setMethod("getNumPartitions", +setMethod("getNumPartitionsRDD", signature(x = "RDD"), function(x) { callJMethod(getJRDD(x), "getNumPartitions") @@ -329,7 +329,7 @@ setMethod("numPartitions", signature(x = "RDD"), function(x) { .Deprecated("getNumPartitions") -getNumPartitions(x) +getNumPartitionsRDD(x) }) #' Collect elements of an RDD @@ -460,7 +460,7 @@ setMethod("countByValue", signature(x = "RDD"), function(x) { ones <- lapply(x, function(item) { list(item, 1L) }) -collectRDD(reduceByKey(ones, `+`, getNumPartitions(x))) +collectRDD(reduceByKey(ones, `+`, getNumPartitionsRDD(x))) }) #' Apply a function to all elements @@ -780,7 +780,7 @@ setMethod("takeRDD", resList <- list() index <- -1 jrdd <- getJRDD(x) -numPartitions <- getNumPartitions(x) +numPartitions <- getNumPartitionsRDD(x) serializedModeRDD <- getSerializedMode(x) # TODO(shivaram): Collect more than one partition based on size @@ -846,7 +846,7 @@ setMethod("firstRDD", #' @noRd setMeth
spark git commit: [SPARK-18788][SPARKR] Add API for getNumPartitions
Repository: spark Updated Branches: refs/heads/branch-2.1 59502bbcf -> ba2a5ada4 [SPARK-18788][SPARKR] Add API for getNumPartitions ## What changes were proposed in this pull request? With doc to say this would convert DF into RDD ## How was this patch tested? unit tests, manual tests Author: Felix Cheung Closes #16668 from felixcheung/rgetnumpartitions. (cherry picked from commit 90817a6cd06068fa9f9ff77384a1fcba73b43006) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ba2a5ada Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ba2a5ada Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ba2a5ada Branch: refs/heads/branch-2.1 Commit: ba2a5ada4825a9ca3e4e954a51574a2eede096a3 Parents: 59502bb Author: Felix Cheung Authored: Thu Jan 26 21:06:39 2017 -0800 Committer: Felix Cheung Committed: Thu Jan 26 21:06:54 2017 -0800 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 23 R/pkg/R/RDD.R | 30 +- R/pkg/R/generics.R| 8 +-- R/pkg/R/pairRDD.R | 4 ++-- R/pkg/inst/tests/testthat/test_rdd.R | 10 - R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 ++-- 7 files changed, 59 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ba2a5ada/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index c3ec3f4..8a19fd0 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -94,6 +94,7 @@ exportMethods("arrange", "freqItems", "gapply", "gapplyCollect", + "getNumPartitions", "group_by", "groupBy", "head", http://git-wip-us.apache.org/repos/asf/spark/blob/ba2a5ada/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 48ac307..39e8376 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3422,3 +3422,26 @@ setMethod("randomSplit", } sapply(sdfs, dataFrame) }) + +#' getNumPartitions +#' +#' Return the number of partitions +#' +#' @param x A SparkDataFrame +#' @family SparkDataFrame functions +#' @aliases 
getNumPartitions,SparkDataFrame-method +#' @rdname getNumPartitions +#' @name getNumPartitions +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' df <- createDataFrame(cars, numPartitions = 2) +#' getNumPartitions(df) +#' } +#' @note getNumPartitions since 2.1.1 +setMethod("getNumPartitions", + signature(x = "SparkDataFrame"), + function(x) { +callJMethod(callJMethod(x@sdf, "rdd"), "getNumPartitions") + }) http://git-wip-us.apache.org/repos/asf/spark/blob/ba2a5ada/R/pkg/R/RDD.R -- diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 0f1162f..91bab33 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -313,7 +313,7 @@ setMethod("checkpoint", #' @rdname getNumPartitions #' @aliases getNumPartitions,RDD-method #' @noRd -setMethod("getNumPartitions", +setMethod("getNumPartitionsRDD", signature(x = "RDD"), function(x) { callJMethod(getJRDD(x), "getNumPartitions") @@ -329,7 +329,7 @@ setMethod("numPartitions", signature(x = "RDD"), function(x) { .Deprecated("getNumPartitions") -getNumPartitions(x) +getNumPartitionsRDD(x) }) #' Collect elements of an RDD @@ -460,7 +460,7 @@ setMethod("countByValue", signature(x = "RDD"), function(x) { ones <- lapply(x, function(item) { list(item, 1L) }) -collectRDD(reduceByKey(ones, `+`, getNumPartitions(x))) +collectRDD(reduceByKey(ones, `+`, getNumPartitionsRDD(x))) }) #' Apply a function to all elements @@ -780,7 +780,7 @@ setMethod("takeRDD", resList <- list() index <- -1 jrdd <- getJRDD(x) -numPartitions <- getNumPartitions(x) +numPartitions <- getNumPartitionsRDD(x) serializedModeRDD <- getSerializedMode(x) # TODO(shivaram): Collect more than o