spark git commit: [SPARK-15319][SPARKR][DOCS] Fix SparkR doc layout for corr and other DataFrame stats functions
Repository: spark Updated Branches: refs/heads/branch-2.0 38f3b76bd -> 34feea336 [SPARK-15319][SPARKR][DOCS] Fix SparkR doc layout for corr and other DataFrame stats functions ## What changes were proposed in this pull request? Doc only changes. Please see screenshots. Before: http://spark.apache.org/docs/latest/api/R/statfunctions.html ![image](https://cloud.githubusercontent.com/assets/8969467/15264110/cd458826-1924-11e6-85bd-8ee2e2e1a85f.png) After ![image](https://cloud.githubusercontent.com/assets/8969467/16218452/b9e89f08-3732-11e6-969d-a3a1796e7ad0.png) (please ignore the style differences - this is due to not having the css in my local copy) This is still a bit weird. As discussed in SPARK-15237, I think the better approach is to separate out the DataFrame stats function instead of putting everything on one page. At least now it is clearer which description is on which function. ## How was this patch tested? Build doc Author: Felix Cheung Author: felixcheung Closes #13109 from felixcheung/rstatdoc. (cherry picked from commit 843a1eba8ec9d5a7beac0c74b54d24cb3c41b45a) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34feea33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34feea33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34feea33 Branch: refs/heads/branch-2.0 Commit: 34feea336886b241135e6c60677000c2ca6b52b4 Parents: 38f3b76 Author: Felix Cheung Authored: Tue Jun 21 00:19:09 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 00:19:18 2016 -0700 -- R/pkg/R/generics.R | 8 R/pkg/R/stats.R| 32 +--- 2 files changed, 17 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34feea33/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ead403b..43395aa 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -434,19 +434,19 @@ setGeneric("coltypes<-", function(x, value) { standardGeneric("coltypes<-") }) #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("cov", function(x, ...) {standardGeneric("cov") }) -#' @rdname statfunctions +#' @rdname corr #' @export setGeneric("corr", function(x, ...) {standardGeneric("corr") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") }) -#' @rdname statfunctions +#' @rdname covar_pop #' @export setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") }) http://git-wip-us.apache.org/repos/asf/spark/blob/34feea33/R/pkg/R/stats.R -- diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index e92b9e3..e40b177 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,9 +19,10 @@ setOldClass("jobj") -#' crosstab -#' -#' Computes a pair-wise frequency table of the given columns. Also known as a contingency +#' @title SparkDataFrame statistic functions + +#' @description +#' crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' @@ -49,8 +50,6 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov -#' #' Calculate the sample covariance of two numerical columns of a SparkDataFrame. 
#' #' @param x A SparkDataFrame @@ -58,7 +57,7 @@ setMethod("crosstab", #' @param col2 the name of the second column #' @return the covariance of the two columns. #' -#' @rdname statfunctions +#' @rdname cov #' @name cov #' @export #' @examples @@ -75,8 +74,6 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr -#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. @@ -88,7 +85,7 @@ setMethod("cov", #' only "pearson" is allowed now. #' @return The Pearson Correlation Coefficient as a Double.
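For reference, a minimal sketch of the two stats functions whose Rd pages were split out above (assumes a running SparkR 2.0 session; the mtcars data and column choices are illustrative, not from the commit):

```r
library(SparkR)
sparkR.session()
df <- createDataFrame(mtcars)

# Sample covariance of two numeric columns (now documented under @rdname cov)
cov(df, "mpg", "wt")

# Pearson correlation (now under @rdname corr); only "pearson" is supported
corr(df, "mpg", "wt", method = "pearson")
```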
spark git commit: [SPARK-16109][SPARKR][DOC] R more doc fixes
Repository: spark Updated Branches: refs/heads/master 2d6919bea -> 57746295e [SPARK-16109][SPARKR][DOC] R more doc fixes ## What changes were proposed in this pull request? Found these issues while reviewing for SPARK-16090 ## How was this patch tested? roxygen2 doc gen, checked output html Author: Felix Cheung Closes #13803 from felixcheung/rdocrd. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57746295 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57746295 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57746295 Branch: refs/heads/master Commit: 57746295e6fb705f8393a00ab1cc570ddb7da44e Parents: 2d6919b Author: Felix Cheung Authored: Tue Jun 21 11:01:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 11:01:42 2016 -0700 -- R/pkg/R/DataFrame.R | 7 +-- R/pkg/R/functions.R | 4 +++- R/pkg/R/generics.R | 8 R/pkg/R/schema.R| 7 +-- R/pkg/R/stats.R | 37 +++-- 5 files changed, 40 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57746295/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a8ade1a..ed0bb85 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -59,6 +59,7 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { #' @export #' @param sdf A Java object reference to the backing Scala DataFrame #' @param isCached TRUE if the SparkDataFrame is cached +#' @noRd dataFrame <- function(sdf, isCached = FALSE) { new("SparkDataFrame", sdf, isCached) } @@ -119,7 +120,7 @@ setMethod("schema", #' Print the logical and physical Catalyst plans to the console for debugging. #' #' @param x A SparkDataFrame -#' @param extended Logical. If extended is False, explain() only prints the physical plan. +#' @param extended Logical. If extended is FALSE, explain() only prints the physical plan. #' @family SparkDataFrame functions #' @rdname explain #' @name explain @@ -175,6 +176,8 @@ setMethod("isLocal", #' #' @param x A SparkDataFrame #' @param numRows The number of rows to print. Defaults to 20. +#' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be +#' truncated and all cells will be aligned right #' #' @family SparkDataFrame functions #' @rdname showDF @@ -1854,7 +1857,7 @@ setMethod("withColumnRenamed", select(x, cols) }) -#' @param newColPair A named pair of the form new_column_name = existing_column +#' @param ... A named pair of the form new_column_name = existing_column #' @rdname rename #' @name rename #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/57746295/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 6e0009f..09e5afa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1777,7 +1777,7 @@ setMethod("months_between", signature(y = "Column"), #' nanvl #' #' Returns col1 if it is not NaN, or col2 if col1 is NaN. -#' hhBoth inputs should be floating point columns (DoubleType or FloatType). +#' Both inputs should be floating point columns (DoubleType or FloatType). #' #' @rdname nanvl #' @name nanvl @@ -2008,6 +2008,8 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' NOTE: The position is not zero based, but 1 based index, returns 0 if substr #' could not be found in str. 
#' +#' @param y column to check +#' @param x substring to check #' @family string_funcs #' @rdname instr #' @name instr http://git-wip-us.apache.org/repos/asf/spark/blob/57746295/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 43395aa..7b08a8e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -59,15 +59,15 @@ setGeneric("count", function(x) { standardGeneric("count") }) # @export setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) -# @rdname statfunctions +# @rdname crosstab # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) -# @rdname statfunctions +# @rdname freqItems # @export setGeneric("freqItems", function(x, cols, supp
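A short sketch of two of the APIs whose roxygen2 docs were fixed above, showing the newly documented parameters (assumes a running session; the sample data is made up):

```r
library(SparkR)
sparkR.session()
df <- createDataFrame(data.frame(text = c("spark", "sparkr", "flink"),
                                 stringsAsFactors = FALSE))

# instr(y, x): 1-based position of substring x in column y, 0 when absent
head(select(df, instr(df$text, "ark")))

# truncate = TRUE cuts strings longer than 20 characters and right-aligns cells
showDF(df, numRows = 5, truncate = TRUE)
```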
spark git commit: [SPARK-16109][SPARKR][DOC] R more doc fixes
Repository: spark Updated Branches: refs/heads/branch-2.0 703a526e7 -> 867baaada [SPARK-16109][SPARKR][DOC] R more doc fixes ## What changes were proposed in this pull request? Found these issues while reviewing for SPARK-16090 ## How was this patch tested? roxygen2 doc gen, checked output html Author: Felix Cheung Closes #13803 from felixcheung/rdocrd. (cherry picked from commit 57746295e6fb705f8393a00ab1cc570ddb7da44e) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/867baaad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/867baaad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/867baaad Branch: refs/heads/branch-2.0 Commit: 867baaadad48a378b36933df0635a09cddc4c8de Parents: 703a526 Author: Felix Cheung Authored: Tue Jun 21 11:01:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 11:01:56 2016 -0700 -- R/pkg/R/DataFrame.R | 7 +-- R/pkg/R/functions.R | 4 +++- R/pkg/R/generics.R | 8 R/pkg/R/schema.R| 7 +-- R/pkg/R/stats.R | 37 +++-- 5 files changed, 40 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/867baaad/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a8ade1a..ed0bb85 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -59,6 +59,7 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { #' @export #' @param sdf A Java object reference to the backing Scala DataFrame #' @param isCached TRUE if the SparkDataFrame is cached +#' @noRd dataFrame <- function(sdf, isCached = FALSE) { new("SparkDataFrame", sdf, isCached) } @@ -119,7 +120,7 @@ setMethod("schema", #' Print the logical and physical Catalyst plans to the console for debugging. #' #' @param x A SparkDataFrame -#' @param extended Logical. If extended is False, explain() only prints the physical plan. +#' @param extended Logical. If extended is FALSE, explain() only prints the physical plan. #' @family SparkDataFrame functions #' @rdname explain #' @name explain @@ -175,6 +176,8 @@ setMethod("isLocal", #' #' @param x A SparkDataFrame #' @param numRows The number of rows to print. Defaults to 20. +#' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be +#' truncated and all cells will be aligned right #' #' @family SparkDataFrame functions #' @rdname showDF @@ -1854,7 +1857,7 @@ setMethod("withColumnRenamed", select(x, cols) }) -#' @param newColPair A named pair of the form new_column_name = existing_column +#' @param ... A named pair of the form new_column_name = existing_column #' @rdname rename #' @name rename #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/867baaad/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 6e0009f..09e5afa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1777,7 +1777,7 @@ setMethod("months_between", signature(y = "Column"), #' nanvl #' #' Returns col1 if it is not NaN, or col2 if col1 is NaN. -#' hhBoth inputs should be floating point columns (DoubleType or FloatType). +#' Both inputs should be floating point columns (DoubleType or FloatType). #' #' @rdname nanvl #' @name nanvl @@ -2008,6 +2008,8 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' NOTE: The position is not zero based, but 1 based index, returns 0 if substr #' could not be found in str. 
#' +#' @param y column to check +#' @param x substring to check #' @family string_funcs #' @rdname instr #' @name instr http://git-wip-us.apache.org/repos/asf/spark/blob/867baaad/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 43395aa..7b08a8e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -59,15 +59,15 @@ setGeneric("count", function(x) { standardGeneric("count") }) # @export setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) -# @rdname statfunctions +# @rdname crosstab # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab&qu
spark git commit: [SPARK-16096][SPARKR] add union and deprecate unionAll
Repository: spark Updated Branches: refs/heads/master 918c91954 -> dbfdae4e4 [SPARK-16096][SPARKR] add union and deprecate unionAll ## What changes were proposed in this pull request? add union and deprecate unionAll, separate roxygen2 doc for rbind (since their usage and parameter lists are quite different) `explode` is also deprecated - but seems like replacement is a combination of calls; not sure if we should deprecate it in SparkR, yet. ## How was this patch tested? unit tests, manual checks for r doc Author: Felix Cheung Closes #13805 from felixcheung/runion. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbfdae4e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbfdae4e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbfdae4e Branch: refs/heads/master Commit: dbfdae4e41a900de01b48639d6554d32edbb2e0b Parents: 918c919 Author: Felix Cheung Authored: Tue Jun 21 13:36:50 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 13:36:50 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 43 -- R/pkg/R/generics.R| 6 +++- R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 +++-- 5 files changed, 47 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dbfdae4e/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ea42888..2272d8b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -107,6 +107,7 @@ exportMethods("arrange", "summary", "take", "transform", + "union", "unionAll", "unique", "unpersist", http://git-wip-us.apache.org/repos/asf/spark/blob/dbfdae4e/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ed0bb85..725cbf2 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2251,7 +2251,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { cols } -#' rbind +#' Return a new SparkDataFrame containing the union of rows #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame #' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL. @@ -2261,39 +2261,64 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' @param y A SparkDataFrame #' @return A SparkDataFrame containing the result of the union. #' @family SparkDataFrame functions -#' @rdname rbind -#' @name unionAll +#' @rdname union +#' @name union +#' @seealso \link{rbind} #' @export #' @examples #'\dontrun{ #' sparkR.session() #' df1 <- read.json(path) #' df2 <- read.json(path2) -#' unioned <- unionAll(df, df2) +#' unioned <- union(df, df2) +#' unions <- rbind(df, df2, df3, df4) #' } +#' @note union since 2.0.0 +setMethod("union", + signature(x = "SparkDataFrame", y = "SparkDataFrame"), + function(x, y) { +unioned <- callJMethod(x@sdf, "union", y@sdf) +dataFrame(unioned) + }) + +#' unionAll is deprecated - use union instead +#' @rdname union +#' @name unionAll +#' @export #' @note unionAll since 1.4.0 setMethod("unionAll", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { -unioned <- callJMethod(x@sdf, "unionAll", y@sdf) -dataFrame(unioned) +.Deprecated("union") +union(x, y) }) #' Union two or more SparkDataFrames #' -#' Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL. +#' Note that this does not remove duplicate rows across the two SparkDataFrames. 
#' +#' @param x A SparkDataFrame +#' @param ... Additional SparkDataFrame +#' @return A SparkDataFrame containing the result of the union. +#' @family SparkDataFrame functions #' @rdname rbind #' @name rbind +#' @seealso \link{union} #' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' unions <- rbind(df, df2, df3, df4) +#' } #' @note rbind since 1.
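A minimal sketch of the resulting API surface (assumes a SparkR 2.0 session; df1/df2 are toy frames):

```r
library(SparkR)
sparkR.session()
df1 <- createDataFrame(data.frame(a = 1:3))
df2 <- createDataFrame(data.frame(a = 2:4))

u <- union(df1, df2)          # the new name since 2.0.0; UNION ALL semantics
ua <- unionAll(df1, df2)      # still works, but warns that union() replaces it
all3 <- rbind(df1, df2, df1)  # variadic union of two or more frames
count(all3)                   # 9: duplicate rows are kept
```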
spark git commit: [SPARK-16096][SPARKR] add union and deprecate unionAll
Repository: spark Updated Branches: refs/heads/branch-2.0 591bf7909 -> aeda9a153 [SPARK-16096][SPARKR] add union and deprecate unionAll ## What changes were proposed in this pull request? add union and deprecate unionAll, separate roxygen2 doc for rbind (since their usage and parameter lists are quite different) `explode` is also deprecated - but seems like replacement is a combination of calls; not sure if we should deprecate it in SparkR, yet. ## How was this patch tested? unit tests, manual checks for r doc Author: Felix Cheung Closes #13805 from felixcheung/runion. (cherry picked from commit dbfdae4e41a900de01b48639d6554d32edbb2e0b) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aeda9a15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aeda9a15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aeda9a15 Branch: refs/heads/branch-2.0 Commit: aeda9a153c117921e95cf204daab0df3202f1d95 Parents: 591bf79 Author: Felix Cheung Authored: Tue Jun 21 13:36:50 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 13:36:58 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 43 -- R/pkg/R/generics.R| 6 +++- R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 +++-- 5 files changed, 47 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ea42888..2272d8b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -107,6 +107,7 @@ exportMethods("arrange", "summary", "take", "transform", + "union", "unionAll", "unique", "unpersist", http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ed0bb85..725cbf2 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2251,7 +2251,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { cols } -#' rbind +#' Return a new SparkDataFrame containing the union of rows #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame #' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL. @@ -2261,39 +2261,64 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' @param y A SparkDataFrame #' @return A SparkDataFrame containing the result of the union. #' @family SparkDataFrame functions -#' @rdname rbind -#' @name unionAll +#' @rdname union +#' @name union +#' @seealso \link{rbind} #' @export #' @examples #'\dontrun{ #' sparkR.session() #' df1 <- read.json(path) #' df2 <- read.json(path2) -#' unioned <- unionAll(df, df2) +#' unioned <- union(df, df2) +#' unions <- rbind(df, df2, df3, df4) #' } +#' @note union since 2.0.0 +setMethod("union", + signature(x = "SparkDataFrame", y = "SparkDataFrame"), + function(x, y) { +unioned <- callJMethod(x@sdf, "union", y@sdf) +dataFrame(unioned) + }) + +#' unionAll is deprecated - use union instead +#' @rdname union +#' @name unionAll +#' @export #' @note unionAll since 1.4.0 setMethod("unionAll", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { -unioned <- callJMethod(x@sdf, "unionAll", y@sdf) -dataFrame(unioned) +.Deprecated("union") +union(x, y) }) #' Union two or more SparkDataFrames #' -#' Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL. 
+#' Note that this does not remove duplicate rows across the two SparkDataFrames. #' +#' @param x A SparkDataFrame +#' @param ... Additional SparkDataFrame +#' @return A SparkDataFrame containing the result of the union. +#' @family SparkDataFrame functions #' @rdname rbind #' @name rbind +#' @seealso \link{union} #' @export +#' @examples +#'
spark git commit: [SPARK-16088][SPARKR] update setJobGroup, cancelJobGroup, clearJobGroup
Repository: spark Updated Branches: refs/heads/master 65d1f0f71 -> b5a997667 [SPARK-16088][SPARKR] update setJobGroup, cancelJobGroup, clearJobGroup ## What changes were proposed in this pull request? Updated setJobGroup, cancelJobGroup, clearJobGroup to not require sc/SparkContext as parameter. Also updated roxygen2 doc and R programming guide on deprecations. ## How was this patch tested? unit tests Author: Felix Cheung Closes #13838 from felixcheung/rjobgroup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5a99766 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5a99766 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5a99766 Branch: refs/heads/master Commit: b5a997667f4c0e514217da6df5af37b8b849dfdf Parents: 65d1f0f Author: Felix Cheung Authored: Thu Jun 23 09:45:01 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jun 23 09:45:01 2016 -0700 -- R/pkg/R/DataFrame.R | 1 - R/pkg/R/context.R| 10 +--- R/pkg/R/sparkR.R | 68 ++- R/pkg/R/utils.R | 8 R/pkg/inst/tests/testthat/test_context.R | 10 ++-- docs/sparkr.md | 2 + 6 files changed, 75 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5a99766/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 725cbf2..f856979 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -55,7 +55,6 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { .Object }) -#' @rdname SparkDataFrame #' @export #' @param sdf A Java object reference to the backing Scala DataFrame #' @param isCached TRUE if the SparkDataFrame is cached http://git-wip-us.apache.org/repos/asf/spark/blob/b5a99766/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index dd0ceae..2538bb2 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -264,10 +264,7 @@ setCheckpointDir <- function(sc, dirName) { #'} #' @note spark.lapply since 2.0.0 spark.lapply <- function(list, func) { - if (!exists(".sparkRjsc", envir = .sparkREnv)) { -stop("SparkR has not been initialized. Please call sparkR.session()") - } - sc <- get(".sparkRjsc", envir = .sparkREnv) + sc <- getSparkContext() rdd <- parallelize(sc, list, length(list)) results <- map(rdd, func) local <- collect(results) @@ -287,9 +284,6 @@ spark.lapply <- function(list, func) { #'} #' @note setLogLevel since 2.0.0 setLogLevel <- function(level) { - if (!exists(".sparkRjsc", envir = .sparkREnv)) { -stop("SparkR has not been initialized. Please call sparkR.session()") - } - sc <- get(".sparkRjsc", envir = .sparkREnv) + sc <- getSparkContext() callJMethod(sc, "setLogLevel", level) } http://git-wip-us.apache.org/repos/asf/spark/blob/b5a99766/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 2b6e124..62659b0 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -392,47 +392,91 @@ sparkR.session <- function( #' Assigns a group ID to all the jobs started by this thread until the group ID is set to a #' different value or cleared. 
#' -#' @param sc existing spark context #' @param groupid the ID to be assigned to job groups #' @param description description for the job group ID #' @param interruptOnCancel flag to indicate if the job is interrupted on job cancellation +#' @rdname setJobGroup +#' @name setJobGroup #' @examples #'\dontrun{ -#' sc <- sparkR.init() -#' setJobGroup(sc, "myJobGroup", "My job group description", TRUE) +#' sparkR.session() +#' setJobGroup("myJobGroup", "My job group description", TRUE) #'} #' @note setJobGroup since 1.5.0 -setJobGroup <- function(sc, groupId, description, interruptOnCancel) { +#' @method setJobGroup default +setJobGroup.default <- function(groupId, description, interruptOnCancel) { + sc <- getSparkContext() callJMethod(sc, "setJobGroup", groupId, description, interruptOnCancel) } +setJobGroup <- function(sc, groupId, description, interruptOnCancel) { + if (class(sc) == "jobj" && any(grepl("JavaSparkContext", getClassName.jobj(sc { +.Deprecated
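The updated call pattern, per the roxygen example in the diff: no SparkContext argument is passed, and the old sc-first form goes through a deprecation path (a sketch under those assumptions; the group name and job are illustrative):

```r
library(SparkR)
sparkR.session()

# New form, as in the updated roxygen example: no sc argument
setJobGroup("myJobGroup", "My job group description", TRUE)
df <- createDataFrame(data.frame(a = 1:10))
count(df)                     # this job runs under the group ID set above

cancelJobGroup("myJobGroup")  # cancel all active jobs in the group
clearJobGroup()               # clear the group ID for this thread
```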
spark git commit: [SPARK-16088][SPARKR] update setJobGroup, cancelJobGroup, clearJobGroup
Repository: spark Updated Branches: refs/heads/branch-2.0 567093596 -> 9f18c8f38 [SPARK-16088][SPARKR] update setJobGroup, cancelJobGroup, clearJobGroup ## What changes were proposed in this pull request? Updated setJobGroup, cancelJobGroup, clearJobGroup to not require sc/SparkContext as parameter. Also updated roxygen2 doc and R programming guide on deprecations. ## How was this patch tested? unit tests Author: Felix Cheung Closes #13838 from felixcheung/rjobgroup. (cherry picked from commit b5a997667f4c0e514217da6df5af37b8b849dfdf) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f18c8f3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f18c8f3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f18c8f3 Branch: refs/heads/branch-2.0 Commit: 9f18c8f386af558ed72b88ad372835f25e807e79 Parents: 5670935 Author: Felix Cheung Authored: Thu Jun 23 09:45:01 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jun 23 09:45:11 2016 -0700 -- R/pkg/R/DataFrame.R | 1 - R/pkg/R/context.R| 10 +--- R/pkg/R/sparkR.R | 68 ++- R/pkg/R/utils.R | 8 R/pkg/inst/tests/testthat/test_context.R | 10 ++-- docs/sparkr.md | 2 + 6 files changed, 75 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f18c8f3/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 725cbf2..f856979 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -55,7 +55,6 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { .Object }) -#' @rdname SparkDataFrame #' @export #' @param sdf A Java object reference to the backing Scala DataFrame #' @param isCached TRUE if the SparkDataFrame is cached http://git-wip-us.apache.org/repos/asf/spark/blob/9f18c8f3/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index dd0ceae..2538bb2 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -264,10 +264,7 @@ setCheckpointDir <- function(sc, dirName) { #'} #' @note spark.lapply since 2.0.0 spark.lapply <- function(list, func) { - if (!exists(".sparkRjsc", envir = .sparkREnv)) { -stop("SparkR has not been initialized. Please call sparkR.session()") - } - sc <- get(".sparkRjsc", envir = .sparkREnv) + sc <- getSparkContext() rdd <- parallelize(sc, list, length(list)) results <- map(rdd, func) local <- collect(results) @@ -287,9 +284,6 @@ spark.lapply <- function(list, func) { #'} #' @note setLogLevel since 2.0.0 setLogLevel <- function(level) { - if (!exists(".sparkRjsc", envir = .sparkREnv)) { -stop("SparkR has not been initialized. Please call sparkR.session()") - } - sc <- get(".sparkRjsc", envir = .sparkREnv) + sc <- getSparkContext() callJMethod(sc, "setLogLevel", level) } http://git-wip-us.apache.org/repos/asf/spark/blob/9f18c8f3/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 2b6e124..62659b0 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -392,47 +392,91 @@ sparkR.session <- function( #' Assigns a group ID to all the jobs started by this thread until the group ID is set to a #' different value or cleared. 
#' -#' @param sc existing spark context #' @param groupid the ID to be assigned to job groups #' @param description description for the job group ID #' @param interruptOnCancel flag to indicate if the job is interrupted on job cancellation +#' @rdname setJobGroup +#' @name setJobGroup #' @examples #'\dontrun{ -#' sc <- sparkR.init() -#' setJobGroup(sc, "myJobGroup", "My job group description", TRUE) +#' sparkR.session() +#' setJobGroup("myJobGroup", "My job group description", TRUE) #'} #' @note setJobGroup since 1.5.0 -setJobGroup <- function(sc, groupId, description, interruptOnCancel) { +#' @method setJobGroup default +setJobGroup.default <- function(groupId, description, interruptOnCancel) { + sc <- getSparkContext() callJMethod(sc, "setJobGroup", groupId, description, interruptOnCancel) } +setJobGroup <- function(sc, groupId, description, interruptOnCancel) { + if (class(sc) == "jobj&q
spark git commit: [SPARK-16184][SPARKR] conf API for SparkSession
Repository: spark Updated Branches: refs/heads/master e87741589 -> 30b182bcc [SPARK-16184][SPARKR] conf API for SparkSession ## What changes were proposed in this pull request? Add `conf` method to get Runtime Config from SparkSession ## How was this patch tested? unit tests, manual tests This is how it works in sparkR shell: ``` SparkSession available as 'spark'. > conf() $hive.metastore.warehouse.dir [1] "file:/opt/spark-2.0.0-bin-hadoop2.6/R/spark-warehouse" $spark.app.id [1] "local-1466749575523" $spark.app.name [1] "SparkR" $spark.driver.host [1] "10.0.2.1" $spark.driver.port [1] "45629" $spark.executorEnv.LD_LIBRARY_PATH [1] "$LD_LIBRARY_PATH:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/jre/lib/amd64/server" $spark.executor.id [1] "driver" $spark.home [1] "/opt/spark-2.0.0-bin-hadoop2.6" $spark.master [1] "local[*]" $spark.sql.catalogImplementation [1] "hive" $spark.submit.deployMode [1] "client" > conf("spark.master") $spark.master [1] "local[*]" ``` Author: Felix Cheung Closes #13885 from felixcheung/rconf. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30b182bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30b182bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30b182bc Branch: refs/heads/master Commit: 30b182bcc088aef161585211c517f473b9ee6632 Parents: e877415 Author: Felix Cheung Authored: Sun Jun 26 13:10:43 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun Jun 26 13:10:43 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/SQLContext.R| 50 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 12 ++--- .../org/apache/spark/sql/api/r/SQLUtils.scala | 4 ++ 4 files changed, 57 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/30b182bc/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 2272d8b..e0ffde9 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -10,6 +10,7 @@ export("sparkR.session") export("sparkR.init") export("sparkR.stop") export("sparkR.session.stop") +export("sparkR.conf") export("print.jobj") export("sparkRSQL.init", http://git-wip-us.apache.org/repos/asf/spark/blob/30b182bc/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index ee3a41c..8df73db 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -110,11 +110,53 @@ infer_type <- function(x) { } } -getDefaultSqlSource <- function() { +#' Get Runtime Config from the current active SparkSession +#' +#' Get Runtime Config from the current active SparkSession. +#' To change SparkSession Runtime Config, please see `sparkR.session()`. 
+#' +#' @param key (optional) The key of the config to get, if omitted, all config is returned +#' @param defaultValue (optional) The default value of the config to return if they config is not +#' set, if omitted, the call fails if the config key is not set +#' @return a list of config values with keys as their names +#' @rdname sparkR.conf +#' @name sparkR.conf +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' allConfigs <- sparkR.conf() +#' masterValue <- unlist(sparkR.conf("spark.master")) +#' namedConfig <- sparkR.conf("spark.executor.memory", "0g") +#' } +#' @note sparkR.conf since 2.0.0 +sparkR.conf <- function(key, defaultValue) { sparkSession <- getSparkSession() - conf <- callJMethod(sparkSession, "conf") - source <- callJMethod(conf, "get", "spark.sql.sources.default", "org.apache.spark.sql.parquet") - source + if (missing(key)) { +m <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getSessionConf", sparkSession) +as.list(m, all.names = TRUE, sorted = TRUE) + } else { +conf <- callJMethod(sparkSession, "conf") +value <- if (missing(defaultValue)) { + tryCatch(callJMethod(conf, "get", key), + error = function(e) { +if (any(grep("java.util.NoSuchElementException", as.character(e { + stop(paste0("Config '", key, "' is not set")) +} else { +
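Beyond the shell transcript in the commit message, a sketch of how the defaultValue branch behaves (key names are examples):

```r
library(SparkR)
sparkR.session()

allConfigs <- sparkR.conf()                  # named list of every setting
allConfigs$spark.master

sparkR.conf("spark.master")                  # one key -> one-entry named list
sparkR.conf("spark.executor.memory", "1g")   # falls back to "1g" when unset
# sparkR.conf("no.such.key")                 # no default given: stops with
                                             #   "Config 'no.such.key' is not set"
```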
spark git commit: [SPARK-16184][SPARKR] conf API for SparkSession
Repository: spark Updated Branches: refs/heads/branch-2.0 b03b0976f -> e01776395 [SPARK-16184][SPARKR] conf API for SparkSession ## What changes were proposed in this pull request? Add `conf` method to get Runtime Config from SparkSession ## How was this patch tested? unit tests, manual tests This is how it works in sparkR shell: ``` SparkSession available as 'spark'. > conf() $hive.metastore.warehouse.dir [1] "file:/opt/spark-2.0.0-bin-hadoop2.6/R/spark-warehouse" $spark.app.id [1] "local-1466749575523" $spark.app.name [1] "SparkR" $spark.driver.host [1] "10.0.2.1" $spark.driver.port [1] "45629" $spark.executorEnv.LD_LIBRARY_PATH [1] "$LD_LIBRARY_PATH:/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/jre/lib/amd64/server" $spark.executor.id [1] "driver" $spark.home [1] "/opt/spark-2.0.0-bin-hadoop2.6" $spark.master [1] "local[*]" $spark.sql.catalogImplementation [1] "hive" $spark.submit.deployMode [1] "client" > conf("spark.master") $spark.master [1] "local[*]" ``` Author: Felix Cheung Closes #13885 from felixcheung/rconf. (cherry picked from commit 30b182bcc088aef161585211c517f473b9ee6632) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0177639 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0177639 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0177639 Branch: refs/heads/branch-2.0 Commit: e01776395d14bb3ff1b9d6f9317938871457ac2c Parents: b03b097 Author: Felix Cheung Authored: Sun Jun 26 13:10:43 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun Jun 26 13:10:53 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/SQLContext.R| 50 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 12 ++--- .../org/apache/spark/sql/api/r/SQLUtils.scala | 4 ++ 4 files changed, 57 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e0177639/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 2272d8b..e0ffde9 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -10,6 +10,7 @@ export("sparkR.session") export("sparkR.init") export("sparkR.stop") export("sparkR.session.stop") +export("sparkR.conf") export("print.jobj") export("sparkRSQL.init", http://git-wip-us.apache.org/repos/asf/spark/blob/e0177639/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index ee3a41c..8df73db 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -110,11 +110,53 @@ infer_type <- function(x) { } } -getDefaultSqlSource <- function() { +#' Get Runtime Config from the current active SparkSession +#' +#' Get Runtime Config from the current active SparkSession. +#' To change SparkSession Runtime Config, please see `sparkR.session()`. 
+#' +#' @param key (optional) The key of the config to get, if omitted, all config is returned +#' @param defaultValue (optional) The default value of the config to return if they config is not +#' set, if omitted, the call fails if the config key is not set +#' @return a list of config values with keys as their names +#' @rdname sparkR.conf +#' @name sparkR.conf +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' allConfigs <- sparkR.conf() +#' masterValue <- unlist(sparkR.conf("spark.master")) +#' namedConfig <- sparkR.conf("spark.executor.memory", "0g") +#' } +#' @note sparkR.conf since 2.0.0 +sparkR.conf <- function(key, defaultValue) { sparkSession <- getSparkSession() - conf <- callJMethod(sparkSession, "conf") - source <- callJMethod(conf, "get", "spark.sql.sources.default", "org.apache.spark.sql.parquet") - source + if (missing(key)) { +m <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getSessionConf", sparkSession) +as.list(m, all.names = TRUE, sorted = TRUE) + } else { +conf <- callJMethod(sparkSession, "conf") +value <- if (missing(defaultValue)) { + tryCatch(callJMethod(conf, "get", key), + error = function(e) { +if (any(grep("java.util.NoSuchElementException", as.character(e { + stop(paste0("Config &
spark git commit: [MINOR][SPARKR] update sparkR DataFrame.R comment
Repository: spark Updated Branches: refs/heads/master 26252f706 -> d59ba8e30 [MINOR][SPARKR] update sparkR DataFrame.R comment ## What changes were proposed in this pull request? update sparkR DataFrame.R comment SQLContext ==> SparkSession ## How was this patch tested? N/A Author: WeichenXu Closes #13946 from WeichenXu123/sparkR_comment_update_sparkSession. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d59ba8e3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d59ba8e3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d59ba8e3 Branch: refs/heads/master Commit: d59ba8e30751bbf91d49f5530b8242a12bbfb569 Parents: 26252f7 Author: WeichenXu Authored: Tue Jun 28 12:12:20 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 28 12:12:20 2016 -0700 -- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d59ba8e3/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 61d47a8..25327be 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -471,7 +471,7 @@ setMethod("createOrReplaceTempView", #' (Deprecated) Register Temporary Table #' -#' Registers a SparkDataFrame as a Temporary Table in the SQLContext +#' Registers a SparkDataFrame as a Temporary Table in the SparkSession #' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table #' @@ -498,7 +498,7 @@ setMethod("registerTempTable", #' insertInto #' -#' Insert the contents of a SparkDataFrame into a table registered in the current SQL Context. +#' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession. #' #' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table
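The two comments fixed above describe temp-table registration; in 2.0 the registered table is scoped to the SparkSession, e.g. (a sketch; the view name is illustrative):

```r
library(SparkR)
sparkR.session()
df <- createDataFrame(faithful)

# createOrReplaceTempView() is the 2.0 spelling; registerTempTable() is the
# deprecated one mentioned in the diff
createOrReplaceTempView(df, "faithful_tbl")
head(sql("SELECT * FROM faithful_tbl WHERE waiting > 80"))
```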
spark git commit: [MINOR][SPARKR] update sparkR DataFrame.R comment
Repository: spark Updated Branches: refs/heads/branch-2.0 a1d04cc03 -> c86d29b2e [MINOR][SPARKR] update sparkR DataFrame.R comment ## What changes were proposed in this pull request? update sparkR DataFrame.R comment SQLContext ==> SparkSession ## How was this patch tested? N/A Author: WeichenXu Closes #13946 from WeichenXu123/sparkR_comment_update_sparkSession. (cherry picked from commit d59ba8e30751bbf91d49f5530b8242a12bbfb569) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c86d29b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c86d29b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c86d29b2 Branch: refs/heads/branch-2.0 Commit: c86d29b2e6bfda05124c20ba3c6db9275c24faa8 Parents: a1d04cc Author: WeichenXu Authored: Tue Jun 28 12:12:20 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 28 12:12:28 2016 -0700 -- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c86d29b2/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index f856979..567758d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -466,7 +466,7 @@ setMethod("createOrReplaceTempView", #' (Deprecated) Register Temporary Table #' -#' Registers a SparkDataFrame as a Temporary Table in the SQLContext +#' Registers a SparkDataFrame as a Temporary Table in the SparkSession #' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table #' @@ -493,7 +493,7 @@ setMethod("registerTempTable", #' insertInto #' -#' Insert the contents of a SparkDataFrame into a table registered in the current SQL Context. +#' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession. #' #' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table
spark git commit: [SPARKR] add csv tests
Repository: spark Updated Branches: refs/heads/master 5545b7910 -> 823518c2b [SPARKR] add csv tests ## What changes were proposed in this pull request? Add unit tests for csv data for SPARKR ## How was this patch tested? unit tests Author: Felix Cheung Closes #13904 from felixcheung/rcsv. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/823518c2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/823518c2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/823518c2 Branch: refs/heads/master Commit: 823518c2b5259c8a954431467639198c808c9198 Parents: 5545b79 Author: Felix Cheung Authored: Tue Jun 28 17:08:28 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 28 17:08:28 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 ++ 1 file changed, 18 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/823518c2/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 7562fa9..d4662ad 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -208,6 +208,24 @@ test_that("create DataFrame from RDD", { unsetHiveContext() }) +test_that("read csv as DataFrame", { + csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv") + mockLinesCsv <- c("year,make,model,comment,blank", + "\"2012\",\"Tesla\",\"S\",\"No comment\",", + "1997,Ford,E350,\"Go get one now they are going fast\",", + "2015,Chevy,Volt") + writeLines(mockLinesCsv, csvPath) + + # default "header" is false + df <- read.df(csvPath, "csv", header = "true") + expect_equal(count(df), 3) + expect_equal(columns(df), c("year", "make", "model", "comment", "blank")) + expect_equal(sort(unlist(collect(where(df, df$year == "2015", + sort(unlist(list(year = "2015", make = "Chevy", model = "Volt" + + unlink(csvPath) +}) + test_that("convert NAs to null type in DataFrames", { rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L))) df <- createDataFrame(rdd, list("a", "b"))
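The new test covers the read path; a matching write-side sketch follows (it assumes the csv source accepts the same header option on write; paths are temp files):

```r
library(SparkR)
sparkR.session()

csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
writeLines(c("year,make,model", "2015,Chevy,Volt"), csvPath)

df <- read.df(csvPath, "csv", header = "true")  # "header" defaults to false
head(df)

outPath <- tempfile(fileext = ".csv")
write.df(df, path = outPath, source = "csv", header = "true")
unlink(c(csvPath, outPath), recursive = TRUE)
```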
spark git commit: [SPARKR] add csv tests
Repository: spark Updated Branches: refs/heads/branch-2.0 52c9d69f7 -> d7a59f1f4 [SPARKR] add csv tests ## What changes were proposed in this pull request? Add unit tests for csv data for SPARKR ## How was this patch tested? unit tests Author: Felix Cheung Closes #13904 from felixcheung/rcsv. (cherry picked from commit 823518c2b5259c8a954431467639198c808c9198) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7a59f1f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7a59f1f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7a59f1f Branch: refs/heads/branch-2.0 Commit: d7a59f1f450aae06baac96867a26042bd1ccd1d5 Parents: 52c9d69 Author: Felix Cheung Authored: Tue Jun 28 17:08:28 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 28 17:08:36 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 ++ 1 file changed, 18 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7a59f1f/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 74def5c..deda1b6 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -208,6 +208,24 @@ test_that("create DataFrame from RDD", { unsetHiveContext() }) +test_that("read csv as DataFrame", { + csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv") + mockLinesCsv <- c("year,make,model,comment,blank", + "\"2012\",\"Tesla\",\"S\",\"No comment\",", + "1997,Ford,E350,\"Go get one now they are going fast\",", + "2015,Chevy,Volt") + writeLines(mockLinesCsv, csvPath) + + # default "header" is false + df <- read.df(csvPath, "csv", header = "true") + expect_equal(count(df), 3) + expect_equal(columns(df), c("year", "make", "model", "comment", "blank")) + expect_equal(sort(unlist(collect(where(df, df$year == "2015", + sort(unlist(list(year = "2015", make = "Chevy", model = "Volt" + + unlink(csvPath) +}) + test_that("convert NAs to null type in DataFrames", { rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L))) df <- createDataFrame(rdd, list("a", "b"))
spark git commit: [SPARK-16012][SPARKR] Implement gapplyCollect which will apply an R function on each group similar to gapply and collect the result back to R data.frame
Repository: spark Updated Branches: refs/heads/branch-2.0 3665927c6 -> 4c96ded84 [SPARK-16012][SPARKR] Implement gapplyCollect which will apply a R function on each group similar to gapply and collect the result back to R data.frame ## What changes were proposed in this pull request? gapplyCollect() does gapply() on a SparkDataFrame and collect the result back to R. Compared to gapply() + collect(), gapplyCollect() offers performance optimization as well as programming convenience, as no schema is needed to be provided. This is similar to dapplyCollect(). ## How was this patch tested? Added test cases for gapplyCollect similar to dapplyCollect Author: Narine Kokhlikyan Closes #13760 from NarineK/gapplyCollect. (cherry picked from commit 26afb4ce4099e7942f8db1ead3817ed8fbf71ce3) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c96ded8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c96ded8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c96ded8 Branch: refs/heads/branch-2.0 Commit: 4c96ded84b8f882ab6d2d42c343ccff8c972d713 Parents: 3665927 Author: Narine Kokhlikyan Authored: Fri Jul 1 13:55:13 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 1 13:55:39 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 111 +++-- R/pkg/R/generics.R| 4 + R/pkg/R/group.R | 93 + R/pkg/inst/tests/testthat/test_sparkSQL.R | 35 ++-- 5 files changed, 177 insertions(+), 67 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c96ded8/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index e0ffde9..9fd2568 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -69,6 +69,7 @@ exportMethods("arrange", "first", "freqItems", "gapply", + "gapplyCollect", "group_by", "groupBy", "head", http://git-wip-us.apache.org/repos/asf/spark/blob/4c96ded8/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 567758d..17474d4 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1339,7 +1339,7 @@ setMethod("dapplyCollect", #' gapply #' -#' Group the SparkDataFrame using the specified columns and apply the R function to each +#' Groups the SparkDataFrame using the specified columns and applies the R function to each #' group. #' #' @param x A SparkDataFrame @@ -1351,9 +1351,11 @@ setMethod("dapplyCollect", #' @param schema The schema of the resulting SparkDataFrame after the function is applied. #' The schema must match to output of `func`. It has to be defined for each #' output column with preferred output column name and corresponding data type. +#' @return a SparkDataFrame #' @family SparkDataFrame functions #' @rdname gapply #' @name gapply +#' @seealso \link{gapplyCollect} #' @export #' @examples #' @@ -1369,14 +1371,22 @@ setMethod("dapplyCollect", #' columns with data types integer and string and the mean which is a double. #' schema <- structType(structField("a", "integer"), structField("c", "string"), #' structField("avg", "double")) -#' df1 <- gapply( +#' result <- gapply( #' df, -#' list("a", "c"), +#' c("a", "c"), #' function(key, x) { #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) -#' }, -#' schema) -#' collect(df1) +#' }, schema) +#' +#' We can also group the data and afterwards call gapply on GroupedData. 
+#' For Example: +#' gdf <- group_by(df, "a", "c") +#' result <- gapply( +#' gdf, +#' function(key, x) { +#' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) +#' }, schema) +#' collect(result) #' #' Result #' -- @@ -1394,7 +1404,7 @@ setMethod("dapplyCollect", #' structField("Petal_Width", "double")) #' df1 <- gapply( #' df, -#' list(df$"Species"), +#' df$"Species", #' function(key, x) { #' m <-
spark git commit: [SPARK-16012][SPARKR] Implement gapplyCollect which will apply an R function on each group similar to gapply and collect the result back to R data.frame
Repository: spark Updated Branches: refs/heads/master c55397652 -> 26afb4ce4 [SPARK-16012][SPARKR] Implement gapplyCollect which will apply a R function on each group similar to gapply and collect the result back to R data.frame ## What changes were proposed in this pull request? gapplyCollect() does gapply() on a SparkDataFrame and collect the result back to R. Compared to gapply() + collect(), gapplyCollect() offers performance optimization as well as programming convenience, as no schema is needed to be provided. This is similar to dapplyCollect(). ## How was this patch tested? Added test cases for gapplyCollect similar to dapplyCollect Author: Narine Kokhlikyan Closes #13760 from NarineK/gapplyCollect. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26afb4ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26afb4ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26afb4ce Branch: refs/heads/master Commit: 26afb4ce4099e7942f8db1ead3817ed8fbf71ce3 Parents: c553976 Author: Narine Kokhlikyan Authored: Fri Jul 1 13:55:13 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 1 13:55:13 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 111 +++-- R/pkg/R/generics.R| 4 + R/pkg/R/group.R | 93 + R/pkg/inst/tests/testthat/test_sparkSQL.R | 35 ++-- 5 files changed, 177 insertions(+), 67 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26afb4ce/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index abc6588..bc3aceb 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -69,6 +69,7 @@ exportMethods("arrange", "first", "freqItems", "gapply", + "gapplyCollect", "group_by", "groupBy", "head", http://git-wip-us.apache.org/repos/asf/spark/blob/26afb4ce/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 25327be..5944bbc 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1344,7 +1344,7 @@ setMethod("dapplyCollect", #' gapply #' -#' Group the SparkDataFrame using the specified columns and apply the R function to each +#' Groups the SparkDataFrame using the specified columns and applies the R function to each #' group. #' #' @param x A SparkDataFrame @@ -1356,9 +1356,11 @@ setMethod("dapplyCollect", #' @param schema The schema of the resulting SparkDataFrame after the function is applied. #' The schema must match to output of `func`. It has to be defined for each #' output column with preferred output column name and corresponding data type. +#' @return a SparkDataFrame #' @family SparkDataFrame functions #' @rdname gapply #' @name gapply +#' @seealso \link{gapplyCollect} #' @export #' @examples #' @@ -1374,14 +1376,22 @@ setMethod("dapplyCollect", #' columns with data types integer and string and the mean which is a double. #' schema <- structType(structField("a", "integer"), structField("c", "string"), #' structField("avg", "double")) -#' df1 <- gapply( +#' result <- gapply( #' df, -#' list("a", "c"), +#' c("a", "c"), #' function(key, x) { #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) -#' }, -#' schema) -#' collect(df1) +#' }, schema) +#' +#' We can also group the data and afterwards call gapply on GroupedData. 
+#' For Example: +#' gdf <- group_by(df, "a", "c") +#' result <- gapply( +#' gdf, +#' function(key, x) { +#' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) +#' }, schema) +#' collect(result) #' #' Result #' -- @@ -1399,7 +1409,7 @@ setMethod("dapplyCollect", #' structField("Petal_Width", "double")) #' df1 <- gapply( #' df, -#' list(df$"Species"), +#' df$"Species", #' function(key, x) { #' m <- suppressWarnings(lm(Sepal_Length ~ #' Sepal_Width + Petal_Length + Petal_Width, x)) @@ -1407,8 +1417,8
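A self-contained sketch of the new function: the call shape matches gapply() except that no schema is supplied and the result comes back as a local R data.frame (toy data):

```r
library(SparkR)
sparkR.session()
df <- createDataFrame(data.frame(a = c(1, 1, 2), b = c(10, 20, 30)))

result <- gapplyCollect(
  df,
  "a",
  function(key, x) {
    # key holds the grouping value(s), x the rows of that group
    data.frame(key, avg_b = mean(x$b), stringsAsFactors = FALSE)
  })
result  # a plain data.frame; no structType() schema needed, unlike gapply()
```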
spark git commit: [SPARK-16299][SPARKR] Capture errors from R workers in daemon.R to avoid deletion of R session temporary directory.
Repository: spark Updated Branches: refs/heads/branch-2.0 4c96ded84 -> d6588115a [SPARK-16299][SPARKR] Capture errors from R workers in daemon.R to avoid deletion of R session temporary directory. ## What changes were proposed in this pull request? Capture errors from R workers in daemon.R to avoid deletion of R session temporary directory. See detailed description at https://issues.apache.org/jira/browse/SPARK-16299 ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #13975 from sun-rui/SPARK-16299. (cherry picked from commit e4fa58c43ce2bf8d76bffb0d9dc1132f8d0eae6a) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d6588115 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d6588115 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d6588115 Branch: refs/heads/branch-2.0 Commit: d6588115a9ec3178f7d1edc86418a9832c9b3ac7 Parents: 4c96ded Author: Sun Rui Authored: Fri Jul 1 14:37:03 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 1 14:37:10 2016 -0700 -- R/pkg/inst/worker/daemon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d6588115/R/pkg/inst/worker/daemon.R -- diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index f55beac..b92e6be 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -44,7 +44,7 @@ while (TRUE) { if (inherits(p, "masterProcess")) { close(inputCon) Sys.setenv(SPARKR_WORKER_PORT = port) - source(script) + try(source(script)) # Set SIGUSR1 so that child can exit tools::pskill(Sys.getpid(), tools::SIGUSR1) parallel:::mcexit(0L)
spark git commit: [SPARK-16299][SPARKR] Capture errors from R workers in daemon.R to avoid deletion of R session temporary directory.
Repository: spark Updated Branches: refs/heads/master 26afb4ce4 -> e4fa58c43 [SPARK-16299][SPARKR] Capture errors from R workers in daemon.R to avoid deletion of R session temporary directory. ## What changes were proposed in this pull request? Capture errors from R workers in daemon.R to avoid deletion of R session temporary directory. See detailed description at https://issues.apache.org/jira/browse/SPARK-16299 ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #13975 from sun-rui/SPARK-16299. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4fa58c4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4fa58c4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4fa58c4 Branch: refs/heads/master Commit: e4fa58c43ce2bf8d76bffb0d9dc1132f8d0eae6a Parents: 26afb4c Author: Sun Rui Authored: Fri Jul 1 14:37:03 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 1 14:37:03 2016 -0700 -- R/pkg/inst/worker/daemon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e4fa58c4/R/pkg/inst/worker/daemon.R -- diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index f55beac..b92e6be 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -44,7 +44,7 @@ while (TRUE) { if (inherits(p, "masterProcess")) { close(inputCon) Sys.setenv(SPARKR_WORKER_PORT = port) - source(script) + try(source(script)) # Set SIGUSR1 so that child can exit tools::pskill(Sys.getpid(), tools::SIGUSR1) parallel:::mcexit(0L)
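For context, base R's try() is what makes the one-line fix work: it returns a "try-error" object instead of letting the error propagate, so the pskill/mcexit cleanup after source() still runs. A standalone illustration (the script path is hypothetical):

```r
# Without try(), an error raised inside source() would abort the forked
# worker before the cleanup below it (pskill/mcexit in daemon.R) runs; per
# SPARK-16299 that abort could end up deleting the shared R session
# temporary directory out from under the other workers.
res <- try(source("worker_script.R"))  # hypothetical script path
if (inherits(res, "try-error")) {
  message("worker script failed; continuing to cleanup: ",
          conditionMessage(attr(res, "condition")))
}
```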
spark git commit: [SPARK-16233][R][TEST] ORC test should be enabled only when HiveContext is available.
Repository: spark Updated Branches: refs/heads/master d601894c0 -> d17e5f2f1 [SPARK-16233][R][TEST] ORC test should be enabled only when HiveContext is available. ## What changes were proposed in this pull request? ORC test should be enabled only when HiveContext is available. ## How was this patch tested? Manual. ``` $ R/run-tests.sh ... 1. create DataFrame from RDD (test_sparkSQL.R#200) - Hive is not build with SparkSQL, skipped 2. test HiveContext (test_sparkSQL.R#1021) - Hive is not build with SparkSQL, skipped 3. read/write ORC files (test_sparkSQL.R#1728) - Hive is not build with SparkSQL, skipped 4. enableHiveSupport on SparkSession (test_sparkSQL.R#2448) - Hive is not build with SparkSQL, skipped 5. sparkJars tag in SparkContext (test_Windows.R#21) - This test is only for Windows, skipped DONE === Tests passed. ``` Author: Dongjoon Hyun Closes #14019 from dongjoon-hyun/SPARK-16233. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d17e5f2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d17e5f2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d17e5f2f Branch: refs/heads/master Commit: d17e5f2f123eecd5a7a1d87f5ce75a0fc44552b4 Parents: d601894 Author: Dongjoon Hyun Authored: Fri Jul 1 15:35:19 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 1 15:35:19 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d17e5f2f/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 3b8d570..a3aa26d 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1725,6 +1725,7 @@ test_that("mutate(), transform(), rename() and names()", { }) test_that("read/write ORC files", { + setHiveContext(sc) df <- read.df(jsonPath, "json") # Test write.df and read.df @@ -1741,6 +1742,7 @@ test_that("read/write ORC files", { expect_equal(count(orcDF), count(df)) unlink(orcPath2) + unsetHiveContext() }) test_that("read/write Parquet files", { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16233][R][TEST] ORC test should be enabled only when HiveContext is available.
Repository: spark Updated Branches: refs/heads/branch-2.0 78387ce05 -> 794d09969 [SPARK-16233][R][TEST] ORC test should be enabled only when HiveContext is available. ## What changes were proposed in this pull request? ORC test should be enabled only when HiveContext is available. ## How was this patch tested? Manual. ``` $ R/run-tests.sh ... 1. create DataFrame from RDD (test_sparkSQL.R#200) - Hive is not build with SparkSQL, skipped 2. test HiveContext (test_sparkSQL.R#1021) - Hive is not build with SparkSQL, skipped 3. read/write ORC files (test_sparkSQL.R#1728) - Hive is not build with SparkSQL, skipped 4. enableHiveSupport on SparkSession (test_sparkSQL.R#2448) - Hive is not build with SparkSQL, skipped 5. sparkJars tag in SparkContext (test_Windows.R#21) - This test is only for Windows, skipped DONE === Tests passed. ``` Author: Dongjoon Hyun Closes #14019 from dongjoon-hyun/SPARK-16233. (cherry picked from commit d17e5f2f123eecd5a7a1d87f5ce75a0fc44552b4) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/794d0996 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/794d0996 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/794d0996 Branch: refs/heads/branch-2.0 Commit: 794d099691c3ef71b25178992086b4f25e4019e6 Parents: 78387ce Author: Dongjoon Hyun Authored: Fri Jul 1 15:35:19 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 1 15:35:27 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/794d0996/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index e05e5c4..d22baf6 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1717,6 +1717,7 @@ test_that("mutate(), transform(), rename() and names()", { }) test_that("read/write ORC files", { + setHiveContext(sc) df <- read.df(jsonPath, "json") # Test write.df and read.df @@ -1733,6 +1734,7 @@ test_that("read/write ORC files", { expect_equal(count(orcDF), count(df)) unlink(orcPath2) + unsetHiveContext() }) test_that("read/write Parquet files", { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
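`setHiveContext` and `unsetHiveContext` are internal helpers of SparkR's testthat suite rather than exported API. The guarded-test shape the fix applies is roughly the following sketch, which assumes the suite's `sc` and `jsonPath` fixtures and that the helper skips the test when Spark is built without Hive:

```r
test_that("read/write ORC files", {
  setHiveContext(sc)   # internal test helper; presumably skips when Hive support is absent
  df <- read.df(jsonPath, "json")

  # ORC is a Hive-backed format, so the round trip only runs under Hive.
  orcPath <- tempfile(pattern = "orcTest", fileext = ".orc")
  write.df(df, orcPath, "orc", mode = "overwrite")
  orcDF <- read.df(orcPath, "orc")
  expect_equal(count(orcDF), count(df))

  unlink(orcPath)
  unsetHiveContext()   # restore the non-Hive session for later tests
})
```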
spark git commit: [SPARK-16310][SPARKR] R na.string-like default for csv source
Repository: spark Updated Branches: refs/heads/branch-2.0 30cb3f1d3 -> 5828da41c [SPARK-16310][SPARKR] R na.string-like default for csv source ## What changes were proposed in this pull request? Apply the default "NA" as the null string for R, like the R read.csv na.strings parameter. https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html na.strings = "NA" A user passing a CSV file with NA values should get the same behavior with SparkR read.df(... source = "csv") (couldn't open JIRA, will do that later) ## How was this patch tested? unit tests shivaram Author: Felix Cheung Closes #13984 from felixcheung/rcsvnastring. (cherry picked from commit f4767bcc7a9d1bdd301f054776aa45e7c9f344a7) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5828da41 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5828da41 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5828da41 Branch: refs/heads/branch-2.0 Commit: 5828da41cb2d815708191bd9a5cf3bd82795aa41 Parents: 30cb3f1 Author: Felix Cheung Authored: Thu Jul 7 15:21:57 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 7 15:22:06 2016 -0700 -- R/pkg/R/SQLContext.R | 10 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 32 +- 2 files changed, 34 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5828da41/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 8df73db..bc0daa2 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -714,11 +714,14 @@ dropTempView <- function(viewName) { #' #' The data source is specified by the `source` and a set of options(...). #' If `source` is not specified, the default data source configured by -#' "spark.sql.sources.default" will be used. +#' "spark.sql.sources.default" will be used. \cr +#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" will be interpreted +#' as NA. #' #' @param path The path of files to load #' @param source The name of external data source #' @param schema The data schema defined in structType +#' @param na.strings Default string value for NA when source is "csv" #' @return SparkDataFrame #' @rdname read.df #' @name read.df @@ -735,7 +738,7 @@ dropTempView <- function(viewName) { #' @name read.df #' @method read.df default #' @note read.df since 1.4.0 -read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) { +read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) { sparkSession <- getSparkSession() options <- varargsToEnv(...) if (!is.null(path)) { @@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) 
{ if (is.null(source)) { source <- getDefaultSqlSource() } + if (source == "csv" && is.null(options[["nullValue"]])) { +options[["nullValue"]] <- na.strings + } if (!is.null(schema)) { stopifnot(class(schema) == "structType") sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, source, http://git-wip-us.apache.org/repos/asf/spark/blob/5828da41/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index d22baf6..003fcce 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -213,15 +213,35 @@ test_that("read csv as DataFrame", { mockLinesCsv <- c("year,make,model,comment,blank", "\"2012\",\"Tesla\",\"S\",\"No comment\",", "1997,Ford,E350,\"Go get one now they are going fast\",", - "2015,Chevy,Volt") + "2015,Chevy,Volt", + "NA,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - # default "header" is false - df <- read.df(csvPath, "csv", header = "true") - expect_equal(count(df), 3) + # default "header" is false, inferSchema to handle "year" as "int" + df <- read.df(csvPath, "csv", header = "true", inferSchema = "true&quo
spark git commit: [SPARK-16310][SPARKR] R na.string-like default for csv source
Repository: spark Updated Branches: refs/heads/master 28710b42b -> f4767bcc7 [SPARK-16310][SPARKR] R na.string-like default for csv source ## What changes were proposed in this pull request? Apply the default "NA" as the null string for R, like the R read.csv na.strings parameter. https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html na.strings = "NA" A user passing a CSV file with NA values should get the same behavior with SparkR read.df(... source = "csv") (couldn't open JIRA, will do that later) ## How was this patch tested? unit tests shivaram Author: Felix Cheung Closes #13984 from felixcheung/rcsvnastring. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4767bcc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4767bcc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4767bcc Branch: refs/heads/master Commit: f4767bcc7a9d1bdd301f054776aa45e7c9f344a7 Parents: 28710b4 Author: Felix Cheung Authored: Thu Jul 7 15:21:57 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 7 15:21:57 2016 -0700 -- R/pkg/R/SQLContext.R | 10 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 32 +- 2 files changed, 34 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4767bcc/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 8df73db..bc0daa2 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -714,11 +714,14 @@ dropTempView <- function(viewName) { #' #' The data source is specified by the `source` and a set of options(...). #' If `source` is not specified, the default data source configured by -#' "spark.sql.sources.default" will be used. +#' "spark.sql.sources.default" will be used. \cr +#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" will be interpreted +#' as NA. #' #' @param path The path of files to load #' @param source The name of external data source #' @param schema The data schema defined in structType +#' @param na.strings Default string value for NA when source is "csv" #' @return SparkDataFrame #' @rdname read.df #' @name read.df @@ -735,7 +738,7 @@ dropTempView <- function(viewName) { #' @name read.df #' @method read.df default #' @note read.df since 1.4.0 -read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) { +read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) { sparkSession <- getSparkSession() options <- varargsToEnv(...) if (!is.null(path)) { @@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) 
{ if (is.null(source)) { source <- getDefaultSqlSource() } + if (source == "csv" && is.null(options[["nullValue"]])) { +options[["nullValue"]] <- na.strings + } if (!is.null(schema)) { stopifnot(class(schema) == "structType") sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, source, http://git-wip-us.apache.org/repos/asf/spark/blob/f4767bcc/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index a3aa26d..a0ab719 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -213,15 +213,35 @@ test_that("read csv as DataFrame", { mockLinesCsv <- c("year,make,model,comment,blank", "\"2012\",\"Tesla\",\"S\",\"No comment\",", "1997,Ford,E350,\"Go get one now they are going fast\",", - "2015,Chevy,Volt") + "2015,Chevy,Volt", + "NA,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - # default "header" is false - df <- read.df(csvPath, "csv", header = "true") - expect_equal(count(df), 3) + # default "header" is false, inferSchema to handle "year" as "int" + df <- read.df(csvPath, "csv", header = "true", inferSchema = "true") + expect_equal(count(df), 4) expect_equal(columns(df), c("year", "make", "model&quo
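In use, the new default means a bare `NA` field in a CSV round-trips to an R `NA`, while the old literal-string behavior remains available by overriding the parameter. A small sketch mirroring the test fixture above (file contents are illustrative):

```r
library(SparkR)
sparkR.session()

csvPath <- tempfile(pattern = "cars", fileext = ".csv")
writeLines(c("year,make,model",
             "2015,Chevy,Volt",
             "NA,Dummy,Placeholder"), csvPath)

# Default na.strings = "NA": the first field of the last row becomes a null.
df <- read.df(csvPath, source = "csv", header = "true", inferSchema = "true")
head(df)

# Override the default so the literal string "NA" is kept
# (the empty string becomes the null marker instead).
dfLiteral <- read.df(csvPath, source = "csv", header = "true", na.strings = "")
head(dfLiteral)
```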
spark git commit: [SPARK-16425][R] `describe()` should not fail with non-numeric columns
Repository: spark Updated Branches: refs/heads/master f4767bcc7 -> 6aa7d09f4 [SPARK-16425][R] `describe()` should not fail with non-numeric columns ## What changes were proposed in this pull request? This PR prevents ERRORs when `summary(df)` is called for `SparkDataFrame` with non-numeric columns. This failure happens only in `SparkR`. **Before** ```r > df <- createDataFrame(faithful) > df <- withColumn(df, "boolean", df$waiting==79) > summary(df) 16/07/07 14:15:16 ERROR RBackendHandler: describe on 34 failed Error in invokeJava(isStatic = FALSE, objId$id, methodName, ...) : org.apache.spark.sql.AnalysisException: cannot resolve 'avg(`boolean`)' due to data type mismatch: function average requires numeric types, not BooleanType; ``` **After** ```r > df <- createDataFrame(faithful) > df <- withColumn(df, "boolean", df$waiting==79) > summary(df) SparkDataFrame[summary:string, eruptions:string, waiting:string] ``` ## How was this patch tested? Pass the Jenkins tests with an updated test case. Author: Dongjoon Hyun Closes #14096 from dongjoon-hyun/SPARK-16425. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6aa7d09f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6aa7d09f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6aa7d09f Branch: refs/heads/master Commit: 6aa7d09f4e126f42e41085dec169c813379ed354 Parents: f4767bc Author: Dongjoon Hyun Authored: Thu Jul 7 17:47:29 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 7 17:47:29 2016 -0700 -- R/pkg/R/DataFrame.R | 3 +-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6aa7d09f/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5944bbc..a18eee3 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2622,8 +2622,7 @@ setMethod("describe", setMethod("describe", signature(x = "SparkDataFrame"), function(x) { -colList <- as.list(c(columns(x))) -sdf <- callJMethod(x@sdf, "describe", colList) +sdf <- callJMethod(x@sdf, "describe", list()) dataFrame(sdf) }) http://git-wip-us.apache.org/repos/asf/spark/blob/6aa7d09f/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index a0ab719..e2a1da0 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1824,13 +1824,17 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats)[2, "age"], "24.5") expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) - expect_equal(collect(stats)[4, "name"], "Andy") + expect_equal(collect(stats)[4, "name"], NULL) expect_equal(collect(stats)[5, "age"], "30") stats2 <- summary(df) - expect_equal(collect(stats2)[4, "name"], "Andy") + expect_equal(collect(stats2)[4, "name"], NULL) expect_equal(collect(stats2)[5, "age"], "30") + # SPARK-16425: SparkR summary() fails on column of type logical + df <- withColumn(df, "boolean", df$age == 30) + summary(df) + # Test base::summary is working expect_equal(length(summary(attenu, digits = 4)), 35) }) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16425][R] `describe()` should not fail with non-numeric columns
Repository: spark Updated Branches: refs/heads/branch-2.0 5828da41c -> 73c764a04 [SPARK-16425][R] `describe()` should not fail with non-numeric columns ## What changes were proposed in this pull request? This PR prevents ERRORs when `summary(df)` is called for `SparkDataFrame` with non-numeric columns. This failure happens only in `SparkR`. **Before** ```r > df <- createDataFrame(faithful) > df <- withColumn(df, "boolean", df$waiting==79) > summary(df) 16/07/07 14:15:16 ERROR RBackendHandler: describe on 34 failed Error in invokeJava(isStatic = FALSE, objId$id, methodName, ...) : org.apache.spark.sql.AnalysisException: cannot resolve 'avg(`boolean`)' due to data type mismatch: function average requires numeric types, not BooleanType; ``` **After** ```r > df <- createDataFrame(faithful) > df <- withColumn(df, "boolean", df$waiting==79) > summary(df) SparkDataFrame[summary:string, eruptions:string, waiting:string] ``` ## How was this patch tested? Pass the Jenkins tests with an updated test case. Author: Dongjoon Hyun Closes #14096 from dongjoon-hyun/SPARK-16425. (cherry picked from commit 6aa7d09f4e126f42e41085dec169c813379ed354) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73c764a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73c764a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73c764a0 Branch: refs/heads/branch-2.0 Commit: 73c764a047f795c85909c7a7ea4324f286d2aafa Parents: 5828da4 Author: Dongjoon Hyun Authored: Thu Jul 7 17:47:29 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 7 17:47:38 2016 -0700 -- R/pkg/R/DataFrame.R | 3 +-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/73c764a0/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 17474d4..ec09aab 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2617,8 +2617,7 @@ setMethod("describe", setMethod("describe", signature(x = "SparkDataFrame"), function(x) { -colList <- as.list(c(columns(x))) -sdf <- callJMethod(x@sdf, "describe", colList) +sdf <- callJMethod(x@sdf, "describe", list()) dataFrame(sdf) }) http://git-wip-us.apache.org/repos/asf/spark/blob/73c764a0/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 003fcce..755aded 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1816,13 +1816,17 @@ test_that("describe() and summarize() on a DataFrame", { expect_equal(collect(stats)[2, "age"], "24.5") expect_equal(collect(stats)[3, "age"], "7.7781745930520225") stats <- describe(df) - expect_equal(collect(stats)[4, "name"], "Andy") + expect_equal(collect(stats)[4, "name"], NULL) expect_equal(collect(stats)[5, "age"], "30") stats2 <- summary(df) - expect_equal(collect(stats2)[4, "name"], "Andy") + expect_equal(collect(stats2)[4, "name"], NULL) expect_equal(collect(stats2)[5, "age"], "30") + # SPARK-16425: SparkR summary() fails on column of type logical + df <- withColumn(df, "boolean", df$age == 30) + summary(df) + # Test base::summary is working expect_equal(length(summary(attenu, digits = 4)), 35) }) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
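The behavioral contract after the fix, restated as a runnable sketch: `describe()`/`summary()` now pass an empty column list so Spark itself decides which columns it can summarize, and unsupported column types are omitted from the result rather than triggering an `AnalysisException`:

```r
library(SparkR)
sparkR.session()

df <- createDataFrame(faithful)
df <- withColumn(df, "boolean", df$waiting == 79)

# Previously this failed with "function average requires numeric types,
# not BooleanType"; now the logical column is simply absent from the
# summary, which is why the updated test expects NULL when indexing it.
stats <- summary(df)
columns(stats)   # "summary" "eruptions" "waiting" - no "boolean" column
head(stats)
```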
spark git commit: [SPARK-17317][SPARKR] Add SparkR vignette to branch 2.0
Repository: spark Updated Branches: refs/heads/branch-2.0 5c2bc8360 -> a09c258c9 [SPARK-17317][SPARKR] Add SparkR vignette to branch 2.0 ## What changes were proposed in this pull request? This PR adds a SparkR vignette to branch 2.0, which serves as a friendly guide to the functionality provided by SparkR. ## How was this patch tested? R unit test. Author: junyangq Author: Shivaram Venkataraman Author: Junyang Qian Closes #15100 from junyangq/SPARKR-vignette-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a09c258c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a09c258c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a09c258c Branch: refs/heads/branch-2.0 Commit: a09c258c9a97e701fa7650cc0651e3c6a7a1cab9 Parents: 5c2bc83 Author: junyangq Authored: Thu Sep 15 10:00:36 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Sep 15 10:00:36 2016 -0700 -- R/create-docs.sh | 11 +- R/pkg/vignettes/sparkr-vignettes.Rmd | 643 ++ 2 files changed, 652 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a09c258c/R/create-docs.sh -- diff --git a/R/create-docs.sh b/R/create-docs.sh index d2ae160..0dfba22 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -17,11 +17,13 @@ # limitations under the License. # -# Script to create API docs for SparkR -# This requires `devtools` and `knitr` to be installed on the machine. +# Script to create API docs and vignettes for SparkR +# This requires `devtools`, `knitr` and `rmarkdown` to be installed on the machine. # After running this script the html docs can be found in # $SPARK_HOME/R/pkg/html +# The vignettes can be found in +# $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html set -o pipefail set -e @@ -43,4 +45,9 @@ Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knit popd +# render creates SparkR vignettes +Rscript -e 'library(rmarkdown); paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); render("pkg/vignettes/sparkr-vignettes.Rmd"); .libPaths(paths)' + +find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete + popd http://git-wip-us.apache.org/repos/asf/spark/blob/a09c258c/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd new file mode 100644 index 000..5156c9e --- /dev/null +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -0,0 +1,643 @@ +--- +title: "SparkR - Practical Guide" +output: + html_document: +theme: united +toc: true +toc_depth: 4 +toc_float: true +highlight: textmate +--- + +## Overview + +SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/). + +## Getting Started + +We begin with an example running on the local machine and provide an overview of the use of SparkR: data ingestion, data processing and machine learning. + +First, let's load and attach the package. +```{r, message=FALSE} +library(SparkR) +``` + +`SparkSession` is the entry point into SparkR which connects your R program to a Spark cluster. 
You can create a `SparkSession` using `sparkR.session` and pass in options such as the application name, any Spark packages depended on, etc. + +We use default settings in which it runs in local mode. It auto downloads Spark package in the background if no previous installation is found. For more details about setup, see [Spark Session](#SetupSparkSession). + +```{r, message=FALSE} +sparkR.session() +``` + +The operations in SparkR are centered around an R class called `SparkDataFrame`. It is a distributed collection of data organized into named columns, which is conceptually equivalent to a table in a relational database or a data frame in R, but with richer optimizations under the hood. + +`SparkDataFrame` can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing local R data frames. For example, we create a `SparkDataFrame` from a local R data frame, + +```{r} +cars <- cbind(model = rownames(mt
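The vignette's opening example is cut off in the archived message above; its general shape is the standard SparkR bootstrap, reconstructed approximately here with `mtcars` standing in as the local data frame:

```r
library(SparkR)
sparkR.session()

# Build a SparkDataFrame from a local R data frame; the model names are
# copied out of the row names so they survive the conversion.
cars <- cbind(model = rownames(mtcars), mtcars)
carsDF <- createDataFrame(cars)

# SparkDataFrame operations mirror familiar R verbs:
head(select(carsDF, "model", "mpg", "cyl"))
head(filter(carsDF, carsDF$cyl > 6))
```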
spark git commit: [SPARKR][DOC] minor formatting and output cleanup for R vignettes
Repository: spark Updated Branches: refs/heads/master c17f97183 -> 068c198e9 [SPARKR][DOC] minor formatting and output cleanup for R vignettes ## What changes were proposed in this pull request? Clean up output, format table, truncate long example output, hide warnings (new - Left; existing - Right) ![image](https://cloud.githubusercontent.com/assets/8969467/19064018/5dcde4d0-89bc-11e6-857b-052df3f52a4e.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064034/6db09956-89bc-11e6-8e43-232d5c3fe5e6.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064058/88f09590-89bc-11e6-9993-61639e29dfdd.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064066/95ccbf64-89bc-11e6-877f-45af03ddcadc.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064082/a8445404-89bc-11e6-8532-26d8bc9b206f.png) ## How was this patch tested? Run create-doc.sh manually Author: Felix Cheung Closes #15340 from felixcheung/vignettes. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/068c198e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/068c198e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/068c198e Branch: refs/heads/master Commit: 068c198e956346b90968a4d74edb7bc820c4be28 Parents: c17f971 Author: Felix Cheung Authored: Tue Oct 4 09:22:26 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Oct 4 09:22:26 2016 -0700 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 31 --- 1 file changed, 20 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/068c198e/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index aea52db..80e8760 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -26,7 +26,7 @@ library(SparkR) We use default settings in which it runs in local mode. It auto downloads Spark package in the background if no previous installation is found. For more details about setup, see [Spark Session](#SetupSparkSession). -```{r, message=FALSE} +```{r, message=FALSE, results="hide"} sparkR.session() ``` @@ -114,10 +114,12 @@ In particular, the following Spark driver properties can be set in `sparkConfig` Property Name | Property group | spark-submit equivalent | -- | -- -spark.driver.memory | Application Properties | --driver-memory -spark.driver.extraClassPath | Runtime Environment | --driver-class-path -spark.driver.extraJavaOptions | Runtime Environment | --driver-java-options -spark.driver.extraLibraryPath | Runtime Environment | --driver-library-path +`spark.driver.memory` | Application Properties | `--driver-memory` +`spark.driver.extraClassPath` | Runtime Environment | `--driver-class-path` +`spark.driver.extraJavaOptions` | Runtime Environment | `--driver-java-options` +`spark.driver.extraLibraryPath` | Runtime Environment | `--driver-library-path` +`spark.yarn.keytab` | Application Properties | `--keytab` +`spark.yarn.principal` | Application Properties | `--principal` **For Windows users**: Due to different file prefixes across operating systems, to avoid the issue of potential wrong prefix, a current workaround is to specify `spark.sql.warehouse.dir` when starting the `SparkSession`. @@ -161,7 +163,7 @@ head(df) ### Data Sources SparkR supports operating on a variety of data sources through the `SparkDataFrame` interface. 
You can check the Spark SQL programming guide for more [specific options](https://spark.apache.org/docs/latest/sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources. -The general method for creating `SparkDataFrame` from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active Spark Session will be used automatically. SparkR supports reading CSV, JSON and Parquet files natively and through Spark Packages you can find data source connectors for popular file formats like Avro. These packages can be added with `sparkPackages` parameter when initializing SparkSession using `sparkR.session'.` +The general method for creating `SparkDataFrame` from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active Spark Session will be used automatically. SparkR supports reading CSV, JSON and Parquet files natively and through Spark Packages you can find data source connectors for popular file formats like Avro. These packages can be added with `sparkPackages` parameter
spark git commit: [SPARKR][DOC] minor formatting and output cleanup for R vignettes
Repository: spark Updated Branches: refs/heads/branch-2.0 3dbe8097f -> 50f6be759 [SPARKR][DOC] minor formatting and output cleanup for R vignettes Clean up output, format table, truncate long example output, hide warnings (new - Left; existing - Right) ![image](https://cloud.githubusercontent.com/assets/8969467/19064018/5dcde4d0-89bc-11e6-857b-052df3f52a4e.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064034/6db09956-89bc-11e6-8e43-232d5c3fe5e6.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064058/88f09590-89bc-11e6-9993-61639e29dfdd.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064066/95ccbf64-89bc-11e6-877f-45af03ddcadc.png) ![image](https://cloud.githubusercontent.com/assets/8969467/19064082/a8445404-89bc-11e6-8532-26d8bc9b206f.png) Run create-doc.sh manually Author: Felix Cheung Closes #15340 from felixcheung/vignettes. (cherry picked from commit 068c198e956346b90968a4d74edb7bc820c4be28) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50f6be75 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50f6be75 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50f6be75 Branch: refs/heads/branch-2.0 Commit: 50f6be7598547fed5190a920fd3cebb4bc908524 Parents: 3dbe809 Author: Felix Cheung Authored: Tue Oct 4 09:22:26 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Oct 4 09:28:56 2016 -0700 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 29 +++-- 1 file changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/50f6be75/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 5156c9e..babfb71 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -26,7 +26,7 @@ library(SparkR) We use default settings in which it runs in local mode. It auto downloads Spark package in the background if no previous installation is found. For more details about setup, see [Spark Session](#SetupSparkSession). -```{r, message=FALSE} +```{r, message=FALSE, results="hide"} sparkR.session() ``` @@ -114,10 +114,12 @@ In particular, the following Spark driver properties can be set in `sparkConfig` Property Name | Property group | spark-submit equivalent | -- | -- -spark.driver.memory | Application Properties | --driver-memory -spark.driver.extraClassPath | Runtime Environment | --driver-class-path -spark.driver.extraJavaOptions | Runtime Environment | --driver-java-options -spark.driver.extraLibraryPath | Runtime Environment | --driver-library-path +`spark.driver.memory` | Application Properties | `--driver-memory` +`spark.driver.extraClassPath` | Runtime Environment | `--driver-class-path` +`spark.driver.extraJavaOptions` | Runtime Environment | `--driver-java-options` +`spark.driver.extraLibraryPath` | Runtime Environment | `--driver-library-path` +`spark.yarn.keytab` | Application Properties | `--keytab` +`spark.yarn.principal` | Application Properties | `--principal` **For Windows users**: Due to different file prefixes across operating systems, to avoid the issue of potential wrong prefix, a current workaround is to specify `spark.sql.warehouse.dir` when starting the `SparkSession`. @@ -161,7 +163,7 @@ head(df) ### Data Sources SparkR supports operating on a variety of data sources through the `SparkDataFrame` interface. 
You can check the Spark SQL programming guide for more [specific options](https://spark.apache.org/docs/latest/sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources. -The general method for creating `SparkDataFrame` from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active Spark Session will be used automatically. SparkR supports reading CSV, JSON and Parquet files natively and through Spark Packages you can find data source connectors for popular file formats like Avro. These packages can be added with `sparkPackages` parameter when initializing SparkSession using `sparkR.session'.` +The general method for creating `SparkDataFrame` from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active Spark Session will be used automatically. SparkR supports reading CSV, JSON and Parquet files natively and through Spark Packages you can find data source connectors for popular file formats like Avro. These packages can be added w
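The driver properties tabulated above only take effect if they are supplied when the JVM is launched, which for SparkR means passing them through `sparkConfig` at session start (the value here is illustrative):

```r
library(SparkR)

# Equivalent to spark-submit --driver-memory 2g for a SparkR session.
sparkR.session(sparkConfig = list(spark.driver.memory = "2g"))
```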
spark git commit: [SPARK-18264][SPARKR] build vignettes with package, update vignettes for CRAN release build and add info on release
Repository: spark Updated Branches: refs/heads/branch-2.1 87820da78 -> c2ebda443 [SPARK-18264][SPARKR] build vignettes with package, update vignettes for CRAN release build and add info on release ## What changes were proposed in this pull request? Changes to DESCRIPTION to build vignettes. Changes the metadata for vignettes to generate the recommended format (which is about <10% of size before). Unfortunately it does not look as nice (before - left, after - right) ![image](https://cloud.githubusercontent.com/assets/8969467/20040492/b75883e6-a40d-11e6-9534-25cdd5d59a8b.png) ![image](https://cloud.githubusercontent.com/assets/8969467/20040490/a40f4d42-a40d-11e6-8c91-af00ddcbdad9.png) Also add information on how to run build/release to CRAN later. ## How was this patch tested? manually, unit tests shivaram We need this for branch-2.1 Author: Felix Cheung Closes #15790 from felixcheung/rpkgvignettes. (cherry picked from commit ba23f768f7419039df85530b84258ec31f0c22b4) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2ebda44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2ebda44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2ebda44 Branch: refs/heads/branch-2.1 Commit: c2ebda443b2678e554d859d866af53e2e94822f2 Parents: 87820da Author: Felix Cheung Authored: Fri Nov 11 15:49:55 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Nov 11 15:50:03 2016 -0800 -- R/CRAN_RELEASE.md| 91 +++ R/README.md | 8 +-- R/check-cran.sh | 33 +-- R/create-docs.sh | 19 +-- R/pkg/DESCRIPTION| 9 ++- R/pkg/vignettes/sparkr-vignettes.Rmd | 9 +-- 6 files changed, 134 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2ebda44/R/CRAN_RELEASE.md -- diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md new file mode 100644 index 000..bea8f9f --- /dev/null +++ b/R/CRAN_RELEASE.md @@ -0,0 +1,91 @@ +# SparkR CRAN Release + +To release SparkR as a package to CRAN, we would use the `devtools` package. Please work with the +`d...@spark.apache.org` community and R package maintainer on this. + +### Release + +First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. + +Note that while `check-cran.sh` is running `R CMD check`, it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. + +To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible. + +Once everything is in place, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::release(); .libPaths(paths) +``` + +For more information please refer to http://r-pkgs.had.co.nz/release.html#release-check + +### Testing: build package manually + +To build package manually such as to inspect the resulting `.tar.gz` file content, we would also use the `devtools` package. + +Source package is what get released to CRAN. 
CRAN would then build platform-specific binary packages from the source package. + + Build source package + +To build source package locally without releasing to CRAN, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::build("pkg"); .libPaths(paths) +``` + +(http://r-pkgs.had.co.nz/vignettes.html#vignette-workflow-2) + +Similarly, the source package is also created by `check-cran.sh` with `R CMD build pkg`. + +For example, this should be the content of the source package: + +```sh +DESCRIPTIONR insttests +NAMESPACE build man vignettes + +inst/doc/ +sparkr-vignettes.html +sparkr-vignettes.Rmd +sparkr-vignettes.Rman + +build/ +vignette.rds + +man/ + *.Rd files... + +vignettes/ +sparkr-vignettes.Rmd +``` + + Test source package + +To install, run this: + +```sh +R CMD INSTALL SparkR_2.1.0.tar.gz +``` + +With "2.1.0" replaced wi
spark git commit: [SPARK-18264][SPARKR] build vignettes with package, update vignettes for CRAN release build and add info on release
Repository: spark Updated Branches: refs/heads/master 6e95325fc -> ba23f768f [SPARK-18264][SPARKR] build vignettes with package, update vignettes for CRAN release build and add info on release ## What changes were proposed in this pull request? Changes to DESCRIPTION to build vignettes. Changes the metadata for vignettes to generate the recommended format (which is about <10% of size before). Unfortunately it does not look as nice (before - left, after - right) ![image](https://cloud.githubusercontent.com/assets/8969467/20040492/b75883e6-a40d-11e6-9534-25cdd5d59a8b.png) ![image](https://cloud.githubusercontent.com/assets/8969467/20040490/a40f4d42-a40d-11e6-8c91-af00ddcbdad9.png) Also add information on how to run build/release to CRAN later. ## How was this patch tested? manually, unit tests shivaram We need this for branch-2.1 Author: Felix Cheung Closes #15790 from felixcheung/rpkgvignettes. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ba23f768 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ba23f768 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ba23f768 Branch: refs/heads/master Commit: ba23f768f7419039df85530b84258ec31f0c22b4 Parents: 6e95325 Author: Felix Cheung Authored: Fri Nov 11 15:49:55 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Nov 11 15:49:55 2016 -0800 -- R/CRAN_RELEASE.md| 91 +++ R/README.md | 8 +-- R/check-cran.sh | 33 +-- R/create-docs.sh | 19 +-- R/pkg/DESCRIPTION| 9 ++- R/pkg/vignettes/sparkr-vignettes.Rmd | 9 +-- 6 files changed, 134 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ba23f768/R/CRAN_RELEASE.md -- diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md new file mode 100644 index 000..bea8f9f --- /dev/null +++ b/R/CRAN_RELEASE.md @@ -0,0 +1,91 @@ +# SparkR CRAN Release + +To release SparkR as a package to CRAN, we would use the `devtools` package. Please work with the +`d...@spark.apache.org` community and R package maintainer on this. + +### Release + +First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. + +Note that while `check-cran.sh` is running `R CMD check`, it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. + +To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible. + +Once everything is in place, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::release(); .libPaths(paths) +``` + +For more information please refer to http://r-pkgs.had.co.nz/release.html#release-check + +### Testing: build package manually + +To build package manually such as to inspect the resulting `.tar.gz` file content, we would also use the `devtools` package. + +Source package is what get released to CRAN. CRAN would then build platform-specific binary packages from the source package. 
+ + Build source package + +To build source package locally without releasing to CRAN, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::build("pkg"); .libPaths(paths) +``` + +(http://r-pkgs.had.co.nz/vignettes.html#vignette-workflow-2) + +Similarly, the source package is also created by `check-cran.sh` with `R CMD build pkg`. + +For example, this should be the content of the source package: + +```sh +DESCRIPTIONR insttests +NAMESPACE build man vignettes + +inst/doc/ +sparkr-vignettes.html +sparkr-vignettes.Rmd +sparkr-vignettes.Rman + +build/ +vignette.rds + +man/ + *.Rd files... + +vignettes/ +sparkr-vignettes.Rmd +``` + + Test source package + +To install, run this: + +```sh +R CMD INSTALL SparkR_2.1.0.tar.gz +``` + +With "2.1.0" replaced with the version of SparkR. + +This command installs SparkR to the default libPaths. Once that is done, you shoul
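The source-package workflow quoted from `CRAN_RELEASE.md` condenses into one runnable R snippet (run from `$SPARK_HOME/R` with the SparkR lib already built; paths are as that document assumes):

```r
# Point the library path at the locally built SparkR and set SPARK_HOME so
# vignette code can find the Spark jars while devtools builds the tarball.
paths <- .libPaths()
.libPaths(c("lib", paths))
Sys.setenv(SPARK_HOME = tools::file_path_as_absolute(".."))

tarball <- devtools::build("pkg")   # vignettes are built into the package
.libPaths(paths)

tarball   # e.g. ".../SparkR_2.1.0.tar.gz"; install with R CMD INSTALL
```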
spark git commit: [SPARK-13734][SPARKR] Added histogram function
Repository: spark Updated Branches: refs/heads/master 75879ac3c -> 0c99c23b7 [SPARK-13734][SPARKR] Added histogram function ## What changes were proposed in this pull request? Added method histogram() to compute the histogram of a Column Usage: ``` ## Create a DataFrame from the Iris dataset irisDF <- createDataFrame(sqlContext, iris) ## Render a histogram for the Sepal_Length column histogram(irisDF, "Sepal_Length", nbins=12) ``` ![histogram](https://cloud.githubusercontent.com/assets/13985649/13588486/e1e751c6-e484-11e5-85db-2fc2115c4bb2.png) Note: Usage will change once SPARK-9325 is figured out so that histogram() only takes a Column as a parameter, as opposed to a DataFrame and a name ## How was this patch tested? All unit tests pass. I added specific unit cases for different scenarios. Author: Oscar D. Lara Yejas Author: Oscar D. Lara Yejas Closes #11569 from olarayej/SPARK-13734. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c99c23b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c99c23b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c99c23b Branch: refs/heads/master Commit: 0c99c23b7d9f0c3538cd2b062d551411712a2bcc Parents: 75879ac Author: Oscar D. Lara Yejas Authored: Tue Apr 26 15:34:30 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Apr 26 15:34:30 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 120 + R/pkg/R/generics.R| 4 + R/pkg/inst/tests/testthat/test_sparkSQL.R | 45 ++ 4 files changed, 170 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0c99c23b/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index c0a63d6..ea31bae 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -173,6 +173,7 @@ exportMethods("%in%", "getItem", "greatest", "hex", + "histogram", "hour", "hypot", "ifelse", http://git-wip-us.apache.org/repos/asf/spark/blob/0c99c23b/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 890d15d..36aedfa 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2469,6 +2469,126 @@ setMethod("drop", base::drop(x) }) +#' This function computes a histogram for a given SparkR Column. +#' +#' @name histogram +#' @title Histogram +#' @param nbins the number of bins (optional). Default value is 10. +#' @param df the SparkDataFrame containing the Column to build the histogram from. +#' @param colname the name of the column to build the histogram from. +#' @return a data.frame with the histogram statistics, i.e., counts and centroids. 
+#' @rdname histogram +#' @family SparkDataFrame functions +#' @export +#' @examples +#' \dontrun{ +#' +#' # Create a SparkDataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Compute histogram statistics +#' histStats <- histogram(irisDF, irisDF$Sepal_Length, nbins = 12) +#' +#' # Once SparkR has computed the histogram statistics, the histogram can be +#' # rendered using the ggplot2 library: +#' +#' require(ggplot2) +#' plot <- ggplot(histStats, aes(x = centroids, y = counts)) + +#' geom_bar(stat = "identity") + +#' xlab("Sepal_Length") + ylab("Frequency") +#' } +setMethod("histogram", + signature(df = "SparkDataFrame", col = "characterOrColumn"), + function(df, col, nbins = 10) { +# Validate nbins +if (nbins < 2) { + stop("The number of bins must be a positive integer number greater than 1.") +} + +# Round nbins to the smallest integer +nbins <- floor(nbins) + +# Validate col +if (is.null(col)) { + stop("col must be specified.") +} + +colname <- col +x <- if (class(col) == "character") { + if (!colname %in% names(df)) { +stop("Specified colname does not belong to the given SparkDataFrame.") + } + + # Filter NA values in the target column and remove all other columns + df <- na.omit(df[, colname])
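Putting the new function to work end to end, following the roxygen example above (the commit predates `sparkR.session`, so the 1.x `sqlContext` entry point is used):

```r
library(SparkR)
library(ggplot2)

sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)

irisDF <- createDataFrame(sqlContext, iris)

# histogram() returns a local data.frame of bin centroids and counts,
# which can then be rendered with any plotting package.
histStats <- histogram(irisDF, irisDF$Sepal_Length, nbins = 12)

ggplot(histStats, aes(x = centroids, y = counts)) +
  geom_bar(stat = "identity") +
  xlab("Sepal_Length") + ylab("Frequency")
```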
spark git commit: [SPARK-13436][SPARKR] Added parameter drop to subsetting operator [
Repository: spark Updated Branches: refs/heads/master 37575115b -> e4bfb4aa7 [SPARK-13436][SPARKR] Added parameter drop to subsetting operator [ Added parameter drop to subsetting operator [. This is useful to get a Column from a DataFrame, given its name. R supports it. In R: ``` > name <- "Sepal_Length" > class(iris[, name]) [1] "numeric" ``` Currently, in SparkR: ``` > name <- "Sepal_Length" > class(irisDF[, name]) [1] "DataFrame" ``` Previous code returns a DataFrame, which is inconsistent with R's behavior. SparkR should return a Column instead. Currently, the only way for the user to get a Column given a column name as a character variable would be through `eval(parse(x))`, where x is the string `"irisDF$Sepal_Length"`. That itself is pretty hacky. `SparkR:::getColumn()` is another choice, but I don't see why this method should be externalized. Instead, following R's way to do things, the proposed implementation allows this: ``` > name <- "Sepal_Length" > class(irisDF[, name, drop=T]) [1] "Column" > class(irisDF[, name, drop=F]) [1] "DataFrame" ``` This is consistent with R: ``` > name <- "Sepal_Length" > class(iris[, name]) [1] "numeric" > class(iris[, name, drop=F]) [1] "data.frame" ``` Author: Oscar D. Lara Yejas Author: Oscar D. Lara Yejas Closes #11318 from olarayej/SPARK-13436. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4bfb4aa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4bfb4aa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4bfb4aa Branch: refs/heads/master Commit: e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61 Parents: 3757511 Author: Oscar D. Lara Yejas Authored: Wed Apr 27 15:47:54 2016 -0700 Committer: Shivaram Venkataraman Committed: Wed Apr 27 15:47:54 2016 -0700 -- R/pkg/R/DataFrame.R | 70 ++ R/pkg/R/utils.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 24 ++--- 3 files changed, 54 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e4bfb4aa/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 36aedfa..48ac1b0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1237,29 +1237,38 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"), #' @rdname subset #' @name [ -setMethod("[", signature(x = "SparkDataFrame", i = "missing"), - function(x, i, j, ...) { -if (is.numeric(j)) { - cols <- columns(x) - j <- cols[j] -} -if (length(j) > 1) { - j <- as.list(j) +setMethod("[", signature(x = "SparkDataFrame"), + function(x, i, j, ..., drop = F) { +# Perform filtering first if needed +filtered <- if (missing(i)) { + x +} else { + if (class(i) != "Column") { +stop(paste0("Expressions other than filtering predicates are not supported ", + "in the first parameter of extract operator [ or subset() method.")) + } + filter(x, i) } -select(x, j) - }) -#' @rdname subset -#' @name [ -setMethod("[", signature(x = "SparkDataFrame", i = "Column"), - function(x, i, j, ...) { -# It could handle i as "character" but it seems confusing and not required -# https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html -filtered <- filter(x, i) -if (!missing(j)) { - filtered[, j, ...] -} else { +# If something is to be projected, then do so on the filtered SparkDataFrame +if (missing(j)) { filtered +} else { + if (is.numeric(j)) { +cols <- columns(filtered) +j <- cols[j] + } + if (length(j) > 1) { +j <- as.list(j) + } + selected <- select(filtered, j) + + # Acknowledge parameter drop. 
Return a Column or SparkDataFrame accordingly + if (ncol(selected) == 1 & drop == T) { +getColumn(selected, names(selected)) + } else { +selected + } }
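The resulting behavior, side by side (same 1.x-era session setup as the commit's own examples, with `sqlContext` already created):

```r
irisDF <- createDataFrame(sqlContext, iris)
name <- "Sepal_Length"

# drop = TRUE unwraps a single selected column to a Column object,
# matching how iris[, name] yields a numeric vector in base R.
class(irisDF[, name, drop = TRUE])    # "Column"

# drop = FALSE (the default) keeps the one-column result as a DataFrame,
# matching iris[, name, drop = FALSE] returning a data.frame.
class(irisDF[, name, drop = FALSE])   # "DataFrame"
```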
spark git commit: [SPARK-12235][SPARKR] Enhance mutate() to support replace existing columns.
Repository: spark Updated Branches: refs/heads/master 23256be0d -> 9e785079b [SPARK-12235][SPARKR] Enhance mutate() to support replace existing columns. Make the behavior of mutate more consistent with that in dplyr, in addition to supporting replacement of existing columns. 1. Throw an error message when there are duplicated column names in the DataFrame being mutated. 2. When there are duplicated column names among the columns specified by arguments, the last column of the same name takes effect. Author: Sun Rui Closes #10220 from sun-rui/SPARK-12235. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e785079 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e785079 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e785079 Branch: refs/heads/master Commit: 9e785079b6ed4ea691c3c14c762a7f73fb6254bf Parents: 23256be Author: Sun Rui Authored: Thu Apr 28 09:33:58 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Apr 28 09:33:58 2016 -0700 -- R/pkg/R/DataFrame.R | 60 ++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 2 files changed, 69 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e785079/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 48ac1b0..a741fdf 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1431,11 +1431,11 @@ setMethod("withColumn", #' Mutate #' -#' Return a new SparkDataFrame with the specified columns added. +#' Return a new SparkDataFrame with the specified columns added or replaced. #' #' @param .data A SparkDataFrame #' @param col a named argument of the form name = col -#' @return A new SparkDataFrame with the new columns added. +#' @return A new SparkDataFrame with the new columns added or replaced. #' @family SparkDataFrame functions #' @rdname mutate #' @name mutate @@ -1450,23 +1450,65 @@ setMethod("withColumn", #' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2) #' names(newDF) # Will contain newCol, newCol2 #' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2) +#' +#' df <- createDataFrame(sqlContext, +#' list(list("Andy", 30L), list("Justin", 19L)), c("name", "age")) +#' # Replace the "age" column +#' df1 <- mutate(df, age = df$age + 1L) #' } setMethod("mutate", signature(.data = "SparkDataFrame"), function(.data, ...) { x <- .data cols <- list(...) -stopifnot(length(cols) > 0) -stopifnot(class(cols[[1]]) == "Column") +if (length(cols) <= 0) { + return(x) +} + +lapply(cols, function(col) { + stopifnot(class(col) == "Column") +}) + +# Check if there is any duplicated column name in the DataFrame +dfCols <- columns(x) +if (length(unique(dfCols)) != length(dfCols)) { + stop("Error: found duplicated column name in the DataFrame") +} + +# TODO: simplify the implementation of this method after SPARK-12225 is resolved. 
+ +# For named arguments, use the names for arguments as the column names +# For unnamed arguments, use the argument symbols as the column names +args <- sapply(substitute(list(...))[-1], deparse) ns <- names(cols) if (!is.null(ns)) { - for (n in ns) { -if (n != "") { - cols[[n]] <- alias(cols[[n]], n) + lapply(seq_along(args), function(i) { +if (ns[[i]] != "") { + args[[i]] <<- ns[[i]] } - } + }) +} +ns <- args + +# The last column of the same name in the specific columns takes effect +deDupCols <- list() +for (i in 1:length(cols)) { + deDupCols[[ns[[i]]]] <- alias(cols[[i]], ns[[i]]) } -do.call(select, c(x, x$"*", cols)) + +# Construct the column list for projection +colList <- lapply(dfCols, function(col) { + if (!is.null(deDupCols[[col]])) { +# Replace existing column +tmpCol <- deDupCols[[col]] +deDu
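In practice the replace-or-add semantics look like this, mirroring the roxygen example added above (1.x-era API with an existing `sqlContext`):

```r
df <- createDataFrame(sqlContext,
                      list(list("Andy", 30L), list("Justin", 19L)),
                      c("name", "age"))

# Replacing an existing column: "age" keeps its position in the schema.
df1 <- mutate(df, age = df$age + 1L)
head(df1)

# When the same name is assigned twice, the last assignment takes effect.
df2 <- mutate(df, newAge = df$age * 2L, newAge = df$age + 1L)
head(df2)   # newAge is age + 1
```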
spark git commit: [SPARK-12919][SPARKR] Implement dapply() on DataFrame in SparkR.
Repository: spark Updated Branches: refs/heads/master d78fbcc3c -> 4ae9fe091 [SPARK-12919][SPARKR] Implement dapply() on DataFrame in SparkR. ## What changes were proposed in this pull request? dapply() applies an R function on each partition of a DataFrame and returns a new DataFrame. The function signature is: dapply(df, function(localDF) {}, schema = NULL) R function input: local data.frame from the partition on local node R function output: local data.frame Schema specifies the Row format of the resulting DataFrame. It must match the R function's output. If schema is not specified, each partition of the result DataFrame will be serialized in R into a single byte array. Such resulting DataFrame can be processed by successive calls to dapply(). ## How was this patch tested? SparkR unit tests. Author: Sun Rui Author: Sun Rui Closes #12493 from sun-rui/SPARK-12919. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ae9fe09 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ae9fe09 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ae9fe09 Branch: refs/heads/master Commit: 4ae9fe091c2cb8388c581093d62d3deaef40993e Parents: d78fbcc Author: Sun Rui Authored: Fri Apr 29 16:41:07 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Apr 29 16:41:07 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 61 ++ R/pkg/R/generics.R | 4 ++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 40 R/pkg/inst/worker/worker.R | 36 ++- .../scala/org/apache/spark/api/r/RRDD.scala | 2 +- .../scala/org/apache/spark/api/r/RRunner.scala | 13 +++- .../scala/org/apache/spark/api/r/SerDe.scala| 2 +- docs/sql-programming-guide.md | 5 ++ .../sql/catalyst/optimizer/Optimizer.scala | 13 ++-- .../sql/catalyst/plans/logical/object.scala | 54 +++- .../scala/org/apache/spark/sql/Dataset.scala| 18 ++ .../org/apache/spark/sql/api/r/SQLUtils.scala | 32 - .../spark/sql/execution/SparkStrategies.scala | 3 + .../sql/execution/r/MapPartitionsRWrapper.scala | 68 15 files changed, 337 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ae9fe09/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 002e469..647db22 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -45,6 +45,7 @@ exportMethods("arrange", "covar_samp", "covar_pop", "crosstab", + "dapply", "describe", "dim", "distinct", http://git-wip-us.apache.org/repos/asf/spark/blob/4ae9fe09/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a741fdf..9e30fa0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -21,6 +21,7 @@ NULL setOldClass("jobj") +setOldClass("structType") #' @title S4 class that represents a SparkDataFrame #' @description DataFrames can be created using functions like \link{createDataFrame}, @@ -1125,6 +1126,66 @@ setMethod("summarize", agg(x, ...) }) +#' dapply +#' +#' Apply a function to each partition of a DataFrame. +#' +#' @param x A SparkDataFrame +#' @param func A function to be applied to each partition of the SparkDataFrame. +#' func should have only one parameter, to which a data.frame corresponds +#' to each partition will be passed. +#' The output of func should be a data.frame. +#' @param schema The schema of the resulting DataFrame after the function is applied. +#' It must match the output of func. 
+#' @family SparkDataFrame functions +#' @rdname dapply +#' @name dapply +#' @export +#' @examples +#' \dontrun{ +#' df <- createDataFrame (sqlContext, iris) +#' df1 <- dapply(df, function(x) { x }, schema(df)) +#' collect(df1) +#' +#' # filter and add a column +#' df <- createDataFrame ( +#' sqlContext, +#' list(list(1L, 1, "1"), list(2L, 2, "2"), list(3L, 3, "3")), +#' c("a", "b", "c")) +#' schema <- structType(structField("a", "integer"), stru
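A minimal usage sketch of the new API (the local data and column names are illustrative, not from the patch; assumes an initialized SparkR context as in the examples above):

```r
# Apply an R function to each partition and get back a new SparkDataFrame.
df <- createDataFrame(sqlContext, data.frame(a = 1:3, b = c(10, 20, 30)))

outSchema <- structType(structField("a", "integer"),
                        structField("b", "double"),
                        structField("sum", "double"))

df1 <- dapply(df, function(ldf) {
  # ldf is the local data.frame holding one partition
  cbind(ldf, sum = ldf$a + ldf$b)
}, outSchema)

collect(df1)
```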
spark git commit: [SPARK-15091][SPARKR] Fix warnings and a failure in SparkR test cases with testthat version 1.0.1
Repository: spark Updated Branches: refs/heads/master d26f7cb01 -> 8b6491fc0 [SPARK-15091][SPARKR] Fix warnings and a failure in SparkR test cases with testthat version 1.0.1 ## What changes were proposed in this pull request? Fix warnings and a failure in SparkR test cases with testthat version 1.0.1 ## How was this patch tested? SparkR unit test cases. Author: Sun Rui Closes #12867 from sun-rui/SPARK-15091. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b6491fc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b6491fc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b6491fc Branch: refs/heads/master Commit: 8b6491fc0b49b4e363887ae4b452ba69fe0290d5 Parents: d26f7cb Author: Sun Rui Authored: Tue May 3 09:29:49 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue May 3 09:29:49 2016 -0700 -- R/pkg/inst/tests/testthat/test_client.R | 2 +- R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_mllib.R| 4 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 +-- 4 files changed, 9 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8b6491fc/R/pkg/inst/tests/testthat/test_client.R -- diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/testthat/test_client.R index a0664f3..28276a0 100644 --- a/R/pkg/inst/tests/testthat/test_client.R +++ b/R/pkg/inst/tests/testthat/test_client.R @@ -32,7 +32,7 @@ test_that("no package specified doesn't add packages flag", { }) test_that("multiple packages don't produce a warning", { - expect_that(generateSparkSubmitArgs("", "", "", "", c("A", "B")), not(gives_warning())) + expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA) }) test_that("sparkJars sparkPackages as character vectors", { http://git-wip-us.apache.org/repos/asf/spark/blob/8b6491fc/R/pkg/inst/tests/testthat/test_context.R -- diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index ca04342..0e5e15c 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -138,7 +138,7 @@ test_that("sparkJars sparkPackages as comma-separated strings", { # check normalizePath f <- dir()[[1]] - expect_that(processSparkJars(f), not(gives_warning())) + expect_warning(processSparkJars(f), NA) expect_match(processSparkJars(f), f) }) http://git-wip-us.apache.org/repos/asf/spark/blob/8b6491fc/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 37d87aa..5f8a27d 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -450,9 +450,9 @@ test_that("spark.survreg", { if (requireNamespace("survival", quietly = TRUE)) { rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0), x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1)) -expect_that( +expect_error( model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData), - not(throws_error())) + NA) expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4) } }) http://git-wip-us.apache.org/repos/asf/spark/blob/8b6491fc/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 5cf9dc4..081f7b1 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1196,9 +1196,9 @@ 
test_that("date functions on a DataFrame", { c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC"))) expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1], c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC"))) - expect_more_than(collect(select(df2, unix_timestamp()))[1, 1], 0) - expect_more_than(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0) - expect_more_than(collect(select(df2, unix_timestamp(lit("2015-01-01"), "-MM-dd")))[1, 1], 0) + expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0) +
spark git commit: [SPARK-15091][SPARKR] Fix warnings and a failure in SparkR test cases with testthat version 1.0.1
Repository: spark Updated Branches: refs/heads/branch-2.0 932e1b5b2 -> a373c39a9 [SPARK-15091][SPARKR] Fix warnings and a failure in SparkR test cases with testthat version 1.0.1 ## What changes were proposed in this pull request? Fix warnings and a failure in SparkR test cases with testthat version 1.0.1 ## How was this patch tested? SparkR unit test cases. Author: Sun Rui Closes #12867 from sun-rui/SPARK-15091. (cherry picked from commit 8b6491fc0b49b4e363887ae4b452ba69fe0290d5) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a373c39a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a373c39a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a373c39a Branch: refs/heads/branch-2.0 Commit: a373c39a98a395e78ac4c0116c47a9eec39ac3e6 Parents: 932e1b5 Author: Sun Rui Authored: Tue May 3 09:29:49 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue May 3 09:29:57 2016 -0700 -- R/pkg/inst/tests/testthat/test_client.R | 2 +- R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_mllib.R| 4 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 +-- 4 files changed, 9 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a373c39a/R/pkg/inst/tests/testthat/test_client.R -- diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/testthat/test_client.R index a0664f3..28276a0 100644 --- a/R/pkg/inst/tests/testthat/test_client.R +++ b/R/pkg/inst/tests/testthat/test_client.R @@ -32,7 +32,7 @@ test_that("no package specified doesn't add packages flag", { }) test_that("multiple packages don't produce a warning", { - expect_that(generateSparkSubmitArgs("", "", "", "", c("A", "B")), not(gives_warning())) + expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA) }) test_that("sparkJars sparkPackages as character vectors", { http://git-wip-us.apache.org/repos/asf/spark/blob/a373c39a/R/pkg/inst/tests/testthat/test_context.R -- diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index ca04342..0e5e15c 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -138,7 +138,7 @@ test_that("sparkJars sparkPackages as comma-separated strings", { # check normalizePath f <- dir()[[1]] - expect_that(processSparkJars(f), not(gives_warning())) + expect_warning(processSparkJars(f), NA) expect_match(processSparkJars(f), f) }) http://git-wip-us.apache.org/repos/asf/spark/blob/a373c39a/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 37d87aa..5f8a27d 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -450,9 +450,9 @@ test_that("spark.survreg", { if (requireNamespace("survival", quietly = TRUE)) { rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0), x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1)) -expect_that( +expect_error( model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData), - not(throws_error())) + NA) expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4) } }) http://git-wip-us.apache.org/repos/asf/spark/blob/a373c39a/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 5cf9dc4..081f7b1 100644 --- 
a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1196,9 +1196,9 @@ test_that("date functions on a DataFrame", { c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC"))) expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1], c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC"))) - expect_more_than(collect(select(df2, unix_timestamp()))[1, 1], 0) - expect_more_than(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0) - expect_more_than(collect(select(df2, unix_timestamp(lit("2015
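Both branches apply the same testthat 1.0.x idiom: passing `NA` as the expected pattern to `expect_warning()` or `expect_error()` asserts that no warning or error occurs, replacing the removed `not(gives_warning())` / `not(throws_error())` helpers. A standalone sketch:

```r
library(testthat)

test_that("an expression runs cleanly under testthat >= 1.0", {
  # Old style (removed): expect_that(log(1), not(throws_error()))
  expect_error(log(1), NA)     # NA means: expect no error
  expect_warning(log(1), NA)   # NA means: expect no warning
})
```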
spark git commit: [SPARK-11395][SPARKR] Support over and window specification in SparkR.
Repository: spark Updated Branches: refs/heads/master 7f5922aa4 -> 157a49aa4 [SPARK-11395][SPARKR] Support over and window specification in SparkR. This PR: 1. Implement WindowSpec S4 class. 2. Implement Window.partitionBy() and Window.orderBy() as utility functions to create WindowSpec objects. 3. Implement over() of Column class. Author: Sun Rui Author: Sun Rui Closes #10094 from sun-rui/SPARK-11395. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/157a49aa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/157a49aa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/157a49aa Branch: refs/heads/master Commit: 157a49aa410dc1870cd171148d317084c5a90d23 Parents: 7f5922a Author: Sun Rui Authored: Thu May 5 18:49:43 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu May 5 18:49:43 2016 -0700 -- R/pkg/DESCRIPTION | 2 + R/pkg/NAMESPACE | 10 ++ R/pkg/R/DataFrame.R | 4 +- R/pkg/R/WindowSpec.R | 188 + R/pkg/R/generics.R| 29 +++- R/pkg/R/pairRDD.R | 4 +- R/pkg/R/window.R | 98 + R/pkg/inst/tests/testthat/test_sparkSQL.R | 36 + 8 files changed, 364 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/157a49aa/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 7179438..963a1bb 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -26,6 +26,7 @@ Collate: 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' +'WindowSpec.R' 'backend.R' 'broadcast.R' 'client.R' @@ -38,4 +39,5 @@ Collate: 'stats.R' 'types.R' 'utils.R' +'window.R' RoxygenNote: 5.0.1 http://git-wip-us.apache.org/repos/asf/spark/blob/157a49aa/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 73f7c59..1432ab8 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -216,6 +216,7 @@ exportMethods("%in%", "next_day", "ntile", "otherwise", + "over", "percent_rank", "pmod", "quarter", @@ -315,3 +316,12 @@ export("structField", "structType.jobj", "structType.structField", "print.structType") + +exportClasses("WindowSpec") + +export("partitionBy", + "rowsBetween", + "rangeBetween") + +export("window.partitionBy", + "window.orderBy") http://git-wip-us.apache.org/repos/asf/spark/blob/157a49aa/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index fcf473a..43c46b8 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1749,8 +1749,8 @@ setMethod("arrange", #' @export setMethod("orderBy", signature(x = "SparkDataFrame", col = "characterOrColumn"), - function(x, col) { -arrange(x, col) + function(x, col, ...) { +arrange(x, col, ...) }) #' Filter http://git-wip-us.apache.org/repos/asf/spark/blob/157a49aa/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R new file mode 100644 index 000..581176a --- /dev/null +++ b/R/pkg/R/WindowSpec.R @@ -0,0 +1,188 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# WindowSpec.R - WindowSpec class and methods imple
spark git commit: [SPARK-11395][SPARKR] Support over and window specification in SparkR.
Repository: spark Updated Branches: refs/heads/branch-2.0 7dc3fb6ae -> 42f2ee6c5 [SPARK-11395][SPARKR] Support over and window specification in SparkR. This PR: 1. Implement WindowSpec S4 class. 2. Implement Window.partitionBy() and Window.orderBy() as utility functions to create WindowSpec objects. 3. Implement over() of Column class. Author: Sun Rui Author: Sun Rui Closes #10094 from sun-rui/SPARK-11395. (cherry picked from commit 157a49aa410dc1870cd171148d317084c5a90d23) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/42f2ee6c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/42f2ee6c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/42f2ee6c Branch: refs/heads/branch-2.0 Commit: 42f2ee6c5d981cdc8bd6b3845f0593a87aae48b6 Parents: 7dc3fb6 Author: Sun Rui Authored: Thu May 5 18:49:43 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu May 5 18:49:51 2016 -0700 -- R/pkg/DESCRIPTION | 2 + R/pkg/NAMESPACE | 10 ++ R/pkg/R/DataFrame.R | 4 +- R/pkg/R/WindowSpec.R | 188 + R/pkg/R/generics.R| 29 +++- R/pkg/R/pairRDD.R | 4 +- R/pkg/R/window.R | 98 + R/pkg/inst/tests/testthat/test_sparkSQL.R | 36 + 8 files changed, 364 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/42f2ee6c/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 7179438..963a1bb 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -26,6 +26,7 @@ Collate: 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' +'WindowSpec.R' 'backend.R' 'broadcast.R' 'client.R' @@ -38,4 +39,5 @@ Collate: 'stats.R' 'types.R' 'utils.R' +'window.R' RoxygenNote: 5.0.1 http://git-wip-us.apache.org/repos/asf/spark/blob/42f2ee6c/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 73f7c59..1432ab8 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -216,6 +216,7 @@ exportMethods("%in%", "next_day", "ntile", "otherwise", + "over", "percent_rank", "pmod", "quarter", @@ -315,3 +316,12 @@ export("structField", "structType.jobj", "structType.structField", "print.structType") + +exportClasses("WindowSpec") + +export("partitionBy", + "rowsBetween", + "rangeBetween") + +export("window.partitionBy", + "window.orderBy") http://git-wip-us.apache.org/repos/asf/spark/blob/42f2ee6c/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index fcf473a..43c46b8 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1749,8 +1749,8 @@ setMethod("arrange", #' @export setMethod("orderBy", signature(x = "SparkDataFrame", col = "characterOrColumn"), - function(x, col) { -arrange(x, col) + function(x, col, ...) { +arrange(x, col, ...) }) #' Filter http://git-wip-us.apache.org/repos/asf/spark/blob/42f2ee6c/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R new file mode 100644 index 000..581176a --- /dev/null +++ b/R/pkg/R/WindowSpec.R @@ -0,0 +1,188 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific languag
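A minimal sketch of how the three new pieces compose, assuming an existing SparkDataFrame `df` with illustrative columns `"dept"` and `"salary"`:

```r
# 1. window.partitionBy() creates a WindowSpec;
# 2. orderBy() refines it;
# 3. over() evaluates a window function against it.
ws <- orderBy(window.partitionBy("dept"), "salary")
df$rank <- over(rank(), ws)
head(df)
```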
spark git commit: [SPARK-12479][SPARKR] sparkR collect on GroupedData throws R error "missing value where TRUE/FALSE needed"
Repository: spark Updated Branches: refs/heads/master 6e268b9ee -> 454ba4d67 [SPARK-12479][SPARKR] sparkR collect on GroupedData throws R error "missing value where TRUE/FALSE needed" ## What changes were proposed in this pull request? This PR is a workaround for NA handling in hash code computation. This PR is on behalf of paulomagalhaes whose PR is https://github.com/apache/spark/pull/10436 ## How was this patch tested? SparkR unit tests. Author: Sun Rui Author: ray Closes #12976 from sun-rui/SPARK-12479. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/454ba4d6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/454ba4d6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/454ba4d6 Branch: refs/heads/master Commit: 454ba4d67e782369627dfe60261e6648a27b91a0 Parents: 6e268b9 Author: Sun Rui Authored: Sun May 8 00:17:36 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun May 8 00:17:36 2016 -0700 -- R/pkg/R/utils.R| 3 +++ R/pkg/inst/tests/testthat/test_utils.R | 4 2 files changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/454ba4d6/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index bf67e23..784f737 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -157,8 +157,11 @@ wrapInt <- function(value) { # Multiply `val` by 31 and add `addVal` to the result. Ensures that # integer-overflows are handled at every step. +# +# TODO: this function does not handle integer overflow well mult31AndAdd <- function(val, addVal) { vec <- c(bitwShiftL(val, c(4, 3, 2, 1, 0)), addVal) + vec[is.na(vec)] <- 0 Reduce(function(a, b) { wrapInt(as.numeric(a) + as.numeric(b)) }, http://git-wip-us.apache.org/repos/asf/spark/blob/454ba4d6/R/pkg/inst/tests/testthat/test_utils.R -- diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R index 01694ab..54d2eca 100644 --- a/R/pkg/inst/tests/testthat/test_utils.R +++ b/R/pkg/inst/tests/testthat/test_utils.R @@ -164,3 +164,7 @@ test_that("convertToJSaveMode", { expect_error(convertToJSaveMode("foo"), 'mode should be one of "append", "overwrite", "error", "ignore"') #nolint }) + +test_that("hashCode", { + expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA) +})
spark git commit: [SPARK-12479][SPARKR] sparkR collect on GroupedData throws R error "missing value where TRUE/FALSE needed"
Repository: spark Updated Branches: refs/heads/branch-2.0 69f3edc32 -> cf156e611 [SPARK-12479][SPARKR] sparkR collect on GroupedData throws R error "missing value where TRUE/FALSE needed" ## What changes were proposed in this pull request? This PR is a workaround for NA handling in hash code computation. This PR is on behalf of paulomagalhaes whose PR is https://github.com/apache/spark/pull/10436 ## How was this patch tested? SparkR unit tests. Author: Sun Rui Author: ray Closes #12976 from sun-rui/SPARK-12479. (cherry picked from commit 454ba4d67e782369627dfe60261e6648a27b91a0) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf156e61 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf156e61 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf156e61 Branch: refs/heads/branch-2.0 Commit: cf156e611f3e0e2e71463b87506b0233c66eb6d0 Parents: 69f3edc Author: Sun Rui Authored: Sun May 8 00:17:36 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun May 8 00:17:45 2016 -0700 -- R/pkg/R/utils.R| 3 +++ R/pkg/inst/tests/testthat/test_utils.R | 4 2 files changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf156e61/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index bf67e23..784f737 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -157,8 +157,11 @@ wrapInt <- function(value) { # Multiply `val` by 31 and add `addVal` to the result. Ensures that # integer-overflows are handled at every step. +# +# TODO: this function does not handle integer overflow well mult31AndAdd <- function(val, addVal) { vec <- c(bitwShiftL(val, c(4, 3, 2, 1, 0)), addVal) + vec[is.na(vec)] <- 0 Reduce(function(a, b) { wrapInt(as.numeric(a) + as.numeric(b)) }, http://git-wip-us.apache.org/repos/asf/spark/blob/cf156e61/R/pkg/inst/tests/testthat/test_utils.R -- diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R index 01694ab..54d2eca 100644 --- a/R/pkg/inst/tests/testthat/test_utils.R +++ b/R/pkg/inst/tests/testthat/test_utils.R @@ -164,3 +164,7 @@ test_that("convertToJSaveMode", { expect_error(convertToJSaveMode("foo"), 'mode should be one of "append", "overwrite", "error", "ignore"') #nolint }) + +test_that("hashCode", { + expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA) +})
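The heart of the workaround is masking `NA` before the reduction. A standalone sketch of the pattern, outside Spark: in R, a bit shift can yield `NA` (the shifted value can land on R's `NA` sentinel bit pattern), and any later `if (NA)` raises exactly the reported "missing value where TRUE/FALSE needed" error.

```r
# Zeroing out NAs keeps the hash reduction well defined.
vec <- c(NA_integer_, 3L, 5L)
vec[is.na(vec)] <- 0
Reduce(function(a, b) a + b, vec)   # 8, instead of NA
```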
spark git commit: [SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR.
Repository: spark Updated Branches: refs/heads/master bb1362eb3 -> b3930f74a [SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR. ## What changes were proposed in this pull request? dapplyCollect() applies an R function on each partition of a SparkDataFrame and collects the result back to R as a data.frame. ``` dapplyCollect(df, function(ldf) {...}) ``` ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #12989 from sun-rui/SPARK-15202. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3930f74 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3930f74 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3930f74 Branch: refs/heads/master Commit: b3930f74a0929b2cdcbbe5cbe34f0b1d35eb01cc Parents: bb1362e Author: Sun Rui Authored: Thu May 12 17:50:55 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu May 12 17:50:55 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 86 +- R/pkg/R/generics.R| 4 ++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 ++- 4 files changed, 95 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3930f74/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 1432ab8..239ad06 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -47,6 +47,7 @@ exportMethods("arrange", "covar_pop", "crosstab", "dapply", + "dapplyCollect", "describe", "dim", "distinct", http://git-wip-us.apache.org/repos/asf/spark/blob/b3930f74/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 43c46b8..0c2a194 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1153,9 +1153,27 @@ setMethod("summarize", agg(x, ...) }) +dapplyInternal <- function(x, func, schema) { + packageNamesArr <- serialize(.sparkREnv[[".packages"]], + connection = NULL) + + broadcastArr <- lapply(ls(.broadcastNames), + function(name) { get(name, .broadcastNames) }) + + sdf <- callJStatic( + "org.apache.spark.sql.api.r.SQLUtils", + "dapply", + x@sdf, + serialize(cleanClosure(func), connection = NULL), + packageNamesArr, + broadcastArr, + if (is.null(schema)) { schema } else { schema$jobj }) + dataFrame(sdf) +} + #' dapply #' -#' Apply a function to each partition of a DataFrame. +#' Apply a function to each partition of a SparkDataFrame. #' #' @param x A SparkDataFrame #' @param func A function to be applied to each partition of the SparkDataFrame. @@ -1197,21 +1215,57 @@ setMethod("summarize", setMethod("dapply", signature(x = "SparkDataFrame", func = "function", schema = "structType"), function(x, func, schema) { -packageNamesArr <- serialize(.sparkREnv[[".packages"]], - connection = NULL) - -broadcastArr <- lapply(ls(.broadcastNames), - function(name) { get(name, .broadcastNames) }) - -sdf <- callJStatic( - "org.apache.spark.sql.api.r.SQLUtils", - "dapply", - x@sdf, - serialize(cleanClosure(func), connection = NULL), - packageNamesArr, - broadcastArr, - schema$jobj) -dataFrame(sdf) +dapplyInternal(x, func, schema) + }) + +#' dapplyCollect +#' +#' Apply a function to each partition of a SparkDataFrame and collect the result back +#’ to R as a data.frame. +#' +#' @param x A SparkDataFrame +#' @param func A function to be applied to each partition of the SparkDataFrame. +#' func should have only one parameter, to which a data.frame corresponds +#' to each partition will be passed. +#' The output of func should be a data.frame. 
+#' @family SparkDataFrame functions +#' @rdname dapply +#' @name dapplyCollect +#' @export +#' @examples +#' \dontrun{ +#' df <- createDataFrame (sqlContext, iris) +#'
spark git commit: [SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR.
Repository: spark Updated Branches: refs/heads/branch-2.0 0d24fe09a -> 54c04aa5d [SPARK-15202][SPARKR] add dapplyCollect() method for DataFrame in SparkR. ## What changes were proposed in this pull request? dapplyCollect() applies an R function on each partition of a SparkDataFrame and collects the result back to R as a data.frame. ``` dapplyCollect(df, function(ldf) {...}) ``` ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #12989 from sun-rui/SPARK-15202. (cherry picked from commit b3930f74a0929b2cdcbbe5cbe34f0b1d35eb01cc) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54c04aa5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54c04aa5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54c04aa5 Branch: refs/heads/branch-2.0 Commit: 54c04aa5d0a6012eb58efd0e7cf6d1d287818fa8 Parents: 0d24fe0 Author: Sun Rui Authored: Thu May 12 17:50:55 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu May 12 17:51:02 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 86 +- R/pkg/R/generics.R| 4 ++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 ++- 4 files changed, 95 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54c04aa5/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 1432ab8..239ad06 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -47,6 +47,7 @@ exportMethods("arrange", "covar_pop", "crosstab", "dapply", + "dapplyCollect", "describe", "dim", "distinct", http://git-wip-us.apache.org/repos/asf/spark/blob/54c04aa5/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 43c46b8..0c2a194 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1153,9 +1153,27 @@ setMethod("summarize", agg(x, ...) }) +dapplyInternal <- function(x, func, schema) { + packageNamesArr <- serialize(.sparkREnv[[".packages"]], + connection = NULL) + + broadcastArr <- lapply(ls(.broadcastNames), + function(name) { get(name, .broadcastNames) }) + + sdf <- callJStatic( + "org.apache.spark.sql.api.r.SQLUtils", + "dapply", + x@sdf, + serialize(cleanClosure(func), connection = NULL), + packageNamesArr, + broadcastArr, + if (is.null(schema)) { schema } else { schema$jobj }) + dataFrame(sdf) +} + #' dapply #' -#' Apply a function to each partition of a DataFrame. +#' Apply a function to each partition of a SparkDataFrame. #' #' @param x A SparkDataFrame #' @param func A function to be applied to each partition of the SparkDataFrame. @@ -1197,21 +1215,57 @@ setMethod("summarize", setMethod("dapply", signature(x = "SparkDataFrame", func = "function", schema = "structType"), function(x, func, schema) { -packageNamesArr <- serialize(.sparkREnv[[".packages"]], - connection = NULL) - -broadcastArr <- lapply(ls(.broadcastNames), - function(name) { get(name, .broadcastNames) }) - -sdf <- callJStatic( - "org.apache.spark.sql.api.r.SQLUtils", - "dapply", - x@sdf, - serialize(cleanClosure(func), connection = NULL), - packageNamesArr, - broadcastArr, - schema$jobj) -dataFrame(sdf) +dapplyInternal(x, func, schema) + }) + +#' dapplyCollect +#' +#' Apply a function to each partition of a SparkDataFrame and collect the result back +#’ to R as a data.frame. +#' +#' @param x A SparkDataFrame +#' @param func A function to be applied to each partition of the SparkDataFrame. +#' func should have only one parameter, to which a data.frame corresponds +#' to each partition will be passed. 
+#' The output of func should be a data.frame. +#' @family SparkDataFrame functions +#' @rdname dapply +#' @name dapplyCollect +#&
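A minimal sketch contrasting this with `dapply()` (illustrative data; assumes an initialized SparkR context): no schema is required, and the result comes back as an ordinary local data.frame.

```r
df <- createDataFrame(sqlContext, data.frame(a = 1:3, b = c(10, 20, 30)))

# Collected straight back to R -- no schema argument needed.
ldf <- dapplyCollect(df, function(ldf) {
  cbind(ldf, sum = ldf$a + ldf$b)
})
class(ldf)   # "data.frame"
```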
spark git commit: [SPARK-18643][SPARKR] SparkR hangs at session start when installed as a package without Spark
Repository: spark Updated Branches: refs/heads/master d9eb4c721 -> b019b3a8a [SPARK-18643][SPARKR] SparkR hangs at session start when installed as a package without Spark ## What changes were proposed in this pull request? If SparkR is running as a package and it has previously downloaded Spark Jar it should be able to run as before without having to set SPARK_HOME. Basically with this bug the auto install Spark will only work in the first session. This seems to be a regression on the earlier behavior. Fix is to always try to install or check for the cached Spark if running in an interactive session. As discussed before, we should probably only install Spark iff running in an interactive session (R shell, RStudio etc) ## How was this patch tested? Manually Author: Felix Cheung Closes #16077 from felixcheung/rsessioninteractive. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b019b3a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b019b3a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b019b3a8 Branch: refs/heads/master Commit: b019b3a8ac49336e657f5e093fa2fba77f8d12d2 Parents: d9eb4c7 Author: Felix Cheung Authored: Sun Dec 4 20:25:11 2016 -0800 Committer: Shivaram Venkataraman Committed: Sun Dec 4 20:25:11 2016 -0800 -- R/pkg/R/sparkR.R | 5 - R/pkg/vignettes/sparkr-vignettes.Rmd | 4 ++-- docs/sparkr.md | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b019b3a8/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index a7152b4..43bff97 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -322,6 +322,9 @@ sparkRHive.init <- function(jsc = NULL) { #' SparkSession or initializes a new SparkSession. #' Additional Spark properties can be set in \code{...}, and these named parameters take priority #' over values in \code{master}, \code{appName}, named lists of \code{sparkConfig}. +#' When called in an interactive session, this checks for the Spark installation, and, if not +#' found, it will be downloaded and cached automatically. Alternatively, \code{install.spark} can +#' be called manually. #' #' For details on how to initialize and use SparkR, refer to SparkR programming guide at #' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}. @@ -565,7 +568,7 @@ sparkCheckInstall <- function(sparkHome, master, deployMode) { message(msg) NULL } else { - if (isMasterLocal(master)) { + if (interactive() || isMasterLocal(master)) { msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome) message(msg) packageLocalDir <- install.spark() http://git-wip-us.apache.org/repos/asf/spark/blob/b019b3a8/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 73a5e26..a36f8fc 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -94,13 +94,13 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. -If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). Alternatively, we provide an easy-to-use function `install.spark` to complete this process. You don't have to call it explicitly. 
We will check the installation when `sparkR.session` is called and `install.spark` function will be triggered automatically if no installation is found. +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() ``` -If you already have Spark installed, you don't have to install again and can pass the `sparkHome` argument to `sparkR.session` to let SparkR know where the Spark installation is. +If you already have Spark installed, you don't have
spark git commit: [SPARK-18643][SPARKR] SparkR hangs at session start when installed as a package without Spark
Repository: spark Updated Branches: refs/heads/branch-2.1 41d698ece -> c13c2939f [SPARK-18643][SPARKR] SparkR hangs at session start when installed as a package without Spark ## What changes were proposed in this pull request? If SparkR is running as a package and it has previously downloaded Spark Jar it should be able to run as before without having to set SPARK_HOME. Basically with this bug the auto install Spark will only work in the first session. This seems to be a regression on the earlier behavior. Fix is to always try to install or check for the cached Spark if running in an interactive session. As discussed before, we should probably only install Spark iff running in an interactive session (R shell, RStudio etc) ## How was this patch tested? Manually Author: Felix Cheung Closes #16077 from felixcheung/rsessioninteractive. (cherry picked from commit b019b3a8ac49336e657f5e093fa2fba77f8d12d2) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c13c2939 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c13c2939 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c13c2939 Branch: refs/heads/branch-2.1 Commit: c13c2939fb19901d86ee013aa7bb5e200d79be85 Parents: 41d698e Author: Felix Cheung Authored: Sun Dec 4 20:25:11 2016 -0800 Committer: Shivaram Venkataraman Committed: Sun Dec 4 20:25:21 2016 -0800 -- R/pkg/R/sparkR.R | 5 - R/pkg/vignettes/sparkr-vignettes.Rmd | 4 ++-- docs/sparkr.md | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c13c2939/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index a7152b4..43bff97 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -322,6 +322,9 @@ sparkRHive.init <- function(jsc = NULL) { #' SparkSession or initializes a new SparkSession. #' Additional Spark properties can be set in \code{...}, and these named parameters take priority #' over values in \code{master}, \code{appName}, named lists of \code{sparkConfig}. +#' When called in an interactive session, this checks for the Spark installation, and, if not +#' found, it will be downloaded and cached automatically. Alternatively, \code{install.spark} can +#' be called manually. #' #' For details on how to initialize and use SparkR, refer to SparkR programming guide at #' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}. @@ -565,7 +568,7 @@ sparkCheckInstall <- function(sparkHome, master, deployMode) { message(msg) NULL } else { - if (isMasterLocal(master)) { + if (interactive() || isMasterLocal(master)) { msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome) message(msg) packageLocalDir <- install.spark() http://git-wip-us.apache.org/repos/asf/spark/blob/c13c2939/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 73a5e26..a36f8fc 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -94,13 +94,13 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. -If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). 
Alternatively, we provide an easy-to-use function `install.spark` to complete this process. You don't have to call it explicitly. We will check the installation when `sparkR.session` is called and `install.spark` function will be triggered automatically if no installation is found. +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() ``` -If you already have Spark installed, you don't have to install again and can pass the `sparkHome` argument to `spark
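From a fresh interactive R session, the fixed flow looks like this (a sketch; the download only happens when no cached Spark installation is found):

```r
library(SparkR)

# In an interactive shell (R, RStudio), this now checks for -- and, if
# missing, downloads and caches -- a Spark installation in every session,
# not just the first one. SPARK_HOME need not be set.
sparkR.session()

# Or manage the cached copy explicitly:
install.spark()
```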
spark git commit: [MINOR][DOC] Use SparkR `TRUE` value and add default values for `StructField` in SQL Guide.
Repository: spark Updated Branches: refs/heads/master eb8dd6813 -> 410b78986 [MINOR][DOC] Use SparkR `TRUE` value and add default values for `StructField` in SQL Guide. ## What changes were proposed in this pull request? In `SQL Programming Guide`, this PR uses `TRUE` instead of `True` in SparkR and adds default values of `nullable` for `StructField` in Scala/Python/R (i.e., "Note: The default value of nullable is true."). In Java API, `nullable` is not optional. **BEFORE** * SPARK 2.1.0 RC1 http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc1-docs/sql-programming-guide.html#data-types **AFTER** * R https://cloud.githubusercontent.com/assets/9700541/20877443/abba19a6-ba7d-11e6-8984-afbe00333fb0.png * Scala https://cloud.githubusercontent.com/assets/9700541/20877433/99ce734a-ba7d-11e6-8bb5-e8619041b09b.png * Python https://cloud.githubusercontent.com/assets/9700541/20877440/a5c89338-ba7d-11e6-8f92-6c0ae9388d7e.png ## How was this patch tested? Manual. ``` cd docs SKIP_API=1 jekyll build open _site/index.html ``` Author: Dongjoon Hyun Closes #16141 from dongjoon-hyun/SPARK-SQL-GUIDE. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/410b7898 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/410b7898 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/410b7898 Branch: refs/heads/master Commit: 410b7898661f77e748564aaee6a5ab7747ce34ad Parents: eb8dd68 Author: Dongjoon Hyun Authored: Mon Dec 5 10:36:13 2016 -0800 Committer: Shivaram Venkataraman Committed: Mon Dec 5 10:36:13 2016 -0800 -- docs/sql-programming-guide.md | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/410b7898/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index c7ad06c..e59c327 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1851,7 +1851,8 @@ You can access them by doing The value type in Scala of the data type of this field (For example, Int for a StructField with the data type IntegerType) - StructField(name, dataType, nullable) + StructField(name, dataType, [nullable]) + Note: The default value of nullable is true. @@ -2139,7 +2140,8 @@ from pyspark.sql.types import * The value type in Python of the data type of this field (For example, Int for a StructField with the data type IntegerType) - StructField(name, dataType, nullable) + StructField(name, dataType, [nullable]) + Note: The default value of nullable is True. @@ -2260,7 +2262,7 @@ from pyspark.sql.types import * vector or list list(type="array", elementType=elementType, containsNull=[containsNull]) - Note: The default value of containsNull is True. + Note: The default value of containsNull is TRUE. @@ -2268,7 +2270,7 @@ from pyspark.sql.types import * environment list(type="map", keyType=keyType, valueType=valueType, valueContainsNull=[valueContainsNull]) - Note: The default value of valueContainsNull is True. + Note: The default value of valueContainsNull is TRUE. @@ -2285,7 +2287,8 @@ from pyspark.sql.types import * The value type in R of the data type of this field (For example, integer for a StructField with the data type IntegerType) - list(name=name, type=dataType, nullable=nullable) + list(name=name, type=dataType, nullable=[nullable]) + Note: The default value of nullable is TRUE.
spark git commit: [MINOR][DOC] Use SparkR `TRUE` value and add default values for `StructField` in SQL Guide.
Repository: spark Updated Branches: refs/heads/branch-2.1 1821cbead -> afd2321b6 [MINOR][DOC] Use SparkR `TRUE` value and add default values for `StructField` in SQL Guide. ## What changes were proposed in this pull request? In `SQL Programming Guide`, this PR uses `TRUE` instead of `True` in SparkR and adds default values of `nullable` for `StructField` in Scala/Python/R (i.e., "Note: The default value of nullable is true."). In Java API, `nullable` is not optional. **BEFORE** * SPARK 2.1.0 RC1 http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc1-docs/sql-programming-guide.html#data-types **AFTER** * R https://cloud.githubusercontent.com/assets/9700541/20877443/abba19a6-ba7d-11e6-8984-afbe00333fb0.png * Scala https://cloud.githubusercontent.com/assets/9700541/20877433/99ce734a-ba7d-11e6-8bb5-e8619041b09b.png * Python https://cloud.githubusercontent.com/assets/9700541/20877440/a5c89338-ba7d-11e6-8f92-6c0ae9388d7e.png ## How was this patch tested? Manual. ``` cd docs SKIP_API=1 jekyll build open _site/index.html ``` Author: Dongjoon Hyun Closes #16141 from dongjoon-hyun/SPARK-SQL-GUIDE. (cherry picked from commit 410b7898661f77e748564aaee6a5ab7747ce34ad) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/afd2321b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/afd2321b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/afd2321b Branch: refs/heads/branch-2.1 Commit: afd2321b689fb29d18fee1840f5a0058cefd6d60 Parents: 1821cbe Author: Dongjoon Hyun Authored: Mon Dec 5 10:36:13 2016 -0800 Committer: Shivaram Venkataraman Committed: Mon Dec 5 10:36:26 2016 -0800 -- docs/sql-programming-guide.md | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/afd2321b/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 51ba911..d57f22e 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1840,7 +1840,8 @@ You can access them by doing The value type in Scala of the data type of this field (For example, Int for a StructField with the data type IntegerType) - StructField(name, dataType, nullable) + StructField(name, dataType, [nullable]) + Note: The default value of nullable is true. @@ -2128,7 +2129,8 @@ from pyspark.sql.types import * The value type in Python of the data type of this field (For example, Int for a StructField with the data type IntegerType) - StructField(name, dataType, nullable) + StructField(name, dataType, [nullable]) + Note: The default value of nullable is True. @@ -2249,7 +2251,7 @@ from pyspark.sql.types import * vector or list list(type="array", elementType=elementType, containsNull=[containsNull]) - Note: The default value of containsNull is True. + Note: The default value of containsNull is TRUE. @@ -2257,7 +2259,7 @@ from pyspark.sql.types import * environment list(type="map", keyType=keyType, valueType=valueType, valueContainsNull=[valueContainsNull]) - Note: The default value of valueContainsNull is True. + Note: The default value of valueContainsNull is TRUE. @@ -2274,7 +2276,8 @@ from pyspark.sql.types import * The value type in R of the data type of this field (For example, integer for a StructField with the data type IntegerType) - list(name=name, type=dataType, nullable=nullable) + list(name=name, type=dataType, nullable=[nullable]) + Note: The default value of nullable is TRUE.
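The R form documented above, as a short runnable sketch (field names are illustrative):

```r
library(SparkR)

schema <- structType(
  structField("age", "integer"),                   # nullable defaults to TRUE
  structField("name", "string", nullable = FALSE)  # opt out explicitly
)
```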
spark git commit: [SPARK-18590][SPARKR] build R source package when making distribution
Repository: spark Updated Branches: refs/heads/master 3c68944b2 -> c3d3a9d0e [SPARK-18590][SPARKR] build R source package when making distribution ## What changes were proposed in this pull request? This PR has 2 key changes. One, we are building source package (aka bundle package) for SparkR which could be released on CRAN. Two, we should include in the official Spark binary distributions SparkR installed from this source package instead (which would have help/vignettes rds needed for those to work when the SparkR package is loaded in R, whereas earlier approach with devtools does not) But, because of various differences in how R performs different tasks, this PR is a fair bit more complicated. More details below. This PR also includes a few minor fixes. ### more details These are the additional steps in make-distribution; please see [here](https://github.com/apache/spark/blob/master/R/CRAN_RELEASE.md) on what's going to a CRAN release, which is now run during make-distribution.sh. 1. package needs to be installed because the first code block in vignettes is `library(SparkR)` without lib path 2. `R CMD build` will build vignettes (this process runs Spark/SparkR code and captures outputs into pdf documentation) 3. `R CMD check` on the source package will install package and build vignettes again (this time from source packaged) - this is a key step required to release R package on CRAN (will skip tests here but tests will need to pass for CRAN release process to success - ideally, during release signoff we should install from the R source package and run tests) 4. `R CMD Install` on the source package (this is the only way to generate doc/vignettes rds files correctly, not in step # 1) (the output of this step is what we package into Spark dist and sparkr.zip) Alternatively, R CMD build should already be installing the package in a temp directory though it might just be finding this location and set it to lib.loc parameter; another approach is perhaps we could try calling `R CMD INSTALL --build pkg` instead. But in any case, despite installing the package multiple times this is relatively fast. Building vignettes takes a while though. ## How was this patch tested? Manually, CI. Author: Felix Cheung Closes #16014 from felixcheung/rdist. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3d3a9d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3d3a9d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3d3a9d0 Branch: refs/heads/master Commit: c3d3a9d0e85b834abef87069e4edd27db87fc607 Parents: 3c68944 Author: Felix Cheung Authored: Thu Dec 8 11:29:31 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 11:29:31 2016 -0800 -- R/CRAN_RELEASE.md | 2 +- R/check-cran.sh | 19 ++- R/install-dev.sh| 2 +- R/pkg/.Rbuildignore | 3 +++ R/pkg/DESCRIPTION | 13 ++--- R/pkg/NAMESPACE | 2 +- dev/create-release/release-build.sh | 27 +++ dev/make-distribution.sh| 25 + 8 files changed, 74 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c3d3a9d0/R/CRAN_RELEASE.md -- diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md index bea8f9f..d6084c7 100644 --- a/R/CRAN_RELEASE.md +++ b/R/CRAN_RELEASE.md @@ -7,7 +7,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. 
-Note that while `check-cran.sh` is running `R CMD check`, it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. +Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`). To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SP
spark git commit: [SPARK-18590][SPARKR] build R source package when making distribution
Repository: spark Updated Branches: refs/heads/branch-2.1 e0173f14e -> d69df9073 [SPARK-18590][SPARKR] build R source package when making distribution This PR has 2 key changes. One, we are building source package (aka bundle package) for SparkR which could be released on CRAN. Two, we should include in the official Spark binary distributions SparkR installed from this source package instead (which would have help/vignettes rds needed for those to work when the SparkR package is loaded in R, whereas earlier approach with devtools does not) But, because of various differences in how R performs different tasks, this PR is a fair bit more complicated. More details below. This PR also includes a few minor fixes. These are the additional steps in make-distribution; please see [here](https://github.com/apache/spark/blob/master/R/CRAN_RELEASE.md) on what's going to a CRAN release, which is now run during make-distribution.sh. 1. package needs to be installed because the first code block in vignettes is `library(SparkR)` without lib path 2. `R CMD build` will build vignettes (this process runs Spark/SparkR code and captures outputs into pdf documentation) 3. `R CMD check` on the source package will install package and build vignettes again (this time from source packaged) - this is a key step required to release R package on CRAN (will skip tests here but tests will need to pass for CRAN release process to success - ideally, during release signoff we should install from the R source package and run tests) 4. `R CMD Install` on the source package (this is the only way to generate doc/vignettes rds files correctly, not in step # 1) (the output of this step is what we package into Spark dist and sparkr.zip) Alternatively, R CMD build should already be installing the package in a temp directory though it might just be finding this location and set it to lib.loc parameter; another approach is perhaps we could try calling `R CMD INSTALL --build pkg` instead. But in any case, despite installing the package multiple times this is relatively fast. Building vignettes takes a while though. Manually, CI. Author: Felix Cheung Closes #16014 from felixcheung/rdist. (cherry picked from commit c3d3a9d0e85b834abef87069e4edd27db87fc607) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d69df907 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d69df907 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d69df907 Branch: refs/heads/branch-2.1 Commit: d69df9073274f7ab3a3598bb182a3233fd7775cd Parents: e0173f1 Author: Felix Cheung Authored: Thu Dec 8 11:29:31 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 11:31:24 2016 -0800 -- R/CRAN_RELEASE.md | 2 +- R/check-cran.sh | 19 ++- R/install-dev.sh| 2 +- R/pkg/.Rbuildignore | 3 +++ R/pkg/DESCRIPTION | 13 ++--- R/pkg/NAMESPACE | 2 +- dev/create-release/release-build.sh | 27 +++ dev/make-distribution.sh| 25 + 8 files changed, 74 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d69df907/R/CRAN_RELEASE.md -- diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md index bea8f9f..d6084c7 100644 --- a/R/CRAN_RELEASE.md +++ b/R/CRAN_RELEASE.md @@ -7,7 +7,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. 
-Note that while `check-cran.sh` is running `R CMD check`, it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. +Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`). To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build
spark git commit: [SPARK-18590][SPARKR] Change the R source build to Hadoop 2.6
Repository: spark Updated Branches: refs/heads/branch-2.1 9483242f4 -> e43209fe2 [SPARK-18590][SPARKR] Change the R source build to Hadoop 2.6 This PR changes the SparkR source release tarball to be built using the Hadoop 2.6 profile. Previously it was using the without hadoop profile which leads to an error as discussed in https://github.com/apache/spark/pull/16014#issuecomment-265843991 Author: Shivaram Venkataraman Closes #16218 from shivaram/fix-sparkr-release-build. (cherry picked from commit 202fcd21ce01393fa6dfaa1c2126e18e9b85ee96) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e43209fe Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e43209fe Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e43209fe Branch: refs/heads/branch-2.1 Commit: e43209fe2a69fb239dff8bc1a18297d3696f0dcd Parents: 9483242 Author: Shivaram Venkataraman Authored: Thu Dec 8 13:01:46 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 13:01:54 2016 -0800 -- dev/create-release/release-build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e43209fe/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 8863ee6..1b05b20 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -238,10 +238,10 @@ if [[ "$1" == "package" ]]; then FLAGS="-Psparkr -Phive -Phive-thriftserver -Pyarn -Pmesos" make_binary_release "hadoop2.3" "-Phadoop-2.3 $FLAGS" "3033" & make_binary_release "hadoop2.4" "-Phadoop-2.4 $FLAGS" "3034" & - make_binary_release "hadoop2.6" "-Phadoop-2.6 $FLAGS" "3035" & + make_binary_release "hadoop2.6" "-Phadoop-2.6 $FLAGS" "3035" "withr" & make_binary_release "hadoop2.7" "-Phadoop-2.7 $FLAGS" "3036" "withpip" & make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn -Pmesos" "3037" & - make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn -Pmesos" "3038" "withr" & + make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn -Pmesos" "3038" & wait rm -rf spark-$SPARK_VERSION-bin-*/
spark git commit: [SPARK-18590][SPARKR] Change the R source build to Hadoop 2.6
Repository: spark Updated Branches: refs/heads/master 3261e25da -> 202fcd21c [SPARK-18590][SPARKR] Change the R source build to Hadoop 2.6 This PR changes the SparkR source release tarball to be built using the Hadoop 2.6 profile. Previously it used the without-hadoop profile, which led to an error, as discussed in https://github.com/apache/spark/pull/16014#issuecomment-265843991 Author: Shivaram Venkataraman Closes #16218 from shivaram/fix-sparkr-release-build. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/202fcd21 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/202fcd21 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/202fcd21 Branch: refs/heads/master Commit: 202fcd21ce01393fa6dfaa1c2126e18e9b85ee96 Parents: 3261e25 Author: Shivaram Venkataraman Authored: Thu Dec 8 13:01:46 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 13:01:46 2016 -0800 -- dev/create-release/release-build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/202fcd21/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 8863ee6..1b05b20 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -238,10 +238,10 @@ if [[ "$1" == "package" ]]; then FLAGS="-Psparkr -Phive -Phive-thriftserver -Pyarn -Pmesos" make_binary_release "hadoop2.3" "-Phadoop-2.3 $FLAGS" "3033" & make_binary_release "hadoop2.4" "-Phadoop-2.4 $FLAGS" "3034" & - make_binary_release "hadoop2.6" "-Phadoop-2.6 $FLAGS" "3035" & + make_binary_release "hadoop2.6" "-Phadoop-2.6 $FLAGS" "3035" "withr" & make_binary_release "hadoop2.7" "-Phadoop-2.7 $FLAGS" "3036" "withpip" & make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn -Pmesos" "3037" & - make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn -Pmesos" "3038" "withr" & + make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn -Pmesos" "3038" & wait rm -rf spark-$SPARK_VERSION-bin-*/
spark git commit: [SPARKR][PYSPARK] Fix R source package name to match Spark version. Remove pip tar.gz from distribution
Repository: spark Updated Branches: refs/heads/master 458fa3325 -> 4ac8b20bf [SPARKR][PYSPARK] Fix R source package name to match Spark version. Remove pip tar.gz from distribution ## What changes were proposed in this pull request? Fixes name of R source package so that the `cp` in release-build.sh works correctly. Issue discussed in https://github.com/apache/spark/pull/16014#issuecomment-265867125 Author: Shivaram Venkataraman Closes #16221 from shivaram/fix-sparkr-release-build-name. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ac8b20b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ac8b20b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ac8b20b Branch: refs/heads/master Commit: 4ac8b20bf2f962d9b8b6b209468896758d49efe3 Parents: 458fa33 Author: Shivaram Venkataraman Authored: Thu Dec 8 18:26:54 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 18:26:54 2016 -0800 -- dev/make-distribution.sh | 9 + 1 file changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ac8b20b/dev/make-distribution.sh -- diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index fe281bb..4da7d57 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -222,11 +222,14 @@ fi # Make R package - this is used for both CRAN release and packing R layout into distribution if [ "$MAKE_R" == "true" ]; then echo "Building R source package" + R_PACKAGE_VERSION=`grep Version $SPARK_HOME/R/pkg/DESCRIPTION | awk '{print $NF}'` pushd "$SPARK_HOME/R" > /dev/null # Build source package and run full checks # Install source package to get it to generate vignettes, etc. # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME NO_TESTS=1 CLEAN_INSTALL=1 "$SPARK_HOME/"R/check-cran.sh + # Make a copy of R source package matching the Spark release version. + cp $SPARK_HOME/R/SparkR_"$R_PACKAGE_VERSION".tar.gz $SPARK_HOME/R/SparkR_"$VERSION".tar.gz popd > /dev/null else echo "Skipping building R source package" @@ -238,6 +241,12 @@ cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf cp "$SPARK_HOME/README.md" "$DISTDIR" cp -r "$SPARK_HOME/bin" "$DISTDIR" cp -r "$SPARK_HOME/python" "$DISTDIR" + +# Remove the python distribution from dist/ if we built it +if [ "$MAKE_PIP" == "true" ]; then + rm -f $DISTDIR/python/dist/pyspark-*.tar.gz +fi + cp -r "$SPARK_HOME/sbin" "$DISTDIR" # Copy SparkR if it exists if [ -d "$SPARK_HOME"/R/lib/SparkR ]; then
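The copied tarball is needed because R package versions cannot carry suffixes like "-SNAPSHOT", so the name R produces (from the Version field in DESCRIPTION) may not match Spark's own version string. A self-contained illustration of the extraction above, with made-up values:

```bash
# Illustrative only: reproduce the R_PACKAGE_VERSION extraction from a DESCRIPTION file.
cat > DESCRIPTION <<'EOF'
Package: SparkR
Version: 2.1.1
EOF
touch SparkR_2.1.1.tar.gz             # stand-in for the built source package

R_PACKAGE_VERSION=$(grep Version DESCRIPTION | awk '{print $NF}')
echo "$R_PACKAGE_VERSION"             # -> 2.1.1

VERSION=2.1.1-SNAPSHOT                # hypothetical Spark version string
cp "SparkR_${R_PACKAGE_VERSION}.tar.gz" "SparkR_${VERSION}.tar.gz"
ls SparkR_*                           # both file names now exist
```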
spark git commit: [SPARKR][PYSPARK] Fix R source package name to match Spark version. Remove pip tar.gz from distribution
Repository: spark Updated Branches: refs/heads/branch-2.1 1cafc76ea -> ef5646b4c [SPARKR][PYSPARK] Fix R source package name to match Spark version. Remove pip tar.gz from distribution ## What changes were proposed in this pull request? Fixes name of R source package so that the `cp` in release-build.sh works correctly. Issue discussed in https://github.com/apache/spark/pull/16014#issuecomment-265867125 Author: Shivaram Venkataraman Closes #16221 from shivaram/fix-sparkr-release-build-name. (cherry picked from commit 4ac8b20bf2f962d9b8b6b209468896758d49efe3) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ef5646b4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ef5646b4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ef5646b4 Branch: refs/heads/branch-2.1 Commit: ef5646b4c6792a96e85d1dd4bb3103ba8306949b Parents: 1cafc76 Author: Shivaram Venkataraman Authored: Thu Dec 8 18:26:54 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 18:27:05 2016 -0800 -- dev/make-distribution.sh | 9 + 1 file changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ef5646b4/dev/make-distribution.sh -- diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index fe281bb..4da7d57 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -222,11 +222,14 @@ fi # Make R package - this is used for both CRAN release and packing R layout into distribution if [ "$MAKE_R" == "true" ]; then echo "Building R source package" + R_PACKAGE_VERSION=`grep Version $SPARK_HOME/R/pkg/DESCRIPTION | awk '{print $NF}'` pushd "$SPARK_HOME/R" > /dev/null # Build source package and run full checks # Install source package to get it to generate vignettes, etc. # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME NO_TESTS=1 CLEAN_INSTALL=1 "$SPARK_HOME/"R/check-cran.sh + # Make a copy of R source package matching the Spark release version. + cp $SPARK_HOME/R/SparkR_"$R_PACKAGE_VERSION".tar.gz $SPARK_HOME/R/SparkR_"$VERSION".tar.gz popd > /dev/null else echo "Skipping building R source package" @@ -238,6 +241,12 @@ cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf cp "$SPARK_HOME/README.md" "$DISTDIR" cp -r "$SPARK_HOME/bin" "$DISTDIR" cp -r "$SPARK_HOME/python" "$DISTDIR" + +# Remove the python distribution from dist/ if we built it +if [ "$MAKE_PIP" == "true" ]; then + rm -f $DISTDIR/python/dist/pyspark-*.tar.gz +fi + cp -r "$SPARK_HOME/sbin" "$DISTDIR" # Copy SparkR if it exists if [ -d "$SPARK_HOME"/R/lib/SparkR ]; then
spark git commit: Copy the SparkR source package with LFTP
Repository: spark Updated Branches: refs/heads/master 9338aa4f8 -> 934035ae7 Copy the SparkR source package with LFTP This PR adds a line in release-build.sh to copy the SparkR source archive using LFTP Author: Shivaram Venkataraman Closes #16226 from shivaram/fix-sparkr-copy-build. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/934035ae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/934035ae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/934035ae Branch: refs/heads/master Commit: 934035ae7cb648fe61665d8efe0b7aa2bbe4ca47 Parents: 9338aa4 Author: Shivaram Venkataraman Authored: Thu Dec 8 22:21:24 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 22:21:24 2016 -0800 -- dev/create-release/release-build.sh | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/934035ae/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 1b05b20..7c77791 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -258,6 +258,7 @@ if [[ "$1" == "package" ]]; then LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' LFTP mput -O $dest_dir 'pyspark-*' + LFTP mput -O $dest_dir 'SparkR-*' exit 0 fi
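A note on the LFTP calls in this hunk: LFTP appears to be a helper defined earlier in release-build.sh that runs a single lftp command against the release host. A rough stand-in under that assumption (host name and directory illustrative; the real definition may differ):

```bash
# Assumed shape of the LFTP helper; not the actual release-build.sh definition.
REMOTE_HOST=people.apache.org   # illustrative
LFTP() {
  lftp --norc -c "open sftp://$REMOTE_HOST; $*; exit"
}

# With such a helper, the added line uploads the SparkR source archive:
dest_dir=/path/on/remote        # illustrative
LFTP mput -O "$dest_dir" 'SparkR-*'
```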
spark git commit: Copy the SparkR source package with LFTP
Repository: spark Updated Branches: refs/heads/branch-2.1 4ceed95b4 -> e8f351f9a Copy the SparkR source package with LFTP This PR adds a line in release-build.sh to copy the SparkR source archive using LFTP Author: Shivaram Venkataraman Closes #16226 from shivaram/fix-sparkr-copy-build. (cherry picked from commit 934035ae7cb648fe61665d8efe0b7aa2bbe4ca47) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8f351f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8f351f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8f351f9 Branch: refs/heads/branch-2.1 Commit: e8f351f9a670fc4d43f15c8d7cd57e49fb9ceba2 Parents: 4ceed95b Author: Shivaram Venkataraman Authored: Thu Dec 8 22:21:24 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 22:21:36 2016 -0800 -- dev/create-release/release-build.sh | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8f351f9/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 1b05b20..7c77791 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -258,6 +258,7 @@ if [[ "$1" == "package" ]]; then LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' LFTP mput -O $dest_dir 'pyspark-*' + LFTP mput -O $dest_dir 'SparkR-*' exit 0 fi
spark git commit: Copy pyspark and SparkR packages to latest release dir too
Repository: spark Updated Branches: refs/heads/master 934035ae7 -> c074c96dc Copy pyspark and SparkR packages to latest release dir too ## What changes were proposed in this pull request? Copy pyspark and SparkR packages to latest release dir, as per comment [here](https://github.com/apache/spark/pull/16226#discussion_r91664822) Author: Felix Cheung Closes #16227 from felixcheung/pyrftp. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c074c96d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c074c96d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c074c96d Branch: refs/heads/master Commit: c074c96dc57bf18b28fafdcac0c768d75c642cba Parents: 934035a Author: Felix Cheung Authored: Thu Dec 8 22:52:34 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 22:52:34 2016 -0800 -- dev/create-release/release-build.sh | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c074c96d/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 7c77791..c0663b8 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -251,6 +251,8 @@ if [[ "$1" == "package" ]]; then # Put to new directory: LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' + LFTP mput -O $dest_dir 'pyspark-*' + LFTP mput -O $dest_dir 'SparkR-*' # Delete /latest directory and rename new upload to /latest LFTP "rm -r -f $REMOTE_PARENT_DIR/latest || exit 0" LFTP mv $dest_dir "$REMOTE_PARENT_DIR/latest"
spark git commit: Copy pyspark and SparkR packages to latest release dir too
Repository: spark Updated Branches: refs/heads/branch-2.1 e8f351f9a -> 2c88e1dc3 Copy pyspark and SparkR packages to latest release dir too ## What changes were proposed in this pull request? Copy pyspark and SparkR packages to latest release dir, as per comment [here](https://github.com/apache/spark/pull/16226#discussion_r91664822) Author: Felix Cheung Closes #16227 from felixcheung/pyrftp. (cherry picked from commit c074c96dc57bf18b28fafdcac0c768d75c642cba) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c88e1dc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c88e1dc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c88e1dc Branch: refs/heads/branch-2.1 Commit: 2c88e1dc31e1b90605ad8ab85b20b131b4b3c722 Parents: e8f351f Author: Felix Cheung Authored: Thu Dec 8 22:52:34 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 8 22:53:02 2016 -0800 -- dev/create-release/release-build.sh | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2c88e1dc/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 7c77791..c0663b8 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -251,6 +251,8 @@ if [[ "$1" == "package" ]]; then # Put to new directory: LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' + LFTP mput -O $dest_dir 'pyspark-*' + LFTP mput -O $dest_dir 'SparkR-*' # Delete /latest directory and rename new upload to /latest LFTP "rm -r -f $REMOTE_PARENT_DIR/latest || exit 0" LFTP mv $dest_dir "$REMOTE_PARENT_DIR/latest"
spark git commit: [MINOR][SPARKR] Fix SparkR regex in copy command
Repository: spark Updated Branches: refs/heads/branch-2.1 0c6415aec -> eb2d9bfd4 [MINOR][SPARKR] Fix SparkR regex in copy command Fix SparkR package copy regex. The existing code leads to ``` Copying release tarballs to /home//public_html/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-SNAPSHOT-2016_12_08_22_38-e8f351f-bin mput: SparkR-*: no files found ``` Author: Shivaram Venkataraman Closes #16231 from shivaram/typo-sparkr-build. (cherry picked from commit be5fc6ef72c7eb586b184b0f42ac50ef32843208) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb2d9bfd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb2d9bfd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb2d9bfd Branch: refs/heads/branch-2.1 Commit: eb2d9bfd4e100789604ca0810929b42694ea7377 Parents: 0c6415a Author: Shivaram Venkataraman Authored: Fri Dec 9 10:12:56 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 9 10:13:05 2016 -0800 -- dev/create-release/release-build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eb2d9bfd/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c0663b8..b08577c 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -252,7 +252,7 @@ if [[ "$1" == "package" ]]; then LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' LFTP mput -O $dest_dir 'pyspark-*' - LFTP mput -O $dest_dir 'SparkR-*' + LFTP mput -O $dest_dir 'SparkR_*' # Delete /latest directory and rename new upload to /latest LFTP "rm -r -f $REMOTE_PARENT_DIR/latest || exit 0" LFTP mv $dest_dir "$REMOTE_PARENT_DIR/latest" @@ -260,7 +260,7 @@ if [[ "$1" == "package" ]]; then LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' LFTP mput -O $dest_dir 'pyspark-*' - LFTP mput -O $dest_dir 'SparkR-*' + LFTP mput -O $dest_dir 'SparkR_*' exit 0 fi
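The underscore matters because the SparkR archive is named `SparkR_<version>.tar.gz` (the convention `R CMD build` uses and the earlier make-distribution.sh `cp` keeps), so only a `SparkR_*` glob can match it. A quick illustration with a made-up file name:

```bash
# The SparkR tarball is named with an underscore, e.g. SparkR_<version>.tar.gz.
touch SparkR_2.1.1-SNAPSHOT.tar.gz   # stand-in for the built archive

ls SparkR-*   # no match - fails, much like the "no files found" log above
ls SparkR_*   # matches SparkR_2.1.1-SNAPSHOT.tar.gz
```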
spark git commit: [MINOR][SPARKR] Fix SparkR regex in copy command
Repository: spark Updated Branches: refs/heads/master fd48d80a6 -> be5fc6ef7 [MINOR][SPARKR] Fix SparkR regex in copy command Fix SparkR package copy regex. The existing code leads to ``` Copying release tarballs to /home//public_html/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-SNAPSHOT-2016_12_08_22_38-e8f351f-bin mput: SparkR-*: no files found ``` Author: Shivaram Venkataraman Closes #16231 from shivaram/typo-sparkr-build. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be5fc6ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be5fc6ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be5fc6ef Branch: refs/heads/master Commit: be5fc6ef72c7eb586b184b0f42ac50ef32843208 Parents: fd48d80 Author: Shivaram Venkataraman Authored: Fri Dec 9 10:12:56 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 9 10:12:56 2016 -0800 -- dev/create-release/release-build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be5fc6ef/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c0663b8..b08577c 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -252,7 +252,7 @@ if [[ "$1" == "package" ]]; then LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' LFTP mput -O $dest_dir 'pyspark-*' - LFTP mput -O $dest_dir 'SparkR-*' + LFTP mput -O $dest_dir 'SparkR_*' # Delete /latest directory and rename new upload to /latest LFTP "rm -r -f $REMOTE_PARENT_DIR/latest || exit 0" LFTP mv $dest_dir "$REMOTE_PARENT_DIR/latest" @@ -260,7 +260,7 @@ if [[ "$1" == "package" ]]; then LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' LFTP mput -O $dest_dir 'pyspark-*' - LFTP mput -O $dest_dir 'SparkR-*' + LFTP mput -O $dest_dir 'SparkR_*' exit 0 fi
spark git commit: [SPARK-18807][SPARKR] Should suppress output print for calls to JVM methods with void return values
Repository: spark Updated Branches: refs/heads/master d2493a203 -> 3e11d5bfe [SPARK-18807][SPARKR] Should suppress output print for calls to JVM methods with void return values ## What changes were proposed in this pull request? Several SparkR APIs that call into JVM methods with void return values get their results printed out, especially when running in a REPL or IDE. example: ``` > setLogLevel("WARN") NULL ``` We should fix this to make the result clearer. Also found a small change to the return value of dropTempView in 2.1 - adding doc and a test for it. ## How was this patch tested? manually - I didn't find an expect_*() method in testthat for this Author: Felix Cheung Closes #16237 from felixcheung/rinvis. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e11d5bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e11d5bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e11d5bf Branch: refs/heads/master Commit: 3e11d5bfef2f05bd6d42c4d6188eae6d63c963ef Parents: d2493a2 Author: Felix Cheung Authored: Fri Dec 9 19:06:05 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 9 19:06:05 2016 -0800 -- R/pkg/R/SQLContext.R | 7 --- R/pkg/R/context.R | 6 +++--- R/pkg/R/sparkR.R | 6 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 +++--- 4 files changed, 17 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e11d5bf/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 38d83c6..6f48cd6 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -634,7 +634,7 @@ tableNames <- function(x, ...) { cacheTable.default <- function(tableName) { sparkSession <- getSparkSession() catalog <- callJMethod(sparkSession, "catalog") - callJMethod(catalog, "cacheTable", tableName) + invisible(callJMethod(catalog, "cacheTable", tableName)) } cacheTable <- function(x, ...) { @@ -663,7 +663,7 @@ cacheTable <- function(x, ...) { uncacheTable.default <- function(tableName) { sparkSession <- getSparkSession() catalog <- callJMethod(sparkSession, "catalog") - callJMethod(catalog, "uncacheTable", tableName) + invisible(callJMethod(catalog, "uncacheTable", tableName)) } uncacheTable <- function(x, ...) { @@ -686,7 +686,7 @@ uncacheTable <- function(x, ...) { clearCache.default <- function() { sparkSession <- getSparkSession() catalog <- callJMethod(sparkSession, "catalog") - callJMethod(catalog, "clearCache") + invisible(callJMethod(catalog, "clearCache")) } clearCache <- function() { @@ -730,6 +730,7 @@ dropTempTable <- function(x, ...) { #' If the view has been cached before, then it will also be uncached. #' #' @param viewName the name of the view to be dropped. +#' @return TRUE if the view is dropped successfully, FALSE otherwise. #' @rdname dropTempView #' @name dropTempView #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/3e11d5bf/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 438d77a..1138caf 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -87,8 +87,8 @@ objectFile <- function(sc, path, minPartitions = NULL) { #' in the list are split into \code{numSlices} slices and distributed to nodes #' in the cluster. #' -#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function -#' will write it to disk and send the file name to JVM.
Also to make sure each slice is not +#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function +#' will write it to disk and send the file name to JVM. Also to make sure each slice is not #' larger than that limit, number of slices may be increased. #' #' @param sc SparkContext to use @@ -379,5 +379,5 @@ spark.lapply <- function(list, func) { #' @note setLogLevel since 2.0.0 setLogLevel <- function(level) { sc <- getSparkContext() - callJMethod(sc, "setLogLevel", level) + invisible(callJMethod(sc, "setLogLevel", level)) } http://git-wip-us.apache.org/repos/asf/spark/blob/3e11d5bf/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 43bff97..c57cc8f 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -
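The effect of the invisible() wrapping in the hunks above can be reproduced from a shell prompt; a minimal sketch (the function names are made up, not SparkR's):

```bash
# Without invisible(), a NULL return value is auto-printed at top level:
Rscript -e 'f <- function() NULL; f()'
# prints: NULL

# Wrapping the call in invisible() suppresses the auto-print:
Rscript -e 'g <- function() invisible(NULL); g()'
# prints nothing
```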
spark git commit: [SPARK-18807][SPARKR] Should suppress output print for calls to JVM methods with void return values
Repository: spark Updated Branches: refs/heads/branch-2.1 e45345d91 -> 8bf56cc46 [SPARK-18807][SPARKR] Should suppress output print for calls to JVM methods with void return values ## What changes were proposed in this pull request? Several SparkR APIs that call into JVM methods with void return values get their results printed out, especially when running in a REPL or IDE. example: ``` > setLogLevel("WARN") NULL ``` We should fix this to make the result clearer. Also found a small change to the return value of dropTempView in 2.1 - adding doc and a test for it. ## How was this patch tested? manually - I didn't find an expect_*() method in testthat for this Author: Felix Cheung Closes #16237 from felixcheung/rinvis. (cherry picked from commit 3e11d5bfef2f05bd6d42c4d6188eae6d63c963ef) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8bf56cc4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8bf56cc4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8bf56cc4 Branch: refs/heads/branch-2.1 Commit: 8bf56cc46b96874565ebd8109f62e69e6c0cf151 Parents: e45345d Author: Felix Cheung Authored: Fri Dec 9 19:06:05 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 9 19:06:28 2016 -0800 -- R/pkg/R/SQLContext.R | 7 --- R/pkg/R/context.R | 6 +++--- R/pkg/R/sparkR.R | 6 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 +++--- 4 files changed, 17 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8bf56cc4/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 38d83c6..6f48cd6 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -634,7 +634,7 @@ tableNames <- function(x, ...) { cacheTable.default <- function(tableName) { sparkSession <- getSparkSession() catalog <- callJMethod(sparkSession, "catalog") - callJMethod(catalog, "cacheTable", tableName) + invisible(callJMethod(catalog, "cacheTable", tableName)) } cacheTable <- function(x, ...) { @@ -663,7 +663,7 @@ cacheTable <- function(x, ...) { uncacheTable.default <- function(tableName) { sparkSession <- getSparkSession() catalog <- callJMethod(sparkSession, "catalog") - callJMethod(catalog, "uncacheTable", tableName) + invisible(callJMethod(catalog, "uncacheTable", tableName)) } uncacheTable <- function(x, ...) { @@ -686,7 +686,7 @@ uncacheTable <- function(x, ...) { clearCache.default <- function() { sparkSession <- getSparkSession() catalog <- callJMethod(sparkSession, "catalog") - callJMethod(catalog, "clearCache") + invisible(callJMethod(catalog, "clearCache")) } clearCache <- function() { @@ -730,6 +730,7 @@ dropTempTable <- function(x, ...) { #' If the view has been cached before, then it will also be uncached. #' #' @param viewName the name of the view to be dropped. +#' @return TRUE if the view is dropped successfully, FALSE otherwise. #' @rdname dropTempView #' @name dropTempView #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/8bf56cc4/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 438d77a..1138caf 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -87,8 +87,8 @@ objectFile <- function(sc, path, minPartitions = NULL) { #' in the list are split into \code{numSlices} slices and distributed to nodes #' in the cluster. #' -#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function -#' will write it to disk and send the file name to JVM.
Also to make sure each slice is not +#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function +#' will write it to disk and send the file name to JVM. Also to make sure each slice is not #' larger than that limit, number of slices may be increased. #' #' @param sc SparkContext to use @@ -379,5 +379,5 @@ spark.lapply <- function(list, func) { #' @note setLogLevel since 2.0.0 setLogLevel <- function(level) { sc <- getSparkContext() - callJMethod(sc, "setLogLevel", level) + invisible(callJMethod(sc, "setLogLevel", level)) } http://git-wip-us.apache.org/repos/asf/spark/blob/8bf56cc4/R/pkg/R/sparkR.R -- diff
spark git commit: [SPARK-18810][SPARKR] SparkR install.spark does not work for RCs, snapshots
Repository: spark Updated Branches: refs/heads/branch-2.1 523071f3f -> 1aeb7f427 [SPARK-18810][SPARKR] SparkR install.spark does not work for RCs, snapshots ## What changes were proposed in this pull request? Support overriding the download url (including the version directory) in an environment variable, `SPARKR_RELEASE_DOWNLOAD_URL` ## How was this patch tested? unit tests; manual testing - snapshot build url - download when spark jar not cached - when spark jar is cached - RC build url - download when spark jar not cached - when spark jar is cached - multiple cached spark versions - starting with sparkR shell To use this, ``` SPARKR_RELEASE_DOWNLOAD_URL=http://this_is_the_url_to_spark_release_tgz R ``` then in R, ``` library(SparkR) # or specify lib.loc sparkR.session() ``` Author: Felix Cheung Closes #16248 from felixcheung/rinstallurl. (cherry picked from commit 8a51cfdcad5f8397558ed2e245eb03650f37ce66) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1aeb7f42 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1aeb7f42 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1aeb7f42 Branch: refs/heads/branch-2.1 Commit: 1aeb7f427d31bfd44f7abb7c56dd7661be8bbaa6 Parents: 523071f Author: Felix Cheung Authored: Mon Dec 12 14:40:41 2016 -0800 Committer: Shivaram Venkataraman Committed: Mon Dec 12 14:40:52 2016 -0800 -- R/pkg/R/install.R | 38 - R/pkg/R/utils.R| 14 ++- R/pkg/inst/tests/testthat/test_utils.R | 11 + 3 files changed, 51 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1aeb7f42/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 69b0a52..097b7ad 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -79,19 +79,28 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, dir.create(localDir, recursive = TRUE) } - packageLocalDir <- file.path(localDir, packageName) - if (overwrite) { message(paste0("Overwrite = TRUE: download and overwrite the tar file", "and Spark package directory if they exist.")) } + releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL") + if (releaseUrl != "") { +packageName <- basenameSansExtFromUrl(releaseUrl) + } + + packageLocalDir <- file.path(localDir, packageName) + # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { -fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s" -msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageLocalDir) -message(msg) +if (releaseUrl != "") { + message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir)) +} else { + fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s" + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), + packageLocalDir) + message(msg) +} Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) } else { @@ -104,7 +113,12 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found.") } else { -robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) +if (releaseUrl != "") { + message("Downloading from alternate URL:\n- ", releaseUrl) + downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", releaseUrl)) +} else { + robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) +} }
message(sprintf("Installing to %s", localDir)) @@ -182,16 +196,18 @@ getPreferredMirror <- function(version, packageName) { } directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { - packageRemotePath <- paste0( -file.path(mirrorUrl, version, packageName), ".tgz") + packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") fmt <- "Downloading %s for Hadoop %s from:\n- %s" msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageRemotePath)
spark git commit: [SPARK-18810][SPARKR] SparkR install.spark does not work for RCs, snapshots
Repository: spark Updated Branches: refs/heads/master 90abfd15f -> 8a51cfdca [SPARK-18810][SPARKR] SparkR install.spark does not work for RCs, snapshots ## What changes were proposed in this pull request? Support overriding the download url (including the version directory) in an environment variable, `SPARKR_RELEASE_DOWNLOAD_URL` ## How was this patch tested? unit tests; manual testing - snapshot build url - download when spark jar not cached - when spark jar is cached - RC build url - download when spark jar not cached - when spark jar is cached - multiple cached spark versions - starting with sparkR shell To use this, ``` SPARKR_RELEASE_DOWNLOAD_URL=http://this_is_the_url_to_spark_release_tgz R ``` then in R, ``` library(SparkR) # or specify lib.loc sparkR.session() ``` Author: Felix Cheung Closes #16248 from felixcheung/rinstallurl. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a51cfdc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a51cfdc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a51cfdc Branch: refs/heads/master Commit: 8a51cfdcad5f8397558ed2e245eb03650f37ce66 Parents: 90abfd1 Author: Felix Cheung Authored: Mon Dec 12 14:40:41 2016 -0800 Committer: Shivaram Venkataraman Committed: Mon Dec 12 14:40:41 2016 -0800 -- R/pkg/R/install.R | 38 - R/pkg/R/utils.R| 14 ++- R/pkg/inst/tests/testthat/test_utils.R | 11 + 3 files changed, 51 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a51cfdc/R/pkg/R/install.R -- diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 69b0a52..097b7ad 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -79,19 +79,28 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, dir.create(localDir, recursive = TRUE) } - packageLocalDir <- file.path(localDir, packageName) - if (overwrite) { message(paste0("Overwrite = TRUE: download and overwrite the tar file", "and Spark package directory if they exist.")) } + releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL") + if (releaseUrl != "") { +packageName <- basenameSansExtFromUrl(releaseUrl) + } + + packageLocalDir <- file.path(localDir, packageName) + # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { -fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s" -msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageLocalDir) -message(msg) +if (releaseUrl != "") { + message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir)) +} else { + fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s" + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), + packageLocalDir) + message(msg) +} Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) } else { @@ -104,7 +113,12 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found.") } else { -robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) +if (releaseUrl != "") { + message("Downloading from alternate URL:\n- ", releaseUrl) + downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", releaseUrl)) +} else { + robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) +} } message(sprintf("Installing to %s", localDir)) @@ -182,16 +196,18 @@ getPreferredMirror <- function(version, packageName) { }
directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { - packageRemotePath <- paste0( -file.path(mirrorUrl, version, packageName), ".tgz") + packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") fmt <- "Downloading %s for Hadoop %s from:\n- %s" msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageRemotePath) message(msg) + downloadUrl(packageRemotePath, packageLocalPath, paste0("Fetch failed from ", mirrorUrl)) +}
spark git commit: [SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCRIPTION` file
Repository: spark Updated Branches: refs/heads/branch-2.0 669815d44 -> d36ed9e1d [SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCRIPTION` file ## What changes were proposed in this pull request? Since Apache Spark 1.4.0, the R API documentation page has had a broken link to the `DESCRIPTION` file because the Jekyll plugin script doesn't copy the file. This PR aims to fix that. - Official Latest Website: http://spark.apache.org/docs/latest/api/R/index.html - Apache Spark 2.1.0-rc2: http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc2-docs/api/R/index.html ## How was this patch tested? Manual. ```bash cd docs SKIP_SCALADOC=1 jekyll build ``` Author: Dongjoon Hyun Closes #16292 from dongjoon-hyun/SPARK-18875. (cherry picked from commit ec0eae486331c3977505d261676b77a33c334216) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d36ed9e1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d36ed9e1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d36ed9e1 Branch: refs/heads/branch-2.0 Commit: d36ed9e1db363541f9ec4c22d843ae5734805a90 Parents: 669815d Author: Dongjoon Hyun Authored: Wed Dec 14 21:29:20 2016 -0800 Committer: Shivaram Venkataraman Committed: Wed Dec 14 21:29:43 2016 -0800 -- docs/_plugins/copy_api_dirs.rb | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d36ed9e1/docs/_plugins/copy_api_dirs.rb -- diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index f926d67..71e6432 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -142,4 +142,7 @@ if not (ENV['SKIP_API'] == '1') puts "cp -r R/pkg/html/. docs/api/R" cp_r("R/pkg/html/.", "docs/api/R") + puts "cp R/pkg/DESCRIPTION docs/api" + cp("R/pkg/DESCRIPTION", "docs/api") + end
spark git commit: [SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCRIPTION` file
Repository: spark Updated Branches: refs/heads/master 5d510c693 -> ec0eae486 [SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCRIPTION` file ## What changes were proposed in this pull request? Since Apache Spark 1.4.0, the R API documentation page has had a broken link to the `DESCRIPTION` file because the Jekyll plugin script doesn't copy the file. This PR aims to fix that. - Official Latest Website: http://spark.apache.org/docs/latest/api/R/index.html - Apache Spark 2.1.0-rc2: http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc2-docs/api/R/index.html ## How was this patch tested? Manual. ```bash cd docs SKIP_SCALADOC=1 jekyll build ``` Author: Dongjoon Hyun Closes #16292 from dongjoon-hyun/SPARK-18875. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec0eae48 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec0eae48 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec0eae48 Branch: refs/heads/master Commit: ec0eae486331c3977505d261676b77a33c334216 Parents: 5d510c6 Author: Dongjoon Hyun Authored: Wed Dec 14 21:29:20 2016 -0800 Committer: Shivaram Venkataraman Committed: Wed Dec 14 21:29:20 2016 -0800 -- docs/_plugins/copy_api_dirs.rb | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec0eae48/docs/_plugins/copy_api_dirs.rb -- diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index f926d67..71e6432 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -142,4 +142,7 @@ if not (ENV['SKIP_API'] == '1') puts "cp -r R/pkg/html/. docs/api/R" cp_r("R/pkg/html/.", "docs/api/R") + puts "cp R/pkg/DESCRIPTION docs/api" + cp("R/pkg/DESCRIPTION", "docs/api") + end
spark git commit: [SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCRIPTION` file
Repository: spark Updated Branches: refs/heads/branch-2.1 b14fc3918 -> d399a297d [SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCRIPTION` file ## What changes were proposed in this pull request? Since Apache Spark 1.4.0, the R API documentation page has had a broken link to the `DESCRIPTION` file because the Jekyll plugin script doesn't copy the file. This PR aims to fix that. - Official Latest Website: http://spark.apache.org/docs/latest/api/R/index.html - Apache Spark 2.1.0-rc2: http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc2-docs/api/R/index.html ## How was this patch tested? Manual. ```bash cd docs SKIP_SCALADOC=1 jekyll build ``` Author: Dongjoon Hyun Closes #16292 from dongjoon-hyun/SPARK-18875. (cherry picked from commit ec0eae486331c3977505d261676b77a33c334216) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d399a297 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d399a297 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d399a297 Branch: refs/heads/branch-2.1 Commit: d399a297d1ec9e0a3c57658cba0320b4d7fe88c5 Parents: b14fc39 Author: Dongjoon Hyun Authored: Wed Dec 14 21:29:20 2016 -0800 Committer: Shivaram Venkataraman Committed: Wed Dec 14 21:29:30 2016 -0800 -- docs/_plugins/copy_api_dirs.rb | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d399a297/docs/_plugins/copy_api_dirs.rb -- diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index f926d67..71e6432 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -142,4 +142,7 @@ if not (ENV['SKIP_API'] == '1') puts "cp -r R/pkg/html/. docs/api/R" cp_r("R/pkg/html/.", "docs/api/R") + puts "cp R/pkg/DESCRIPTION docs/api" + cp("R/pkg/DESCRIPTION", "docs/api") + end
spark git commit: [SPARK-18849][ML][SPARKR][DOC] vignettes final check update
Repository: spark Updated Branches: refs/heads/master ec0eae486 -> 7d858bc5c [SPARK-18849][ML][SPARKR][DOC] vignettes final check update ## What changes were proposed in this pull request? doc cleanup ## How was this patch tested? ~~vignettes is not building for me. I'm going to kick off a full clean build and try again and attach output here for review.~~ Output html here: https://felixcheung.github.io/sparkr-vignettes.html Author: Felix Cheung Closes #16286 from felixcheung/rvignettespass. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d858bc5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d858bc5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d858bc5 Branch: refs/heads/master Commit: 7d858bc5ce870a28a559f4e81dcfc54cbd128cb7 Parents: ec0eae4 Author: Felix Cheung Authored: Wed Dec 14 21:51:52 2016 -0800 Committer: Shivaram Venkataraman Committed: Wed Dec 14 21:51:52 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 38 ++- 1 file changed, 12 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d858bc5/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 8f39922..fa2656c 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -447,33 +447,31 @@ head(teenagers) SparkR supports the following machine learning models and algorithms. -* Generalized Linear Model (GLM) +* Accelerated Failure Time (AFT) Survival Model -* Random Forest +* Collaborative Filtering with Alternating Least Squares (ALS) + +* Gaussian Mixture Model (GMM) + +* Generalized Linear Model (GLM) * Gradient-Boosted Trees (GBT) -* Naive Bayes Model +* Isotonic Regression Model * $k$-means Clustering -* Accelerated Failure Time (AFT) Survival Model - -* Gaussian Mixture Model (GMM) +* Kolmogorov-Smirnov Test * Latent Dirichlet Allocation (LDA) -* Multilayer Perceptron Model - -* Collaborative Filtering with Alternating Least Squares (ALS) - -* Isotonic Regression Model - * Logistic Regression Model -* Kolmogorov-Smirnov Test +* Multilayer Perceptron Model -More will be added in the future. +* Naive Bayes Model + +* Random Forest ### R Formula @@ -601,8 +599,6 @@ head(aftPredictions) Gaussian Mixture Model -(Added in 2.1.0) - `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. We use a simulated example to demostrate the usage. @@ -620,8 +616,6 @@ head(select(gmmFitted, "V1", "V2", "prediction")) Latent Dirichlet Allocation -(Added in 2.1.0) - `spark.lda` fits a [Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on a `SparkDataFrame`. It is often used in topic modeling in which topics are inferred from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: * Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. @@ -676,8 +670,6 @@ perplexity Multilayer Perceptron -(Added in 2.1.0) - Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). 
MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights $w$ and bias $b$ and applying an activation function. This can be written in matrix form for MLPC with $K+1$ layers as follows: $$ y(x)=f_K(\ldots f_2(w_2^T f_1(w_1^T x + b_1) + b_2) \ldots + b_K). @@ -726,8 +718,6 @@ head(select(predictions, predictions$prediction)) Collaborative Filtering -(Added in 2.1.0) - `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, `nonnegative`. For a complete list, refer to the help file. @@ -757,8 +747,6 @@ head(predicted) Isotonic Regression Model -(Added in 2.1.0) - `spark.isoreg
spark git commit: [SPARK-18849][ML][SPARKR][DOC] vignettes final check update
Repository: spark Updated Branches: refs/heads/branch-2.1 d399a297d -> 2a8de2e11 [SPARK-18849][ML][SPARKR][DOC] vignettes final check update ## What changes were proposed in this pull request? doc cleanup ## How was this patch tested? ~~vignettes is not building for me. I'm going to kick off a full clean build and try again and attach output here for review.~~ Output html here: https://felixcheung.github.io/sparkr-vignettes.html Author: Felix Cheung Closes #16286 from felixcheung/rvignettespass. (cherry picked from commit 7d858bc5ce870a28a559f4e81dcfc54cbd128cb7) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a8de2e1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a8de2e1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a8de2e1 Branch: refs/heads/branch-2.1 Commit: 2a8de2e11ebab0cb9056444053127619d8a47d8a Parents: d399a29 Author: Felix Cheung Authored: Wed Dec 14 21:51:52 2016 -0800 Committer: Shivaram Venkataraman Committed: Wed Dec 14 21:52:01 2016 -0800 -- R/pkg/vignettes/sparkr-vignettes.Rmd | 38 ++- 1 file changed, 12 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a8de2e1/R/pkg/vignettes/sparkr-vignettes.Rmd -- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 8f39922..fa2656c 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -447,33 +447,31 @@ head(teenagers) SparkR supports the following machine learning models and algorithms. -* Generalized Linear Model (GLM) +* Accelerated Failure Time (AFT) Survival Model -* Random Forest +* Collaborative Filtering with Alternating Least Squares (ALS) + +* Gaussian Mixture Model (GMM) + +* Generalized Linear Model (GLM) * Gradient-Boosted Trees (GBT) -* Naive Bayes Model +* Isotonic Regression Model * $k$-means Clustering -* Accelerated Failure Time (AFT) Survival Model - -* Gaussian Mixture Model (GMM) +* Kolmogorov-Smirnov Test * Latent Dirichlet Allocation (LDA) -* Multilayer Perceptron Model - -* Collaborative Filtering with Alternating Least Squares (ALS) - -* Isotonic Regression Model - * Logistic Regression Model -* Kolmogorov-Smirnov Test +* Multilayer Perceptron Model -More will be added in the future. +* Naive Bayes Model + +* Random Forest ### R Formula @@ -601,8 +599,6 @@ head(aftPredictions) Gaussian Mixture Model -(Added in 2.1.0) - `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. We use a simulated example to demostrate the usage. @@ -620,8 +616,6 @@ head(select(gmmFitted, "V1", "V2", "prediction")) Latent Dirichlet Allocation -(Added in 2.1.0) - `spark.lda` fits a [Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on a `SparkDataFrame`. It is often used in topic modeling in which topics are inferred from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: * Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. 
@@ -676,8 +670,6 @@ perplexity Multilayer Perceptron -(Added in 2.1.0) - Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights $w$ and bias $b$ and applying an activation function. This can be written in matrix form for MLPC with $K+1$ layers as follows: $$ y(x)=f_K(\ldots f_2(w_2^T f_1(w_1^T x + b_1) + b_2) \ldots + b_K). @@ -726,8 +718,6 @@ head(select(predictions, predictions$prediction)) Collaborative Filtering -(Added in 2.1.0) - `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, `nonnegative`. For a complete list, refer to the help fi
spark git commit: [MINOR] Handle fact that mv is different on linux, mac
Repository: spark Updated Branches: refs/heads/master 9634018c4 -> 5a44f18a2 [MINOR] Handle fact that mv is different on linux, mac Follow-up to https://github.com/apache/spark/commit/ae853e8f3bdbd16427e6f1ffade4f63abaf74abb as `mv` throws an error on the Jenkins machines if source and destination are the same. Author: Shivaram Venkataraman Closes #16302 from shivaram/sparkr-no-mv-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a44f18a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a44f18a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a44f18a Branch: refs/heads/master Commit: 5a44f18a2a114bdd37b6714d81f88cb68148f0c9 Parents: 9634018 Author: Shivaram Venkataraman Authored: Thu Dec 15 17:13:35 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 15 17:13:35 2016 -0800 -- dev/make-distribution.sh | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5a44f18a/dev/make-distribution.sh -- diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index da44748..6ea319e 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -228,8 +228,11 @@ if [ "$MAKE_R" == "true" ]; then # Install source package to get it to generate vignettes, etc. # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME NO_TESTS=1 CLEAN_INSTALL=1 "$SPARK_HOME/"R/check-cran.sh - # Move R source package to file name matching the Spark release version. - mv $SPARK_HOME/R/SparkR_"$R_PACKAGE_VERSION".tar.gz $SPARK_HOME/R/SparkR_"$VERSION".tar.gz + # Move R source package to match the Spark release version if the versions are not the same. + # NOTE(shivaram): `mv` throws an error on Linux if source and destination are same file + if [ "$R_PACKAGE_VERSION" != "$VERSION" ]; then +mv $SPARK_HOME/R/SparkR_"$R_PACKAGE_VERSION".tar.gz $SPARK_HOME/R/SparkR_"$VERSION".tar.gz + fi popd > /dev/null else echo "Skipping building R source package"
spark git commit: [MINOR] Handle fact that mv is different on linux, mac
Repository: spark Updated Branches: refs/heads/branch-2.1 62a6577bf -> b23220fa6 [MINOR] Handle fact that mv is different on linux, mac Follow-up to https://github.com/apache/spark/commit/ae853e8f3bdbd16427e6f1ffade4f63abaf74abb as `mv` throws an error on the Jenkins machines if source and destination are the same. Author: Shivaram Venkataraman Closes #16302 from shivaram/sparkr-no-mv-fix. (cherry picked from commit 5a44f18a2a114bdd37b6714d81f88cb68148f0c9) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b23220fa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b23220fa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b23220fa Branch: refs/heads/branch-2.1 Commit: b23220fa67dd279d0b8005cb66d0875adbd3c8cb Parents: 62a6577 Author: Shivaram Venkataraman Authored: Thu Dec 15 17:13:35 2016 -0800 Committer: Shivaram Venkataraman Committed: Thu Dec 15 17:13:43 2016 -0800 -- dev/make-distribution.sh | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b23220fa/dev/make-distribution.sh -- diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index da44748..6ea319e 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -228,8 +228,11 @@ if [ "$MAKE_R" == "true" ]; then # Install source package to get it to generate vignettes, etc. # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME NO_TESTS=1 CLEAN_INSTALL=1 "$SPARK_HOME/"R/check-cran.sh - # Move R source package to file name matching the Spark release version. - mv $SPARK_HOME/R/SparkR_"$R_PACKAGE_VERSION".tar.gz $SPARK_HOME/R/SparkR_"$VERSION".tar.gz + # Move R source package to match the Spark release version if the versions are not the same. + # NOTE(shivaram): `mv` throws an error on Linux if source and destination are same file + if [ "$R_PACKAGE_VERSION" != "$VERSION" ]; then +mv $SPARK_HOME/R/SparkR_"$R_PACKAGE_VERSION".tar.gz $SPARK_HOME/R/SparkR_"$VERSION".tar.gz + fi popd > /dev/null else echo "Skipping building R source package"
spark git commit: [SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table
Repository: spark Updated Branches: refs/heads/master ed84cd068 -> 1169db44b [SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table ## What changes were proposed in this pull request? The SparkR tests, `R/run-tests.sh`, succeed only once because `test_sparkSQL.R` does not clean up the test table, `people`. As a result, rows accumulate in the `people` table on every run and the test cases fail. The following is the failure result for the second run. ```r Failed - 1. Failure: create DataFrame from RDD (test_sparkSQL.R#204) --- collect(sql("SELECT age from people WHERE name = 'Bob'"))$age not equal to c(16). Lengths differ: 2 vs 1 2. Failure: create DataFrame from RDD (test_sparkSQL.R#206) --- collect(sql("SELECT height from people WHERE name ='Bob'"))$height not equal to c(176.5). Lengths differ: 2 vs 1 ``` ## How was this patch tested? Manual. Run `run-tests.sh` twice and check if it passes without failures. Author: Dongjoon Hyun Closes #16310 from dongjoon-hyun/SPARK-18897. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1169db44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1169db44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1169db44 Branch: refs/heads/master Commit: 1169db44bc1d51e68feb6ba2552520b2d660c2c0 Parents: ed84cd0 Author: Dongjoon Hyun Authored: Fri Dec 16 11:30:21 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 16 11:30:21 2016 -0800 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1169db44/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index e8ccff8..2e95737 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -205,6 +205,7 @@ test_that("create DataFrame from RDD", { c(16)) expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height, c(176.5)) + sql("DROP TABLE people") unsetHiveContext() })
spark git commit: [SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table
Repository: spark Updated Branches: refs/heads/branch-2.1 d8ef0be83 -> df589be54 [SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table ## What changes were proposed in this pull request? The SparkR tests, `R/run-tests.sh`, succeed only once because `test_sparkSQL.R` does not clean up the test table, `people`. As a result, rows accumulate in the `people` table on every run and the test cases fail. The following is the failure output for the second run. ```r Failed - 1. Failure: create DataFrame from RDD (test_sparkSQL.R#204) --- collect(sql("SELECT age from people WHERE name = 'Bob'"))$age not equal to c(16). Lengths differ: 2 vs 1 2. Failure: create DataFrame from RDD (test_sparkSQL.R#206) --- collect(sql("SELECT height from people WHERE name ='Bob'"))$height not equal to c(176.5). Lengths differ: 2 vs 1 ``` ## How was this patch tested? Manual. Run `run-tests.sh` twice and check that it passes without failures. Author: Dongjoon Hyun Closes #16310 from dongjoon-hyun/SPARK-18897. (cherry picked from commit 1169db44bc1d51e68feb6ba2552520b2d660c2c0) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df589be5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df589be5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df589be5 Branch: refs/heads/branch-2.1 Commit: df589be5443980f344d50afc8068f57ae18995de Parents: d8ef0be Author: Dongjoon Hyun Authored: Fri Dec 16 11:30:21 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 16 11:30:34 2016 -0800 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/df589be5/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index e8ccff8..2e95737 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -205,6 +205,7 @@ test_that("create DataFrame from RDD", { c(16)) expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height, c(176.5)) + sql("DROP TABLE people") unsetHiveContext() })
spark git commit: [SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table
Repository: spark Updated Branches: refs/heads/branch-2.0 d36ed9e1d -> 1935bf446 [SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table ## What changes were proposed in this pull request? The SparkR tests, `R/run-tests.sh`, succeed only once because `test_sparkSQL.R` does not clean up the test table, `people`. As a result, rows accumulate in the `people` table on every run and the test cases fail. The following is the failure output for the second run. ```r Failed - 1. Failure: create DataFrame from RDD (test_sparkSQL.R#204) --- collect(sql("SELECT age from people WHERE name = 'Bob'"))$age not equal to c(16). Lengths differ: 2 vs 1 2. Failure: create DataFrame from RDD (test_sparkSQL.R#206) --- collect(sql("SELECT height from people WHERE name ='Bob'"))$height not equal to c(176.5). Lengths differ: 2 vs 1 ``` ## How was this patch tested? Manual. Run `run-tests.sh` twice and check that it passes without failures. Author: Dongjoon Hyun Closes #16310 from dongjoon-hyun/SPARK-18897. (cherry picked from commit 1169db44bc1d51e68feb6ba2552520b2d660c2c0) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1935bf44 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1935bf44 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1935bf44 Branch: refs/heads/branch-2.0 Commit: 1935bf44605f92fbd4f6e62d23f18bc437130add Parents: d36ed9e Author: Dongjoon Hyun Authored: Fri Dec 16 11:30:21 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 16 11:30:53 2016 -0800 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1935bf44/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index ef6cab1..9b0b41a 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -205,6 +205,7 @@ test_that("create DataFrame from RDD", { c(16)) expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height, c(176.5)) + sql("DROP TABLE people") unsetHiveContext() })
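The one-line fix drops the table at the end of the test body. A hedged variant of the same cleanup, registering it with `on.exit()` so the table is dropped even when an earlier expectation fails (table setup elided):

```r
library(testthat)

test_that("create DataFrame from RDD", {
  # Register cleanup first so it runs even if an expectation below fails;
  # IF EXISTS keeps the drop from erroring if setup never created the table.
  on.exit(sql("DROP TABLE IF EXISTS people"), add = TRUE)

  # ... create and populate the `people` table as in test_sparkSQL.R ...
  expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age,
               c(16))
})
```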
spark git commit: [SPARK-18895][TESTS] Fix resource-closing-related and path-related test failures in identified ones on Windows
Authored: Fri Dec 16 21:32:24 2016 -0800 Committer: Shivaram Venkataraman Committed: Fri Dec 16 21:32:24 2016 -0800 -- .../org/apache/spark/deploy/RPackageUtils.scala | 47  .../spark/metrics/InputOutputMetricsSuite.scala | 6 +-- .../scheduler/EventLoggingListenerSuite.scala | 19  3 files changed, 41 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2bc1c951/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala index 3d2cabc..050778a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RPackageUtils.scala @@ -176,26 +176,31 @@ private[deploy] object RPackageUtils extends Logging { val file = new File(Utils.resolveURI(jarPath)) if (file.exists()) { val jar = new JarFile(file) -if (checkManifestForR(jar)) { - print(s"$file contains R source code. Now installing package.", printStream, Level.INFO) - val rSource = extractRFolder(jar, printStream, verbose) - if (RUtils.rPackages.isEmpty) { -RUtils.rPackages = Some(Utils.createTempDir().getAbsolutePath) - } - try { -if (!rPackageBuilder(rSource, printStream, verbose, RUtils.rPackages.get)) { - print(s"ERROR: Failed to build R package in $file.", printStream) - print(RJarDoc, printStream) +Utils.tryWithSafeFinally { + if (checkManifestForR(jar)) { +print(s"$file contains R source code. Now installing package.", printStream, Level.INFO) +val rSource = extractRFolder(jar, printStream, verbose) +if (RUtils.rPackages.isEmpty) { + RUtils.rPackages = Some(Utils.createTempDir().getAbsolutePath) } - } finally { // clean up -if (!rSource.delete()) { - logWarning(s"Error deleting ${rSource.getPath()}") +try { + if (!rPackageBuilder(rSource, printStream, verbose, RUtils.rPackages.get)) { +print(s"ERROR: Failed to build R package in $file.", printStream) +print(RJarDoc, printStream) + } +} finally { + // clean up + if (!rSource.delete()) { +logWarning(s"Error deleting ${rSource.getPath()}") + } +} + } else { +if (verbose) { + print(s"$file doesn't contain R source code, skipping...", printStream) } } -} else { - if (verbose) { -print(s"$file doesn't contain R source code, skipping...", printStream) - } +} { + jar.close() } } else { print(s"WARN: $file resolved as dependency, but not found.", printStream, Level.WARNING) @@ -231,8 +236,12 @@ private[deploy] object RPackageUtils extends Logging { val zipOutputStream = new ZipOutputStream(new FileOutputStream(zipFile, false)) try { filesToBundle.foreach { file => -// get the relative paths for proper naming in the zip file -val relPath = file.getAbsolutePath.replaceFirst(dir.getAbsolutePath, "") +// Get the relative paths for proper naming in the ZIP file. Note that +// we convert dir to URI to force / and then remove trailing / that show up for +// directories because the separator should always be / for according to ZIP +// specification and therefore `relPath` here should be, for example, +// "/packageTest/def.R" or "/test.R".
+val relPath = file.toURI.toString.replaceFirst(dir.toURI.toString.stripSuffix("/"), "") val fis = new FileInputStream(file) val zipEntry = new ZipEntry(relPath) zipOutputStream.putNextEntry(zipEntry) http://git-wip-us.apache.org/repos/asf/spark/blob/2bc1c951/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index f8054f5..a73b300 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -61,7 +61,7 @@ class InputOutputMetricsSuite extends SparkFunSuite wit
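The key change above is that the `JarFile` is now closed on every code path via `Utils.tryWithSafeFinally`; on Windows, the previously leaked handle kept the jar from being deleted. The analogous close-always idiom in R is `on.exit()`, shown here only as a sketch of the pattern, not code from this patch:

```r
readManifestLine <- function(path) {
  con <- file(path, open = "r")
  # on.exit() runs whether the body returns normally or signals an error,
  # mirroring try/finally: the handle is always released, so the file can
  # be deleted afterwards even on platforms that lock open files.
  on.exit(close(con), add = TRUE)
  readLines(con, n = 1)
}
```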
spark git commit: [SPARK-19130][SPARKR] Support setting literal value as column implicitly
Repository: spark Updated Branches: refs/heads/master 4239a1081 -> d749c0667 [SPARK-19130][SPARKR] Support setting literal value as column implicitly ## What changes were proposed in this pull request? ``` df$foo <- 1 ``` instead of ``` df$foo <- lit(1) ``` ## How was this patch tested? unit tests Author: Felix Cheung Closes #16510 from felixcheung/rlitcol. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d749c066 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d749c066 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d749c066 Branch: refs/heads/master Commit: d749c06677c2fd38377f1c00f542da122b8d Parents: 4239a10 Author: Felix Cheung Authored: Wed Jan 11 08:29:09 2017 -0800 Committer: Shivaram Venkataraman Committed: Wed Jan 11 08:29:09 2017 -0800 -- R/pkg/R/DataFrame.R | 22 +- R/pkg/R/utils.R | 4 R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 ++ 3 files changed, 39 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d749c066/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index c56648a..3d912c9 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1727,14 +1727,21 @@ setMethod("$", signature(x = "SparkDataFrame"), getColumn(x, name) }) -#' @param value a Column or \code{NULL}. If \code{NULL}, the specified Column is dropped. +#' @param value a Column or an atomic vector in the length of 1 as literal value, or \code{NULL}. +#' If \code{NULL}, the specified Column is dropped. #' @rdname select #' @name $<- #' @aliases $<-,SparkDataFrame-method #' @note $<- since 1.4.0 setMethod("$<-", signature(x = "SparkDataFrame"), function(x, name, value) { -stopifnot(class(value) == "Column" || is.null(value)) +if (class(value) != "Column" && !is.null(value)) { + if (isAtomicLengthOne(value)) { +value <- lit(value) + } else { +stop("value must be a Column, literal value as atomic in length of 1, or NULL") + } +} if (is.null(value)) { nx <- drop(x, name) @@ -1947,10 +1954,10 @@ setMethod("selectExpr", #' #' @param x a SparkDataFrame. #' @param colName a column name. -#' @param col a Column expression. +#' @param col a Column expression, or an atomic vector in the length of 1 as literal value. #' @return A SparkDataFrame with the new column added or the existing column replaced. #' @family SparkDataFrame functions -#' @aliases withColumn,SparkDataFrame,character,Column-method +#' @aliases withColumn,SparkDataFrame,character-method #' @rdname withColumn #' @name withColumn #' @seealso \link{rename} \link{mutate} @@ -1963,11 +1970,16 @@ setMethod("selectExpr", #' newDF <- withColumn(df, "newCol", df$col1 * 5) #' # Replace an existing column #' newDF2 <- withColumn(newDF, "newCol", newDF$col1) +#' newDF3 <- withColumn(newDF, "newCol", 42) #' } #' @note withColumn since 1.4.0 setMethod("withColumn", - signature(x = "SparkDataFrame", colName = "character", col = "Column"), + signature(x = "SparkDataFrame", colName = "character"), function(x, colName, col) { +if (class(col) != "Column") { + if (!isAtomicLengthOne(col)) stop("Literal value must be atomic in length of 1") + col <- lit(col) +} sdf <- callJMethod(x@sdf, "withColumn", colName, col@jc) dataFrame(sdf) }) http://git-wip-us.apache.org/repos/asf/spark/blob/d749c066/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 1283449..74b3e50 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -863,3 +863,7 @@ basenameSansExtFromUrl <- function(url) { # then, strip extension by the last '.' 
sub("([^.]+)\\.[[:alnum:]]+$", "\\1", filename) } + +isAtomicLengthOne <- function(x) { + is.atomic(x) && length(x) == 1 +} http://git-wip-us.apache.org/repos/asf/spark/blob/d749c066/R/pkg/inst/tests/testthat/test_sparkSQL.R
spark git commit: [SPARK-19130][SPARKR] Support setting literal value as column implicitly
Repository: spark Updated Branches: refs/heads/branch-2.1 1022049c7 -> 82fcc1330 [SPARK-19130][SPARKR] Support setting literal value as column implicitly ## What changes were proposed in this pull request? ``` df$foo <- 1 ``` instead of ``` df$foo <- lit(1) ``` ## How was this patch tested? unit tests Author: Felix Cheung Closes #16510 from felixcheung/rlitcol. (cherry picked from commit d749c06677c2fd38377f1c00f542da122b8d) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/82fcc133 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/82fcc133 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/82fcc133 Branch: refs/heads/branch-2.1 Commit: 82fcc133040cb5ef32f10df73fcb9fd8914aa9c1 Parents: 1022049 Author: Felix Cheung Authored: Wed Jan 11 08:29:09 2017 -0800 Committer: Shivaram Venkataraman Committed: Wed Jan 11 08:29:30 2017 -0800 -- R/pkg/R/DataFrame.R | 22 +- R/pkg/R/utils.R | 4 R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 ++ 3 files changed, 39 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/82fcc133/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 058a77e..c79b1d3 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1721,14 +1721,21 @@ setMethod("$", signature(x = "SparkDataFrame"), getColumn(x, name) }) -#' @param value a Column or \code{NULL}. If \code{NULL}, the specified Column is dropped. +#' @param value a Column or an atomic vector in the length of 1 as literal value, or \code{NULL}. +#' If \code{NULL}, the specified Column is dropped. #' @rdname select #' @name $<- #' @aliases $<-,SparkDataFrame-method #' @note $<- since 1.4.0 setMethod("$<-", signature(x = "SparkDataFrame"), function(x, name, value) { -stopifnot(class(value) == "Column" || is.null(value)) +if (class(value) != "Column" && !is.null(value)) { + if (isAtomicLengthOne(value)) { +value <- lit(value) + } else { +stop("value must be a Column, literal value as atomic in length of 1, or NULL") + } +} if (is.null(value)) { nx <- drop(x, name) @@ -1941,10 +1948,10 @@ setMethod("selectExpr", #' #' @param x a SparkDataFrame. #' @param colName a column name. -#' @param col a Column expression. +#' @param col a Column expression, or an atomic vector in the length of 1 as literal value. #' @return A SparkDataFrame with the new column added or the existing column replaced. 
#' @family SparkDataFrame functions -#' @aliases withColumn,SparkDataFrame,character,Column-method +#' @aliases withColumn,SparkDataFrame,character-method #' @rdname withColumn #' @name withColumn #' @seealso \link{rename} \link{mutate} @@ -1957,11 +1964,16 @@ setMethod("selectExpr", #' newDF <- withColumn(df, "newCol", df$col1 * 5) #' # Replace an existing column #' newDF2 <- withColumn(newDF, "newCol", newDF$col1) +#' newDF3 <- withColumn(newDF, "newCol", 42) #' } #' @note withColumn since 1.4.0 setMethod("withColumn", - signature(x = "SparkDataFrame", colName = "character", col = "Column"), + signature(x = "SparkDataFrame", colName = "character"), function(x, colName, col) { +if (class(col) != "Column") { + if (!isAtomicLengthOne(col)) stop("Literal value must be atomic in length of 1") + col <- lit(col) +} sdf <- callJMethod(x@sdf, "withColumn", colName, col@jc) dataFrame(sdf) }) http://git-wip-us.apache.org/repos/asf/spark/blob/82fcc133/R/pkg/R/utils.R -- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 1283449..74b3e50 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -863,3 +863,7 @@ basenameSansExtFromUrl <- function(url) { # then, strip extension by the last '.' sub("([^.]+)\\.[[:alnum:]]+$", "\\1", filename) } + +isAtomicLengthOne <- function(x) { + is.atomic(x) && length(x) == 1 +} http://git-wip-us.apache.org/repos/asf/
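In short, the new behavior on both branches (a usage sketch assuming an active SparkR session):

```r
df <- createDataFrame(faithful)       # assumes an active SparkR session

df$foo <- lit(1)                      # explicit literal column, as before
df$foo <- 1                           # now equivalent: wrapped in lit() internally
df2 <- withColumn(df, "newCol", 42)   # literals accepted here too

# Vectors longer than one are rejected by the new isAtomicLengthOne() check:
# df$foo <- c(1, 2)  # Error: value must be a Column, literal value as atomic ...
df$foo <- NULL                        # NULL still drops the column
```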
spark git commit: [SPARK-20877][SPARKR][WIP] add timestamps to test runs
Repository: spark Updated Branches: refs/heads/master 1f5dddffa -> 382fefd18 [SPARK-20877][SPARKR][WIP] add timestamps to test runs ## What changes were proposed in this pull request? to investigate how long they run ## How was this patch tested? Jenkins, AppVeyor Author: Felix Cheung Closes #18104 from felixcheung/rtimetest. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/382fefd1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/382fefd1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/382fefd1 Branch: refs/heads/master Commit: 382fefd1879e4670f3e9e8841ec243e3eb11c578 Parents: 1f5dddf Author: Felix Cheung Authored: Tue May 30 22:33:29 2017 -0700 Committer: Shivaram Venkataraman Committed: Tue May 30 22:33:29 2017 -0700 -- R/pkg/inst/tests/testthat/test_Windows.R| 3 + .../tests/testthat/test_mllib_classification.R | 4 + .../inst/tests/testthat/test_mllib_clustering.R | 2 + R/pkg/inst/tests/testthat/test_mllib_tree.R | 82 R/pkg/inst/tests/testthat/test_sparkSQL.R | 15 R/pkg/inst/tests/testthat/test_utils.R | 3 + R/pkg/tests/run-all.R | 6 ++ 7 files changed, 81 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/382fefd1/R/pkg/inst/tests/testthat/test_Windows.R -- diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R index 919b063..00d684e 100644 --- a/R/pkg/inst/tests/testthat/test_Windows.R +++ b/R/pkg/inst/tests/testthat/test_Windows.R @@ -27,3 +27,6 @@ test_that("sparkJars tag in SparkContext", { abcPath <- testOutput[1] expect_equal(abcPath, "a\\b\\c") }) + +message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT")) +message("elapsed ", (proc.time() - timer_ptm)[3]) http://git-wip-us.apache.org/repos/asf/spark/blob/382fefd1/R/pkg/inst/tests/testthat/test_mllib_classification.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R index c1c7468..82e588d 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_classification.R +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -28,6 +28,8 @@ absoluteSparkPath <- function(x) { } test_that("spark.svmLinear", { + skip_on_cran() + df <- suppressWarnings(createDataFrame(iris)) training <- df[df$Species %in% c("versicolor", "virginica"), ] model <- spark.svmLinear(training, Species ~ ., regParam = 0.01, maxIter = 10) @@ -226,6 +228,8 @@ test_that("spark.logit", { }) test_that("spark.mlp", { + skip_on_cran() + df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), source = "libsvm") model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), http://git-wip-us.apache.org/repos/asf/spark/blob/382fefd1/R/pkg/inst/tests/testthat/test_mllib_clustering.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R index 8f71de1..e827e96 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R +++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R @@ -28,6 +28,8 @@ absoluteSparkPath <- function(x) { } test_that("spark.bisectingKmeans", { + skip_on_cran() + newIris <- iris newIris$Species <- NULL training <- suppressWarnings(createDataFrame(newIris)) http://git-wip-us.apache.org/repos/asf/spark/blob/382fefd1/R/pkg/inst/tests/testthat/test_mllib_tree.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_tree.R b/R/pkg/inst/tests/testthat/test_mllib_tree.R index 5fd6a38..31427ee 100644 --- 
a/R/pkg/inst/tests/testthat/test_mllib_tree.R +++ b/R/pkg/inst/tests/testthat/test_mllib_tree.R @@ -28,6 +28,8 @@ absoluteSparkPath <- function(x) { } test_that("spark.gbt", { + skip_on_cran() + # regression data <- suppressWarnings(createDataFrame(longley)) model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123) @@ -103,10 +105,12 @@ test_that("spark.gbt", { expect_equal(stats$maxDepth, 5) # spark.gbt classification can work on libsvm data - data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"), -source =
spark git commit: [SPARK-20877][SPARKR][WIP] add timestamps to test runs
Repository: spark Updated Branches: refs/heads/branch-2.2 287440df6 -> 3cad66e5e [SPARK-20877][SPARKR][WIP] add timestamps to test runs to investigate how long they run Jenkins, AppVeyor Author: Felix Cheung Closes #18104 from felixcheung/rtimetest. (cherry picked from commit 382fefd1879e4670f3e9e8841ec243e3eb11c578) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3cad66e5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3cad66e5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3cad66e5 Branch: refs/heads/branch-2.2 Commit: 3cad66e5e06a4020a16fa757fbf67f666b319bab Parents: 287440d Author: Felix Cheung Authored: Tue May 30 22:33:29 2017 -0700 Committer: Shivaram Venkataraman Committed: Tue May 30 22:35:44 2017 -0700 -- R/pkg/inst/tests/testthat/test_Windows.R| 3 +++ .../tests/testthat/test_mllib_classification.R | 4 .../inst/tests/testthat/test_mllib_clustering.R | 2 ++ R/pkg/inst/tests/testthat/test_mllib_tree.R | 22 +--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 15 + R/pkg/inst/tests/testthat/test_utils.R | 3 +++ R/pkg/tests/run-all.R | 6 ++ 7 files changed, 47 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3cad66e5/R/pkg/inst/tests/testthat/test_Windows.R -- diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R index 919b063..00d684e 100644 --- a/R/pkg/inst/tests/testthat/test_Windows.R +++ b/R/pkg/inst/tests/testthat/test_Windows.R @@ -27,3 +27,6 @@ test_that("sparkJars tag in SparkContext", { abcPath <- testOutput[1] expect_equal(abcPath, "a\\b\\c") }) + +message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT")) +message("elapsed ", (proc.time() - timer_ptm)[3]) http://git-wip-us.apache.org/repos/asf/spark/blob/3cad66e5/R/pkg/inst/tests/testthat/test_mllib_classification.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R index c1c7468..82e588d 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_classification.R +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -28,6 +28,8 @@ absoluteSparkPath <- function(x) { } test_that("spark.svmLinear", { + skip_on_cran() + df <- suppressWarnings(createDataFrame(iris)) training <- df[df$Species %in% c("versicolor", "virginica"), ] model <- spark.svmLinear(training, Species ~ ., regParam = 0.01, maxIter = 10) @@ -226,6 +228,8 @@ test_that("spark.logit", { }) test_that("spark.mlp", { + skip_on_cran() + df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), source = "libsvm") model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), http://git-wip-us.apache.org/repos/asf/spark/blob/3cad66e5/R/pkg/inst/tests/testthat/test_mllib_clustering.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R index 8f71de1..e827e96 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R +++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R @@ -28,6 +28,8 @@ absoluteSparkPath <- function(x) { } test_that("spark.bisectingKmeans", { + skip_on_cran() + newIris <- iris newIris$Species <- NULL training <- suppressWarnings(createDataFrame(newIris)) http://git-wip-us.apache.org/repos/asf/spark/blob/3cad66e5/R/pkg/inst/tests/testthat/test_mllib_tree.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_tree.R 
b/R/pkg/inst/tests/testthat/test_mllib_tree.R index 4cde1cd..923f535 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_tree.R +++ b/R/pkg/inst/tests/testthat/test_mllib_tree.R @@ -28,6 +28,8 @@ absoluteSparkPath <- function(x) { } test_that("spark.gbt", { + skip_on_cran() + # regression data <- suppressWarnings(createDataFrame(longley)) model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123) @@ -103,10 +105,12 @@ test_that("spark.gbt", { expect_equal(stats$maxDepth, 5) # spark.gbt classification can work on libsvm data - data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_d
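The mechanism behind these timestamps is plain base R: `run-all.R` records a start time in `timer_ptm` before the suites load, and each test file then logs a GMT timestamp and the elapsed wall-clock seconds, as in this condensed sketch of the added lines:

```r
# In R/pkg/tests/run-all.R (condensed): record the start time once.
timer_ptm <- proc.time()

# At the end of a test file, e.g. test_Windows.R, log timestamp and elapsed:
message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT"))
message("elapsed ", (proc.time() - timer_ptm)[3])  # [3] is elapsed (wall) seconds
```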
spark git commit: [SPARK-12224][SPARKR] R support for JDBC source
Repository: spark Updated Branches: refs/heads/master 008a8bbef -> ecd877e83 [SPARK-12224][SPARKR] R support for JDBC source Add R API for `read.jdbc`, `write.jdbc`. Tested this quite a bit manually with different combinations of parameters. It's not clear if we could have automated tests in R for this - Scala `JDBCSuite` depends on Java H2 in-memory database. Refactored some code into util so they could be tested. Core's R SerDe code needs to be updated to allow access to java.util.Properties as `jobj` handle which is required by DataFrameReader/Writer's `jdbc` method. It would be possible, though more code to add a `sql/r/SQLUtils` helper function. Tested: ``` # with postgresql ../bin/sparkR --driver-class-path /usr/share/java/postgresql-9.4.1207.jre7.jar # read.jdbc df <- read.jdbc(sqlContext, "jdbc:postgresql://localhost/db", "films2", user = "user", password = "12345") df <- read.jdbc(sqlContext, "jdbc:postgresql://localhost/db", "films2", user = "user", password = 12345) # partitionColumn and numPartitions test df <- read.jdbc(sqlContext, "jdbc:postgresql://localhost/db", "films2", partitionColumn = "did", lowerBound = 0, upperBound = 200, numPartitions = 4, user = "user", password = 12345) a <- SparkR:::toRDD(df) SparkR:::getNumPartitions(a) [1] 4 SparkR:::collectPartition(a, 2L) # defaultParallelism test df <- read.jdbc(sqlContext, "jdbc:postgresql://localhost/db", "films2", partitionColumn = "did", lowerBound = 0, upperBound = 200, user = "user", password = 12345) SparkR:::getNumPartitions(a) [1] 2 # predicates test df <- read.jdbc(sqlContext, "jdbc:postgresql://localhost/db", "films2", predicates = list("did<=105"), user = "user", password = 12345) count(df) == 1 # write.jdbc, default save mode "error" irisDf <- as.DataFrame(sqlContext, iris) write.jdbc(irisDf, "jdbc:postgresql://localhost/db", "films2", user = "user", password = "12345") "error, already exists" write.jdbc(irisDf, "jdbc:postgresql://localhost/db", "iris", user = "user", password = "12345") ``` Author: felixcheung Closes #10480 from felixcheung/rreadjdbc. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ecd877e8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ecd877e8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ecd877e8 Branch: refs/heads/master Commit: ecd877e8335ff6bb06c96d3045ccade80676e714 Parents: 008a8bb Author: felixcheung Authored: Tue Apr 19 15:59:47 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Apr 19 15:59:47 2016 -0700 -- R/pkg/NAMESPACE | 2 + R/pkg/R/DataFrame.R | 39 - R/pkg/R/SQLContext.R| 58 R/pkg/R/generics.R | 6 ++ R/pkg/R/utils.R | 11 R/pkg/inst/tests/testthat/test_utils.R | 24 .../scala/org/apache/spark/api/r/SerDe.scala| 7 +++ 7 files changed, 146 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ecd877e8/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 94ac7e7..10b9d16 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -101,6 +101,7 @@ exportMethods("arrange", "withColumn", "withColumnRenamed", "write.df", + "write.jdbc", "write.json", "write.parquet", "write.text") @@ -284,6 +285,7 @@ export("as.DataFrame", "loadDF", "parquetFile", "read.df", + "read.jdbc", "read.json", "read.parquet", "read.text", http://git-wip-us.apache.org/repos/asf/spark/blob/ecd877e8/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a64a013..ddb056f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2363,7 +2363,7 @@ setMethod("with", #' @examples \dontrun{ #' # Create a DataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) -#' +#' #' # Show the structure of the DataFrame #' str(irisDF) #'
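Condensing the manual tests above into one round trip (URL and credentials are placeholders):

```r
# Read a table, partitioned on a numeric column across 4 Spark partitions.
df <- read.jdbc(sqlContext, "jdbc:postgresql://localhost/db", "films2",
                partitionColumn = "did", lowerBound = 0, upperBound = 200,
                numPartitions = 4, user = "user", password = "12345")

# The default save mode is "error"; "overwrite" replaces an existing table
# instead of failing with "already exists".
write.jdbc(df, "jdbc:postgresql://localhost/db", "films2_copy",
           mode = "overwrite", user = "user", password = "12345")
```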
spark git commit: [SPARK-13905][SPARKR] Change signature of as.data.frame() to be consistent with the R base package.
Repository: spark Updated Branches: refs/heads/master 4514aebd1 -> 8eedf0b55 [SPARK-13905][SPARKR] Change signature of as.data.frame() to be consistent with the R base package. ## What changes were proposed in this pull request? Change the signature of as.data.frame() to be consistent with that in the R base package to meet R user's convention. ## How was this patch tested? dev/lint-r SparkR unit tests Author: Sun Rui Closes #11811 from sun-rui/SPARK-13905. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8eedf0b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8eedf0b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8eedf0b5 Branch: refs/heads/master Commit: 8eedf0b553180d0e958b0fb49bc2fee81658495c Parents: 4514aeb Author: Sun Rui Authored: Tue Apr 19 19:57:03 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Apr 19 19:57:03 2016 -0700 -- R/pkg/R/DataFrame.R | 8 ++-- R/pkg/R/generics.R| 5 - R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 +++ 4 files changed, 10 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8eedf0b5/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ddb056f..95e2eb2 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2296,12 +2296,8 @@ setMethod("fillna", #' } setMethod("as.data.frame", signature(x = "DataFrame"), - function(x, ...) { -# Check if additional parameters have been passed -if (length(list(...)) > 0) { - stop(paste("Unused argument(s): ", paste(list(...), collapse = ", "))) -} -collect(x) + function(x, row.names = NULL, optional = FALSE, ...) { +as.data.frame(collect(x), row.names, optional, ...) }) #' The specified DataFrame is attached to the R search path. This means that http://git-wip-us.apache.org/repos/asf/spark/blob/8eedf0b5/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 4ef05d5..a71be55 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -397,7 +397,10 @@ setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") }) #' @rdname as.data.frame #' @export -setGeneric("as.data.frame") +setGeneric("as.data.frame", + function(x, row.names = NULL, optional = FALSE, ...) 
{ + standardGeneric("as.data.frame") + }) #' @rdname attach #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/8eedf0b5/R/pkg/inst/tests/testthat/test_context.R -- diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index 6e06c97..9f51161 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -26,7 +26,7 @@ test_that("Check masked functions", { maskedBySparkR <- masked[funcSparkROrEmpty] namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", - "summary", "transform", "drop", "window") + "summary", "transform", "drop", "window", "as.data.frame") expect_equal(length(maskedBySparkR), length(namesOfMasked)) expect_equal(sort(maskedBySparkR), sort(namesOfMasked)) # above are those reported as masked when `library(SparkR)` http://git-wip-us.apache.org/repos/asf/spark/blob/8eedf0b5/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index d747d4f..2f65484 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1863,6 +1863,9 @@ test_that("Method as.data.frame as a synonym for collect()", { expect_equal(as.data.frame(irisDF), collect(irisDF)) irisDF2 <- irisDF[irisDF$Species == "setosa", ] expect_equal(as.data.frame(irisDF2), collect(irisDF2)) + + # Make sure as.data.frame in the R base package is not covered + expect_that(as.data.frame(c(1, 2)), not(throws_error())) }) test_that("attach() on a DataFrame", { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-13178] RRDD faces a concurrency issue in the case of rdd.zip(rdd).count().
Repository: spark Updated Branches: refs/heads/master aeb52bea5 -> 1a7fc74cc [SPARK-13178] RRDD faces a concurrency issue in the case of rdd.zip(rdd).count(). ## What changes were proposed in this pull request? The concurrency issue reported in SPARK-13178 was fixed by the PR https://github.com/apache/spark/pull/10947 for SPARK-12792. This PR just removes a workaround that is no longer needed. ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #12606 from sun-rui/SPARK-13178. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a7fc74c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a7fc74c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a7fc74c Branch: refs/heads/master Commit: 1a7fc74ccf1b98d929aa4b2ab45c24d4c3d42c1a Parents: aeb52be Author: Sun Rui Authored: Fri Apr 22 11:19:52 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Apr 22 11:19:52 2016 -0700 -- R/pkg/inst/tests/testthat/test_mllib.R | 2 -- 1 file changed, 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a7fc74c/R/pkg/inst/tests/testthat/test_mllib.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 47bbf7e..1597306 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -131,8 +131,6 @@ test_that("kmeans", { newIris$Species <- NULL training <- suppressWarnings(createDataFrame(sqlContext, newIris)) - # Cache the DataFrame here to work around the bug SPARK-13178. - cache(training) take(training, 1) model <- kmeans(x = training, centers = 2)
spark git commit: [SPARK-14594][SPARKR] check execution return status code
Repository: spark Updated Branches: refs/heads/master 6acc72a02 -> 39d3bc62a [SPARK-14594][SPARKR] check execution return status code ## What changes were proposed in this pull request? When the JVM backend fails without going through proper error handling (e.g., the process crashed), the R error message can be ambiguous. ``` Error in if (returnStatus != 0) { : argument is of length zero ``` This change attempts to make it clearer (however, one would still need to investigate why the JVM fails). ## How was this patch tested? manually Author: felixcheung Closes #12622 from felixcheung/rreturnstatus. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39d3bc62 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39d3bc62 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39d3bc62 Branch: refs/heads/master Commit: 39d3bc62a7ba16c646bed8d524cf9b929374a790 Parents: 6acc72a Author: felixcheung Authored: Sat Apr 23 11:08:19 2016 -0700 Committer: Shivaram Venkataraman Committed: Sat Apr 23 11:08:19 2016 -0700 -- R/pkg/R/backend.R | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39d3bc62/R/pkg/R/backend.R -- diff --git a/R/pkg/R/backend.R b/R/pkg/R/backend.R index 4916283..6c81492 100644 --- a/R/pkg/R/backend.R +++ b/R/pkg/R/backend.R @@ -110,6 +110,9 @@ invokeJava <- function(isStatic, objId, methodName, ...) { # TODO: check the status code to output error information returnStatus <- readInt(conn) + if (length(returnStatus) == 0) { +stop("No status is returned. Java SparkR backend might have failed.") + } if (returnStatus != 0) { stop(readString(conn)) }
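The ambiguity comes from R's `if` rejecting zero-length conditions: when the backend process dies, `readInt(conn)` returns `integer(0)` rather than a status code. A standalone illustration:

```r
returnStatus <- integer(0)        # what readInt() yields from a dead backend

# if (returnStatus != 0) ...      # Error in if ... : argument is of length zero

if (length(returnStatus) == 0) {  # the added guard gives an actionable message
  stop("No status is returned. Java SparkR backend might have failed.")
}
```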
spark git commit: [SPARK-14883][DOCS] Fix wrong R examples and make them up-to-date
Repository: spark Updated Branches: refs/heads/master 35319d326 -> 6ab4d9e0c [SPARK-14883][DOCS] Fix wrong R examples and make them up-to-date ## What changes were proposed in this pull request? This issue aims to fix some errors in R examples and make them up-to-date in docs and example modules. - Remove the wrong usage of `map`. We need to use `lapply` in `sparkR` if needed. However, `lapply` is private so far. The corrected example will be added later. - Fix the wrong example in Section `Generic Load/Save Functions` of `docs/sql-programming-guide.md` for consistency - Fix datatypes in `sparkr.md`. - Update a data result in `sparkr.md`. - Replace deprecated functions to remove warnings: jsonFile -> read.json, parquetFile -> read.parquet - Use up-to-date R-like functions: loadDF -> read.df, saveDF -> write.df, saveAsParquetFile -> write.parquet - Replace `SparkR DataFrame` with `SparkDataFrame` in `dataframe.R` and `data-manipulation.R`. - Other minor syntax fixes and a typo. ## How was this patch tested? Manual. Author: Dongjoon Hyun Closes #12649 from dongjoon-hyun/SPARK-14883. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ab4d9e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ab4d9e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ab4d9e0 Branch: refs/heads/master Commit: 6ab4d9e0c76b69b4d6d5f39037a77bdfb042be19 Parents: 35319d3 Author: Dongjoon Hyun Authored: Sun Apr 24 22:10:27 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun Apr 24 22:10:27 2016 -0700 -- R/pkg/R/DataFrame.R | 2 +- docs/sparkr.md | 11 +- docs/sql-programming-guide.md | 30 examples/src/main/r/data-manipulation.R | 22 ++-- examples/src/main/r/dataframe.R | 2 +- 5 files changed, 31 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ab4d9e0/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 3b2fd73..890d15d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -845,7 +845,7 @@ setMethod("ncol", length(columns(x)) }) -#' Returns the dimentions (number of rows and columns) of a SparkDataFrame +#' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame #' #' @family SparkDataFrame functions http://git-wip-us.apache.org/repos/asf/spark/blob/6ab4d9e0/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index a0b4f93..760534a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -141,7 +141,7 @@ head(people) # SparkR automatically infers the schema from the JSON file printSchema(people) # root -# |-- age: integer (nullable = true) +# |-- age: long (nullable = true) # |-- name: string (nullable = true) {% endhighlight %} @@ -195,7 +195,7 @@ df <- createDataFrame(sqlContext, faithful) # Get basic information about the DataFrame df -## DataFrame[eruptions:double, waiting:double] +## SparkDataFrame[eruptions:double, waiting:double] # Select only the "eruptions" column head(select(df, df$eruptions)) @@ -228,14 +228,13 @@ SparkR data frames support a number of commonly used functions to aggregate data # We use the `n` operator to count the number of times each waiting time appears head(summarize(groupBy(df, df$waiting), count = n(df$waiting))) ## waiting count -##1 8113 -##2 60 6 -##3 68 1 +##1 70 4 +##2 67 1 +##3 69 2 # We can also sort the output from the aggregation to get the most common waiting times waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting)) 
head(arrange(waiting_counts, desc(waiting_counts$count))) - ## waiting count ##1 7815 ##2 8314 http://git-wip-us.apache.org/repos/asf/spark/blob/6ab4d9e0/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 77887f4..9a3db9c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -173,7 +173,7 @@ df.show() {% highlight r %} sqlContext <- SQLContext(sc) -df <- jsonFile(sqlContext, "examples/src/main/resources/people.json") +df <- read.json(sqlContext, "examples/src/main/resources/people.json") # Displays the content of the DataFrame to stdout showDF(df) @@ -366,7 +366,7 @@ In addition to simple column references and expressions, DataFrames
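For reference, the deprecated-to-current API replacements applied across the docs, collected into one sketch (the path is the sample file the guides use):

```r
path <- "examples/src/main/resources/people.json"

df  <- read.json(sqlContext, path)                  # was: jsonFile()
df2 <- read.df(sqlContext, path, source = "json")   # was: loadDF()
write.df(df, "people-json", source = "json")        # was: saveDF()
write.parquet(df, "people.parquet")                 # was: saveAsParquetFile()
df3 <- read.parquet(sqlContext, "people.parquet")   # was: parquetFile()
```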
spark git commit: [SPARKR][DOC] SparkR ML user guides update for 2.0
Repository: spark Updated Branches: refs/heads/master 840853ed0 -> 2ad031be6 [SPARKR][DOC] SparkR ML user guides update for 2.0 ## What changes were proposed in this pull request? * Update SparkR ML section to make them consistent with SparkR API docs. * Since #13972 adds labelling support for the ```include_example``` Jekyll plugin, so that we can split the single ```ml.R``` example file into multiple line blocks with different labels, and include them in different algorithms/models in the generated HTML page. ## How was this patch tested? Only docs update, manually check the generated docs. Author: Yanbo Liang Closes #14011 from yanboliang/r-user-guide-update. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2ad031be Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2ad031be Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2ad031be Branch: refs/heads/master Commit: 2ad031be67c7a0f0c4895c084c891330a9ec935e Parents: 840853e Author: Yanbo Liang Authored: Mon Jul 11 14:31:11 2016 -0700 Committer: Shivaram Venkataraman Committed: Mon Jul 11 14:31:11 2016 -0700 -- R/pkg/R/mllib.R | 8 +--- docs/sparkr.md | 43 +-- examples/src/main/r/ml.R | 22 +++--- 3 files changed, 41 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2ad031be/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 4fe7367..e9fd0c7 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -55,8 +55,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' Generalized Linear Models #' -#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the -#' produced model and save the model to the input path. +#' Fits generalized linear model against a Spark DataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -270,7 +271,8 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' K-Means Clustering Model #' #' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). -#' Users can print, make predictions on the produced model and save the model to the input path. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' #' @param data SparkDataFrame for training #' @param formula A symbolic description of the model to be fitted. Currently only a few formula http://git-wip-us.apache.org/repos/asf/spark/blob/2ad031be/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 32ef815..b4acb23 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -355,32 +355,39 @@ head(teenagers) # Machine Learning -SparkR supports the following Machine Learning algorithms. +SparkR supports the following machine learning algorithms currently: `Generalized Linear Model`, `Accelerated Failure Time (AFT) Survival Regression Model`, `Naive Bayes Model` and `KMeans Model`. +Under the hood, SparkR uses MLlib to train the model. 
+Users can call `summary` to print a summary of the fitted model, [predict](api/R/predict.html) to make predictions on new data, and [write.ml](api/R/write.ml.html)/[read.ml](api/R/read.ml.html) to save/load fitted models. +SparkR supports a subset of the available R formula operators for model fitting, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-‘. -* Generalized Linear Regression Model [spark.glm()](api/R/spark.glm.html) -* Naive Bayes [spark.naiveBayes()](api/R/spark.naiveBayes.html) -* KMeans [spark.kmeans()](api/R/spark.kmeans.html) -* AFT Survival Regression [spark.survreg()](api/R/spark.survreg.html) +## Algorithms -[Generalized Linear Regression](api/R/spark.glm.html) can be used to train a model from a specified family. Currently the Gaussian, Binomial, Poisson and Gamma families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. +### Generalized Linear Model -The [summary()](api/R/summary.html) function gives the su
spark git commit: [SPARKR][DOC] SparkR ML user guides update for 2.0
Repository: spark Updated Branches: refs/heads/branch-2.0 aea33bf05 -> b938ca76e [SPARKR][DOC] SparkR ML user guides update for 2.0 ## What changes were proposed in this pull request? * Update SparkR ML section to make them consistent with SparkR API docs. * Since #13972 adds labelling support for the ```include_example``` Jekyll plugin, so that we can split the single ```ml.R``` example file into multiple line blocks with different labels, and include them in different algorithms/models in the generated HTML page. ## How was this patch tested? Only docs update, manually check the generated docs. Author: Yanbo Liang Closes #14011 from yanboliang/r-user-guide-update. (cherry picked from commit 2ad031be67c7a0f0c4895c084c891330a9ec935e) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b938ca76 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b938ca76 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b938ca76 Branch: refs/heads/branch-2.0 Commit: b938ca76ebd92e17233addfc29cb7c3692957a7b Parents: aea33bf Author: Yanbo Liang Authored: Mon Jul 11 14:31:11 2016 -0700 Committer: Shivaram Venkataraman Committed: Mon Jul 11 14:31:18 2016 -0700 -- R/pkg/R/mllib.R | 8 +--- docs/sparkr.md | 43 +-- examples/src/main/r/ml.R | 22 +++--- 3 files changed, 41 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b938ca76/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 4fe7367..e9fd0c7 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -55,8 +55,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' Generalized Linear Models #' -#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the -#' produced model and save the model to the input path. +#' Fits generalized linear model against a Spark DataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -270,7 +271,8 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' K-Means Clustering Model #' #' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). -#' Users can print, make predictions on the produced model and save the model to the input path. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' #' @param data SparkDataFrame for training #' @param formula A symbolic description of the model to be fitted. Currently only a few formula http://git-wip-us.apache.org/repos/asf/spark/blob/b938ca76/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 32ef815..b4acb23 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -355,32 +355,39 @@ head(teenagers) # Machine Learning -SparkR supports the following Machine Learning algorithms. +SparkR supports the following machine learning algorithms currently: `Generalized Linear Model`, `Accelerated Failure Time (AFT) Survival Regression Model`, `Naive Bayes Model` and `KMeans Model`. +Under the hood, SparkR uses MLlib to train the model. 
+Users can call `summary` to print a summary of the fitted model, [predict](api/R/predict.html) to make predictions on new data, and [write.ml](api/R/write.ml.html)/[read.ml](api/R/read.ml.html) to save/load fitted models. +SparkR supports a subset of the available R formula operators for model fitting, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-‘. -* Generalized Linear Regression Model [spark.glm()](api/R/spark.glm.html) -* Naive Bayes [spark.naiveBayes()](api/R/spark.naiveBayes.html) -* KMeans [spark.kmeans()](api/R/spark.kmeans.html) -* AFT Survival Regression [spark.survreg()](api/R/spark.survreg.html) +## Algorithms -[Generalized Linear Regression](api/R/spark.glm.html) can be used to train a model from a specified family. Currently the Gaussian, Binomial, Poisson and Gamma families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+'
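The workflow the updated guide describes, condensed into a short sketch (SparkR converts the dots in the iris column names to underscores, hence `Sepal_Length`):

```r
training <- suppressWarnings(createDataFrame(sqlContext, iris))

model <- spark.glm(training, Sepal_Length ~ Sepal_Width, family = "gaussian")
summary(model)                     # coefficients, deviance, etc.

preds <- predict(model, training)  # appends a "prediction" column
head(select(preds, "Sepal_Length", "prediction"))
```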
spark git commit: [SPARK-16144][SPARKR] update R API doc for mllib
Repository: spark Updated Branches: refs/heads/master 2ad031be6 -> 7f38b9d5f [SPARK-16144][SPARKR] update R API doc for mllib ## What changes were proposed in this pull request? From SPARK-16140/PR #13921 - the issue is that we left the write.ml doc empty: ![image](https://cloud.githubusercontent.com/assets/8969467/16481934/856dd0ea-3e62-11e6-9474-e4d57d1ca001.png) Here's what I meant as the fix: ![image](https://cloud.githubusercontent.com/assets/8969467/16481943/911f02ec-3e62-11e6-9d68-17363a9f5628.png) ![image](https://cloud.githubusercontent.com/assets/8969467/16481950/9bc057aa-3e62-11e6-8127-54870701c4b1.png) I didn't realize there was already a JIRA on this. mengxr yanboliang ## How was this patch tested? check doc generated. Author: Felix Cheung Closes #13993 from felixcheung/rmllibdoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f38b9d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f38b9d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f38b9d5 Branch: refs/heads/master Commit: 7f38b9d5f469b2550bc481cbf9adb9acc3779712 Parents: 2ad031b Author: Felix Cheung Authored: Mon Jul 11 14:34:48 2016 -0700 Committer: Shivaram Venkataraman Committed: Mon Jul 11 14:34:48 2016 -0700 -- R/pkg/R/generics.R | 2 -- R/pkg/R/mllib.R| 36 ++-- 2 files changed, 30 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f38b9d5/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e4ec508..df057bd 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1255,7 +1255,6 @@ setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.gl #' @export setGeneric("glm") -#' predict #' @rdname predict #' @export setGeneric("predict", function(object, ...) { standardGeneric("predict") }) @@ -1280,7 +1279,6 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s #' @export setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") }) -#' write.ml #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/7f38b9d5/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index e9fd0c7..94e1f65 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,6 +53,29 @@ setMethod("fitted"... wait
-#' + #' @param object A fitted generalized linear model #' @return \code{summary} returns a summary object of the fitted model, a list of components #' including at least the coefficients, null/residual deviance, null/residual degrees @@ -186,7 +209,7 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), }) # Prints the summary of GeneralizedLinearRegressionModel -#' + #' @rdname spark.glm #' @param x Summary object of fitted generalized linear model returned by \code{summary} function #' @export @@ -345,7 +368,7 @@ setMethod("fitted", signature(object = "KMeansModel")
spark git commit: [SPARK-16144][SPARKR] update R API doc for mllib
Repository: spark Updated Branches: refs/heads/branch-2.0 b938ca76e -> cb463b6db [SPARK-16144][SPARKR] update R API doc for mllib ## What changes were proposed in this pull request? From SPARK-16140/PR #13921 - the issue is that we left the write.ml doc empty: ![image](https://cloud.githubusercontent.com/assets/8969467/16481934/856dd0ea-3e62-11e6-9474-e4d57d1ca001.png) Here's what I meant as the fix: ![image](https://cloud.githubusercontent.com/assets/8969467/16481943/911f02ec-3e62-11e6-9d68-17363a9f5628.png) ![image](https://cloud.githubusercontent.com/assets/8969467/16481950/9bc057aa-3e62-11e6-8127-54870701c4b1.png) I didn't realize there was already a JIRA on this. mengxr yanboliang ## How was this patch tested? check doc generated. Author: Felix Cheung Closes #13993 from felixcheung/rmllibdoc. (cherry picked from commit 7f38b9d5f469b2550bc481cbf9adb9acc3779712) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cb463b6d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cb463b6d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cb463b6d Branch: refs/heads/branch-2.0 Commit: cb463b6db30491e4e881b8fb5981dfdbf9e73d34 Parents: b938ca7 Author: Felix Cheung Authored: Mon Jul 11 14:34:48 2016 -0700 Committer: Shivaram Venkataraman Committed: Mon Jul 11 14:34:57 2016 -0700 -- R/pkg/R/generics.R | 2 -- R/pkg/R/mllib.R| 36 ++-- 2 files changed, 30 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cb463b6d/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e4ec508..df057bd 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1255,7 +1255,6 @@ setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.gl #' @export setGeneric("glm") -#' predict #' @rdname predict #' @export setGeneric("predict", function(object, ...) { standardGeneric("predict") }) @@ -1280,7 +1279,6 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s #' @export setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") }) -#' write.ml #' @rdname write.ml #' @export setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") }) http://git-wip-us.apache.org/repos/asf/spark/blob/cb463b6d/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index e9fd0c7..94e1f65 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,6 +53,29 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) +#' Saves the MLlib model to the input path +#' +#' Saves the MLlib model to the input path. For more information, see the specific +#' MLlib model below. +#' @rdname write.ml +#' @name write.ml +#' @export +#' @seealso \link{spark.glm}, \link{glm} +#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} +#' @seealso \link{read.ml} +NULL + +#' Makes predictions from a MLlib model +#' +#' Makes predictions from a MLlib model. For more information, see the specific +#' MLlib model below. +#' @rdname predict +#' @name predict +#' @export +#' @seealso \link{spark.glm}, \link{glm} +#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg} +NULL + #' Generalized Linear Models #' #' Fits generalized linear model against a Spark DataFrame. 
@@ -146,7 +169,7 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat }) # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). -#' + #' @param object A fitted generalized linear model #' @return \code{summary} returns a summary object of the fitted model, a list of components #' including at least the coefficients, null/residual deviance, null/residual degrees @@ -186,7 +209,7 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), }) # Prints the summary of GeneralizedLinearRegressionModel -#' + #' @rdname spark.glm #' @param x Summary object of fitted generalized linear model returned by \code{summary}
spark git commit: [SPARKR][MINOR] R examples and test updates
Repository: spark Updated Branches: refs/heads/master 51a6706b1 -> b4baf086c [SPARKR][MINOR] R examples and test updates ## What changes were proposed in this pull request? Minor example updates ## How was this patch tested? manual shivaram Author: Felix Cheung Closes #14171 from felixcheung/rexample. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b4baf086 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b4baf086 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b4baf086 Branch: refs/heads/master Commit: b4baf086ca380a46d953f2710184ad9eee3a045e Parents: 51a6706 Author: Felix Cheung Authored: Wed Jul 13 13:33:34 2016 -0700 Committer: Shivaram Venkataraman Committed: Wed Jul 13 13:33:34 2016 -0700 -- R/pkg/inst/tests/testthat/jarTest.R | 2 +- R/pkg/inst/tests/testthat/packageInAJarTest.R | 2 +- examples/src/main/r/RSparkSQLExample.R| 3 +++ examples/src/main/r/dataframe.R | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b4baf086/R/pkg/inst/tests/testthat/jarTest.R -- diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R index 84e4845..51754a4 100644 --- a/R/pkg/inst/tests/testthat/jarTest.R +++ b/R/pkg/inst/tests/testthat/jarTest.R @@ -16,7 +16,7 @@ # library(SparkR) -sparkSession <- sparkR.session() +sparkR.session() helloTest <- SparkR:::callJStatic("sparkR.test.hello", "helloWorld", http://git-wip-us.apache.org/repos/asf/spark/blob/b4baf086/R/pkg/inst/tests/testthat/packageInAJarTest.R -- diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R index 940c91f..4bc935c 100644 --- a/R/pkg/inst/tests/testthat/packageInAJarTest.R +++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R @@ -17,7 +17,7 @@ library(SparkR) library(sparkPackageTest) -sparkSession <- sparkR.session() +sparkR.session() run1 <- myfunc(5L) http://git-wip-us.apache.org/repos/asf/spark/blob/b4baf086/examples/src/main/r/RSparkSQLExample.R -- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index eba3f1b..f20875c 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -195,3 +195,6 @@ results <- collect(sql("FROM src SELECT key, value")) # $example on:jdbc$ df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password") # $example off:jdbc$ + +# Stop the SparkSession now +sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/b4baf086/examples/src/main/r/dataframe.R -- diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R index 295f9b4..82b85f2 100644 --- a/examples/src/main/r/dataframe.R +++ b/examples/src/main/r/dataframe.R @@ -18,7 +18,7 @@ library(SparkR) # Initialize SparkSession -sc <- sparkR.session(appName = "SparkR-DataFrame-example") +sparkR.session(appName = "SparkR-DataFrame-example") # Create a simple local data.frame localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
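The session-handling pattern these examples converge on — calling sparkR.session() for its side effect rather than capturing the return value, and stopping explicitly at the end — looks like this as a minimal sketch (the app name is illustrative; the data is taken from the dataframe.R example above):

```r
library(SparkR)

# The returned session object need not be assigned; the call installs a
# default session that subsequent SparkR functions pick up automatically.
sparkR.session(appName = "SparkR-example")

df <- createDataFrame(data.frame(name = c("John", "Smith", "Sarah"),
                                 age = c(19, 23, 18)))
head(df)

# Stop the SparkSession now
sparkR.session.stop()
```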
spark git commit: [SPARKR][MINOR] R examples and test updates
Repository: spark Updated Branches: refs/heads/branch-2.0 86adc5cfb -> 18255a934 [SPARKR][MINOR] R examples and test updates ## What changes were proposed in this pull request? Minor example updates ## How was this patch tested? manual shivaram Author: Felix Cheung Closes #14171 from felixcheung/rexample. (cherry picked from commit b4baf086ca380a46d953f2710184ad9eee3a045e) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18255a93 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18255a93 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18255a93 Branch: refs/heads/branch-2.0 Commit: 18255a9345dd711bf630993c582511efa74b7919 Parents: 86adc5c Author: Felix Cheung Authored: Wed Jul 13 13:33:34 2016 -0700 Committer: Shivaram Venkataraman Committed: Wed Jul 13 13:33:47 2016 -0700 -- R/pkg/inst/tests/testthat/jarTest.R | 2 +- R/pkg/inst/tests/testthat/packageInAJarTest.R | 2 +- examples/src/main/r/RSparkSQLExample.R| 3 +++ examples/src/main/r/dataframe.R | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18255a93/R/pkg/inst/tests/testthat/jarTest.R -- diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R index 84e4845..51754a4 100644 --- a/R/pkg/inst/tests/testthat/jarTest.R +++ b/R/pkg/inst/tests/testthat/jarTest.R @@ -16,7 +16,7 @@ # library(SparkR) -sparkSession <- sparkR.session() +sparkR.session() helloTest <- SparkR:::callJStatic("sparkR.test.hello", "helloWorld", http://git-wip-us.apache.org/repos/asf/spark/blob/18255a93/R/pkg/inst/tests/testthat/packageInAJarTest.R -- diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R index 940c91f..4bc935c 100644 --- a/R/pkg/inst/tests/testthat/packageInAJarTest.R +++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R @@ -17,7 +17,7 @@ library(SparkR) library(sparkPackageTest) -sparkSession <- sparkR.session() +sparkR.session() run1 <- myfunc(5L) http://git-wip-us.apache.org/repos/asf/spark/blob/18255a93/examples/src/main/r/RSparkSQLExample.R -- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index eba3f1b..f20875c 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -195,3 +195,6 @@ results <- collect(sql("FROM src SELECT key, value")) # $example on:jdbc$ df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password") # $example off:jdbc$ + +# Stop the SparkSession now +sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/18255a93/examples/src/main/r/dataframe.R -- diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R index 295f9b4..82b85f2 100644 --- a/examples/src/main/r/dataframe.R +++ b/examples/src/main/r/dataframe.R @@ -18,7 +18,7 @@ library(SparkR) # Initialize SparkSession -sc <- sparkR.session(appName = "SparkR-DataFrame-example") +sparkR.session(appName = "SparkR-DataFrame-example") # Create a simple local data.frame localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
spark git commit: [SPARKR][DOCS][MINOR] R programming guide to include csv data source example
Repository: spark Updated Branches: refs/heads/master b4baf086c -> fb2e8eeb0 [SPARKR][DOCS][MINOR] R programming guide to include csv data source example ## What changes were proposed in this pull request? Minor documentation update for code example, code style, and missed reference to "sparkR.init" ## How was this patch tested? manual shivaram Author: Felix Cheung Closes #14178 from felixcheung/rcsvprogrammingguide. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb2e8eeb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb2e8eeb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb2e8eeb Branch: refs/heads/master Commit: fb2e8eeb0b1e56bea535165f7a3bec6558b3f4a3 Parents: b4baf08 Author: Felix Cheung Authored: Wed Jul 13 15:09:23 2016 -0700 Committer: Shivaram Venkataraman Committed: Wed Jul 13 15:09:23 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- docs/sparkr.md| 27 +- 2 files changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb2e8eeb/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index fdd6020..e61fa41 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -237,7 +237,7 @@ test_that("read csv as DataFrame", { "Empty,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.string = "Empty") + df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty") expect_equal(count(df2), 4) withoutna2 <- na.omit(df2, how = "any", cols = "year") expect_equal(count(withoutna2), 3) http://git-wip-us.apache.org/repos/asf/spark/blob/fb2e8eeb/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index b4acb23..9fda0ec 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -111,19 +111,17 @@ head(df) SparkR supports operating on a variety of data sources through the `SparkDataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources. The general method for creating SparkDataFrames from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active SparkSession will be used automatically. SparkR supports reading JSON, CSV and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by -specifying `--packages` with `spark-submit` or `sparkR` commands, or if creating context through `init` -you can specify the packages with the `packages` argument. +specifying `--packages` with `spark-submit` or `sparkR` commands, or if initializing SparkSession with `sparkPackages` parameter when in an interactive R shell or from RStudio. {% highlight r %} -sc <- sparkR.session(sparkPackages="com.databricks:spark-avro_2.11:3.0.0") +sc <- sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") {% endhighlight %} We can see how to use data sources using an example JSON input file. 
Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. - {% highlight r %} people <- read.df("./examples/src/main/resources/people.json", "json") head(people) @@ -138,6 +136,18 @@ printSchema(people) # |-- age: long (nullable = true) # |-- name: string (nullable = true) +# Similarly, multiple files can be read with read.json +people <- read.json(c("./examples/src/main/resources/people.json", "./examples/src/main/resources/people2.json")) + +{% endhighlight %} + + +The data sources API natively supports CSV formatted input files. For more information please refer to SparkR [read.df](api/R/read.df.html) API documentation. + + +{% highlight r %} +df <- read.df(csvPath, &
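The CSV snippet added to the guide is cut off above; here is a sketch of the documented read.df call for CSV input, using the option names from the accompanying test change (the file path is illustrative):

```r
# Read a CSV file into a SparkDataFrame, inferring the schema and treating
# "NA" strings as missing values
csvPath <- "./examples/src/main/resources/people.csv"
df <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "NA")
head(df)
```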
spark git commit: [SPARKR][DOCS][MINOR] R programming guide to include csv data source example
Repository: spark Updated Branches: refs/heads/branch-2.0 18255a934 -> 9e3a59858 [SPARKR][DOCS][MINOR] R programming guide to include csv data source example ## What changes were proposed in this pull request? Minor documentation update for code example, code style, and missed reference to "sparkR.init" ## How was this patch tested? manual shivaram Author: Felix Cheung Closes #14178 from felixcheung/rcsvprogrammingguide. (cherry picked from commit fb2e8eeb0b1e56bea535165f7a3bec6558b3f4a3) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e3a5985 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e3a5985 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e3a5985 Branch: refs/heads/branch-2.0 Commit: 9e3a598582c747194188f8ad15b43aca03907bae Parents: 18255a9 Author: Felix Cheung Authored: Wed Jul 13 15:09:23 2016 -0700 Committer: Shivaram Venkataraman Committed: Wed Jul 13 15:09:31 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- docs/sparkr.md| 27 +- 2 files changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e3a5985/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index bd7b5f0..e26b015 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -237,7 +237,7 @@ test_that("read csv as DataFrame", { "Empty,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.string = "Empty") + df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty") expect_equal(count(df2), 4) withoutna2 <- na.omit(df2, how = "any", cols = "year") expect_equal(count(withoutna2), 3) http://git-wip-us.apache.org/repos/asf/spark/blob/9e3a5985/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index b4acb23..9fda0ec 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -111,19 +111,17 @@ head(df) SparkR supports operating on a variety of data sources through the `SparkDataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources. The general method for creating SparkDataFrames from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active SparkSession will be used automatically. SparkR supports reading JSON, CSV and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by -specifying `--packages` with `spark-submit` or `sparkR` commands, or if creating context through `init` -you can specify the packages with the `packages` argument. +specifying `--packages` with `spark-submit` or `sparkR` commands, or if initializing SparkSession with `sparkPackages` parameter when in an interactive R shell or from RStudio. 
{% highlight r %} -sc <- sparkR.session(sparkPackages="com.databricks:spark-avro_2.11:3.0.0") +sc <- sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") {% endhighlight %} We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. - {% highlight r %} people <- read.df("./examples/src/main/resources/people.json", "json") head(people) @@ -138,6 +136,18 @@ printSchema(people) # |-- age: long (nullable = true) # |-- name: string (nullable = true) +# Similarly, multiple files can be read with read.json +people <- read.json(c("./examples/src/main/resources/people.json", "./examples/src/main/resources/people2.json")) + +{% endhighlight %} + + +The data sources API natively supports CSV formatted input files. For more information please refer t
spark git commit: [SPARK-16509][SPARKR] Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy.
Repository: spark Updated Branches: refs/heads/branch-2.0 240c42b28 -> 4e9080f44 [SPARK-16509][SPARKR] Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy. ## What changes were proposed in this pull request? Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy to pass CRAN package check. ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #14192 from sun-rui/SPARK-16509. (cherry picked from commit 093ebbc628699b40f091b5b7083c119fffa9314b) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e9080f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e9080f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e9080f4 Branch: refs/heads/branch-2.0 Commit: 4e9080f44431bc4b91118d53f9234420f2f3d584 Parents: 240c42b Author: Sun Rui Authored: Thu Jul 14 09:38:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:38:51 2016 -0700 -- R/pkg/NAMESPACE | 4 +- R/pkg/R/WindowSpec.R | 4 +- R/pkg/R/generics.R| 8 ++-- R/pkg/R/window.R | 54 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e9080f4/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index bc3aceb..fe52905 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -341,5 +341,5 @@ export("partitionBy", "rowsBetween", "rangeBetween") -export("window.partitionBy", - "window.orderBy") +export("windowPartitionBy", + "windowOrderBy") http://git-wip-us.apache.org/repos/asf/spark/blob/4e9080f4/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 9f3b1e4..e20d05d 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -22,10 +22,10 @@ NULL #' S4 class that represents a WindowSpec #' -#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' WindowSpec can be created by using windowPartitionBy() or windowOrderBy() #' #' @rdname WindowSpec -#' @seealso \link{window.partitionBy}, \link{window.orderBy} +#' @seealso \link{windowPartitionBy}, \link{windowOrderBy} #' #' @param sws A Java object reference to the backing Scala WindowSpec #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/4e9080f4/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index df057bd..8416e5c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -779,13 +779,13 @@ setGeneric("rowsBetween", function(x, start, end) { standardGeneric("rowsBetween #' @export setGeneric("rangeBetween", function(x, start, end) { standardGeneric("rangeBetween") }) -#' @rdname window.partitionBy +#' @rdname windowPartitionBy #' @export -setGeneric("window.partitionBy", function(col, ...) { standardGeneric("window.partitionBy") }) +setGeneric("windowPartitionBy", function(col, ...) { standardGeneric("windowPartitionBy") }) -#' @rdname window.orderBy +#' @rdname windowOrderBy #' @export -setGeneric("window.orderBy", function(col, ...) { standardGeneric("window.orderBy") }) +setGeneric("windowOrderBy", function(col, ...) 
{ standardGeneric("windowOrderBy") }) ## Expression Function Methods ## http://git-wip-us.apache.org/repos/asf/spark/blob/4e9080f4/R/pkg/R/window.R -- diff --git a/R/pkg/R/window.R b/R/pkg/R/window.R index e4bc933..d9d069c 100644 --- a/R/pkg/R/window.R +++ b/R/pkg/R/window.R @@ -17,23 +17,28 @@ # window.R - Utility functions for defining window in DataFrames -#' window.partitionBy +#' windowPartitionBy #' #' Creates a WindowSpec with the partitioning defined. #' -#' @rdname window.partitionBy -#' @name window.partitionBy +#' @param col A column name or Column by which rows are partitioned to +#'windows. +#' @param ... Optional column names or Columns in addition to col, by +#'which rows are partitioned to windows. +#' +#' @rdname windowPartitionBy +#
spark git commit: [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions
Repository: spark Updated Branches: refs/heads/master 093ebbc62 -> 12005c88f [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions ## What changes were proposed in this pull request? Fix function routing to work with and without namespace operator `SparkR::createDataFrame` ## How was this patch tested? manual, unit tests shivaram Author: Felix Cheung Closes #14195 from felixcheung/rroutedefault. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/12005c88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/12005c88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/12005c88 Branch: refs/heads/master Commit: 12005c88fb24168d57b577cff73eddcd9d8963fc Parents: 093ebbc Author: Felix Cheung Authored: Thu Jul 14 09:45:30 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:45:30 2016 -0700 -- R/pkg/R/SQLContext.R | 4 +++- R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/12005c88/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index bc0daa2..d2ea155 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -48,7 +48,9 @@ getInternalType <- function(x) { #' @return whatever the target returns #' @noRd dispatchFunc <- function(newFuncSig, x, ...) { - funcName <- as.character(sys.call(sys.parent())[[1]]) + # When called with SparkR::createDataFrame, sys.call()[[1]] returns c(::, SparkR, createDataFrame) + callsite <- as.character(sys.call(sys.parent())[[1]]) + funcName <- callsite[[length(callsite)]] f <- get(paste0(funcName, ".default")) # Strip sqlContext from list of parameters and then pass the rest along. contextNames <- c("org.apache.spark.sql.SQLContext", http://git-wip-us.apache.org/repos/asf/spark/blob/12005c88/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 8786823..a1b1f1c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2405,7 +2405,8 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { a <- 1:3 b <- c("a", "b", "c") ldf <- data.frame(a, b) - df <- suppressWarnings(createDataFrame(sqlContext, ldf)) + # Call function with namespace :: operator - SPARK-16538 + df <- suppressWarnings(SparkR::createDataFrame(sqlContext, ldf)) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) expect_equal(count(df), 3)
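A plain-R illustration of why the one-line fix works: `sys.call()` on a `::`-qualified call yields a three-element call object, so the function name is always the last element. This is a standalone sketch, independent of SparkR internals:

```r
# Qualified call: the call object is `::`(SparkR, createDataFrame)
callsite <- as.character(quote(SparkR::createDataFrame))
callsite                      # c("::", "SparkR", "createDataFrame")
callsite[[length(callsite)]]  # "createDataFrame"

# Unqualified call: a one-element character vector
callsite <- as.character(quote(createDataFrame))
callsite[[length(callsite)]]  # "createDataFrame" in both cases
```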
spark git commit: [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions
Repository: spark Updated Branches: refs/heads/branch-2.0 4e9080f44 -> 29281bc40 [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions ## What changes were proposed in this pull request? Fix function routing to work with and without namespace operator `SparkR::createDataFrame` ## How was this patch tested? manual, unit tests shivaram Author: Felix Cheung Closes #14195 from felixcheung/rroutedefault. (cherry picked from commit 12005c88fb24168d57b577cff73eddcd9d8963fc) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29281bc4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29281bc4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29281bc4 Branch: refs/heads/branch-2.0 Commit: 29281bc40cb83ce2946b0395981c8dce5630910c Parents: 4e9080f Author: Felix Cheung Authored: Thu Jul 14 09:45:30 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:45:39 2016 -0700 -- R/pkg/R/SQLContext.R | 4 +++- R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29281bc4/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index bc0daa2..d2ea155 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -48,7 +48,9 @@ getInternalType <- function(x) { #' @return whatever the target returns #' @noRd dispatchFunc <- function(newFuncSig, x, ...) { - funcName <- as.character(sys.call(sys.parent())[[1]]) + # When called with SparkR::createDataFrame, sys.call()[[1]] returns c(::, SparkR, createDataFrame) + callsite <- as.character(sys.call(sys.parent())[[1]]) + funcName <- callsite[[length(callsite)]] f <- get(paste0(funcName, ".default")) # Strip sqlContext from list of parameters and then pass the rest along. contextNames <- c("org.apache.spark.sql.SQLContext", http://git-wip-us.apache.org/repos/asf/spark/blob/29281bc4/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 1bfdc34..20c750a 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2397,7 +2397,8 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { a <- 1:3 b <- c("a", "b", "c") ldf <- data.frame(a, b) - df <- suppressWarnings(createDataFrame(sqlContext, ldf)) + # Call function with namespace :: operator - SPARK-16538 + df <- suppressWarnings(SparkR::createDataFrame(sqlContext, ldf)) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) expect_equal(count(df), 3) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16509][SPARKR] Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy.
Repository: spark Updated Branches: refs/heads/master 56183b84f -> 093ebbc62 [SPARK-16509][SPARKR] Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy. ## What changes were proposed in this pull request? Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy to pass CRAN package check. ## How was this patch tested? SparkR unit tests. Author: Sun Rui Closes #14192 from sun-rui/SPARK-16509. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/093ebbc6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/093ebbc6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/093ebbc6 Branch: refs/heads/master Commit: 093ebbc628699b40f091b5b7083c119fffa9314b Parents: 56183b8 Author: Sun Rui Authored: Thu Jul 14 09:38:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:38:42 2016 -0700 -- R/pkg/NAMESPACE | 4 +- R/pkg/R/WindowSpec.R | 4 +- R/pkg/R/generics.R| 8 ++-- R/pkg/R/window.R | 54 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index bc3aceb..fe52905 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -341,5 +341,5 @@ export("partitionBy", "rowsBetween", "rangeBetween") -export("window.partitionBy", - "window.orderBy") +export("windowPartitionBy", + "windowOrderBy") http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 9f3b1e4..e20d05d 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -22,10 +22,10 @@ NULL #' S4 class that represents a WindowSpec #' -#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' WindowSpec can be created by using windowPartitionBy() or windowOrderBy() #' #' @rdname WindowSpec -#' @seealso \link{window.partitionBy}, \link{window.orderBy} +#' @seealso \link{windowPartitionBy}, \link{windowOrderBy} #' #' @param sws A Java object reference to the backing Scala WindowSpec #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index df057bd..8416e5c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -779,13 +779,13 @@ setGeneric("rowsBetween", function(x, start, end) { standardGeneric("rowsBetween #' @export setGeneric("rangeBetween", function(x, start, end) { standardGeneric("rangeBetween") }) -#' @rdname window.partitionBy +#' @rdname windowPartitionBy #' @export -setGeneric("window.partitionBy", function(col, ...) { standardGeneric("window.partitionBy") }) +setGeneric("windowPartitionBy", function(col, ...) { standardGeneric("windowPartitionBy") }) -#' @rdname window.orderBy +#' @rdname windowOrderBy #' @export -setGeneric("window.orderBy", function(col, ...) { standardGeneric("window.orderBy") }) +setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy") }) ## Expression Function Methods ## http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/R/window.R -- diff --git a/R/pkg/R/window.R b/R/pkg/R/window.R index e4bc933..d9d069c 100644 --- a/R/pkg/R/window.R +++ b/R/pkg/R/window.R @@ -17,23 +17,28 @@ # window.R - Utility functions for defining window in DataFrames -#' window.partitionBy +#' windowPartitionBy #' #' Creates a WindowSpec with the partitioning defined. 
#' -#' @rdname window.partitionBy -#' @name window.partitionBy +#' @param col A column name or Column by which rows are partitioned to +#'windows. +#' @param ... Optional column names or Columns in addition to col, by +#'which rows are partitioned to windows. +#' +#' @rdname windowPartitionBy +#' @name windowPartitionBy #' @export #' @examples #' \dontrun{ -#' ws <- window.partitio
spark git commit: [SPARK-16538][SPARKR] Add more tests for namespace call to SparkSession functions
Repository: spark Updated Branches: refs/heads/master 5ffd5d383 -> 611a8ca58 [SPARK-16538][SPARKR] Add more tests for namespace call to SparkSession functions ## What changes were proposed in this pull request? More tests I don't think this is critical for Spark 2.0.0 RC, maybe Spark 2.0.1 or 2.1.0. ## How was this patch tested? unit tests shivaram dongjoon-hyun Author: Felix Cheung Closes #14206 from felixcheung/rroutetests. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/611a8ca5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/611a8ca5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/611a8ca5 Branch: refs/heads/master Commit: 611a8ca5895357059f1e7c035d946e0718b26a5a Parents: 5ffd5d3 Author: Felix Cheung Authored: Fri Jul 15 13:58:57 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 15 13:58:57 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/611a8ca5/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index a1b1f1c..f275284 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2424,6 +2424,13 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { before <- suppressWarnings(createDataFrame(sqlContext, iris)) after <- suppressWarnings(createDataFrame(iris)) expect_equal(collect(before), collect(after)) + + # more tests for SPARK-16538 + createOrReplaceTempView(df, "table") + SparkR::tables() + SparkR::sql("SELECT 1") + suppressWarnings(SparkR::sql(sqlContext, "SELECT * FROM table")) + suppressWarnings(SparkR::dropTempTable(sqlContext, "table")) }) test_that("randomSplit", {
spark git commit: [SPARK-16538][SPARKR] Add more tests for namespace call to SparkSession functions
Repository: spark Updated Branches: refs/heads/branch-2.0 90686abbd -> e833c906f [SPARK-16538][SPARKR] Add more tests for namespace call to SparkSession functions ## What changes were proposed in this pull request? More tests I don't think this is critical for Spark 2.0.0 RC, maybe Spark 2.0.1 or 2.1.0. ## How was this patch tested? unit tests shivaram dongjoon-hyun Author: Felix Cheung Closes #14206 from felixcheung/rroutetests. (cherry picked from commit 611a8ca5895357059f1e7c035d946e0718b26a5a) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e833c906 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e833c906 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e833c906 Branch: refs/heads/branch-2.0 Commit: e833c906f4f3e6b82a1fc03f69cf94de06d85c61 Parents: 90686ab Author: Felix Cheung Authored: Fri Jul 15 13:58:57 2016 -0700 Committer: Shivaram Venkataraman Committed: Fri Jul 15 13:59:05 2016 -0700 -- R/pkg/inst/tests/testthat/test_sparkSQL.R | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e833c906/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 20c750a..7e59fdf 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2416,6 +2416,13 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { before <- suppressWarnings(createDataFrame(sqlContext, iris)) after <- suppressWarnings(createDataFrame(iris)) expect_equal(collect(before), collect(after)) + + # more tests for SPARK-16538 + createOrReplaceTempView(df, "table") + SparkR::tables() + SparkR::sql("SELECT 1") + suppressWarnings(SparkR::sql(sqlContext, "SELECT * FROM table")) + suppressWarnings(SparkR::dropTempTable(sqlContext, "table")) }) test_that("randomSplit", {
spark git commit: [SPARK-16112][SPARKR] Programming guide for gapply/gapplyCollect
Repository: spark Updated Branches: refs/heads/branch-2.0 cad4693f9 -> 8c2ec443b [SPARK-16112][SPARKR] Programming guide for gapply/gapplyCollect ## What changes were proposed in this pull request? Updates programming guide for spark.gapply/spark.gapplyCollect. Similar to other examples I used `faithful` dataset to demonstrate gapply's functionality. Please, let me know if you prefer another example. ## How was this patch tested? Existing test cases in R Author: Narine Kokhlikyan Closes #14090 from NarineK/gapplyProgGuide. (cherry picked from commit 416730483643a0a92dbd6ae4ad07e80ceb3c5285) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c2ec443 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c2ec443 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c2ec443 Branch: refs/heads/branch-2.0 Commit: 8c2ec443baaf84a7bc008c0fdcddc039a65ad948 Parents: cad4693 Author: Narine Kokhlikyan Authored: Sat Jul 16 16:56:16 2016 -0700 Committer: Shivaram Venkataraman Committed: Sat Jul 16 16:56:24 2016 -0700 -- docs/sparkr.md | 138 ++-- 1 file changed, 134 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8c2ec443/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 9fda0ec..a5235b2 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -272,11 +272,11 @@ In SparkR, we support several kinds of User-Defined Functions: # dapply Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` -and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function -should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match to [data types](#data-type-mapping-between-r-and-spark) of returned value. {% highlight r %} + # Convert waiting time from hours to seconds. # Note that we can apply UDF to DataFrame. schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), @@ -295,8 +295,8 @@ head(collect(df1)) # dapplyCollect Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function -should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the -output of UDF run on all the partitions can fit in driver memory. +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. + {% highlight r %} @@ -316,6 +316,136 @@ head(ldf, 3) {% endhighlight %} + Run a given function on a large dataset grouping by input column(s) and using `gapply` or `gapplyCollect` + +# gapply +Apply a function to each group of a `SparkDataFrame`. The function is to be applied to each group of the `SparkDataFrame` and should have only two parameters: grouping key and R `data.frame` corresponding to +that key. The groups are chosen from `SparkDataFrame`s column(s). +The output of function should be a `data.frame`. 
Schema specifies the row format of the resulting +`SparkDataFrame`. It must represent R function's output schema on the basis of Spark data types. The column names of the returned `data.frame` are set by user. Below is the data type mapping between R +and Spark. + + Data type mapping between R and Spark + +RSpark + + byte + byte + + + integer + integer + + + float + float + + + double + double + + + numeric + double + + + character + string + + + string + string + + + binary + binary + + + raw + binary + + + logical + boolean + + + https://stat.ethz.ch/R-manual/R-devel/library/base/html/DateTimeClasses.html";>POSIXct + timestamp + + + https://stat.ethz.ch/R-manual/R-devel/library/base/html/DateTimeClasses.html";>POSIXlt + timestamp + + + https://stat.ethz.ch/R-manual/R-devel/library/base/html/Dates.html";>Date + date + + + array + array + + + list + array + + + env + map + + + + +{% highlight r %} + +# Determine six waiting times with the largest eruption time in minutes. +schema <- structType(structField("waiting&q
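The `gapply` example in the diff above is cut off mid-call; a sketch of the complete pattern — grouping by the `waiting` column and reducing each group to its maximum eruption time. The grouping-function body is a plausible completion, not quoted from the commit, and assumes an active SparkR session:

```r
df <- createDataFrame(faithful)  # the guide's running example dataset

schema <- structType(structField("waiting", "double"),
                     structField("max_eruption", "double"))
result <- gapply(
  df,
  "waiting",
  function(key, x) {
    # One data.frame row per group: the key plus the group's max eruption time
    y <- data.frame(key, max(x$eruptions))
  },
  schema)
head(collect(arrange(result, "max_eruption", decreasing = TRUE)))
```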
spark git commit: [SPARK-16112][SPARKR] Programming guide for gapply/gapplyCollect
Repository: spark Updated Branches: refs/heads/master 5ec0d692b -> 416730483 [SPARK-16112][SPARKR] Programming guide for gapply/gapplyCollect ## What changes were proposed in this pull request? Updates programming guide for spark.gapply/spark.gapplyCollect. Similar to other examples I used `faithful` dataset to demonstrate gapply's functionality. Please, let me know if you prefer another example. ## How was this patch tested? Existing test cases in R Author: Narine Kokhlikyan Closes #14090 from NarineK/gapplyProgGuide. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/41673048 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/41673048 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/41673048 Branch: refs/heads/master Commit: 416730483643a0a92dbd6ae4ad07e80ceb3c5285 Parents: 5ec0d69 Author: Narine Kokhlikyan Authored: Sat Jul 16 16:56:16 2016 -0700 Committer: Shivaram Venkataraman Committed: Sat Jul 16 16:56:16 2016 -0700 -- docs/sparkr.md | 138 ++-- 1 file changed, 134 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/41673048/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 9fda0ec..a5235b2 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -272,11 +272,11 @@ In SparkR, we support several kinds of User-Defined Functions: # dapply Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` -and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function -should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match to [data types](#data-type-mapping-between-r-and-spark) of returned value. {% highlight r %} + # Convert waiting time from hours to seconds. # Note that we can apply UDF to DataFrame. schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), @@ -295,8 +295,8 @@ head(collect(df1)) # dapplyCollect Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function -should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the -output of UDF run on all the partitions can fit in driver memory. +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. + {% highlight r %} @@ -316,6 +316,136 @@ head(ldf, 3) {% endhighlight %} + Run a given function on a large dataset grouping by input column(s) and using `gapply` or `gapplyCollect` + +# gapply +Apply a function to each group of a `SparkDataFrame`. The function is to be applied to each group of the `SparkDataFrame` and should have only two parameters: grouping key and R `data.frame` corresponding to +that key. The groups are chosen from `SparkDataFrame`s column(s). +The output of function should be a `data.frame`. Schema specifies the row format of the resulting +`SparkDataFrame`. It must represent R function's output schema on the basis of Spark data types. 
The column names of the returned `data.frame` are set by user. Below is the data type mapping between R +and Spark. + + Data type mapping between R and Spark + +RSpark + + byte + byte + + + integer + integer + + + float + float + + + double + double + + + numeric + double + + + character + string + + + string + string + + + binary + binary + + + raw + binary + + + logical + boolean + + + https://stat.ethz.ch/R-manual/R-devel/library/base/html/DateTimeClasses.html";>POSIXct + timestamp + + + https://stat.ethz.ch/R-manual/R-devel/library/base/html/DateTimeClasses.html";>POSIXlt + timestamp + + + https://stat.ethz.ch/R-manual/R-devel/library/base/html/Dates.html";>Date + date + + + array + array + + + list + array + + + env + map + + + + +{% highlight r %} + +# Determine six waiting times with the largest eruption time in minutes. +schema <- structType(structField("waiting", "double"), structField("max_eruption", "double")) +result <- gapply( +df,
[2/2] spark git commit: [SPARK-16507][SPARKR] Add a CRAN checker, fix Rd aliases
[SPARK-16507][SPARKR] Add a CRAN checker, fix Rd aliases ## What changes were proposed in this pull request? Add a check-cran.sh script that runs `R CMD check` as CRAN. Also fixes a number of issues pointed out by the check. These include - Updating `DESCRIPTION` to be appropriate - Adding a .Rbuildignore to ignore lintr, src-native, html that are non-standard files / dirs - Adding aliases to all S4 methods in DataFrame, Column, GroupedData etc. This is required as stated in https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Documenting-S4-classes-and-methods - Other minor fixes ## How was this patch tested? SparkR unit tests, running the above mentioned script Author: Shivaram Venkataraman Closes #14173 from shivaram/sparkr-cran-changes. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c33e4b0d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c33e4b0d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c33e4b0d Branch: refs/heads/master Commit: c33e4b0d96d424568963c7e716c20f02949c72d1 Parents: 4167304 Author: Shivaram Venkataraman Authored: Sat Jul 16 17:06:44 2016 -0700 Committer: Shivaram Venkataraman Committed: Sat Jul 16 17:06:44 2016 -0700 -- R/check-cran.sh | 52 ++ R/pkg/.Rbuildignore | 5 + R/pkg/DESCRIPTION| 8 +- R/pkg/NAMESPACE | 9 + R/pkg/R/DataFrame.R | 105 ++- R/pkg/R/SQLContext.R | 3 + R/pkg/R/WindowSpec.R | 6 + R/pkg/R/column.R | 27 ++- R/pkg/R/functions.R | 436 +- R/pkg/R/generics.R | 8 +- R/pkg/R/group.R | 9 +- R/pkg/R/mllib.R | 4 +- R/pkg/R/schema.R | 16 +- R/pkg/R/stats.R | 6 + R/pkg/R/utils.R | 20 +-- R/pkg/R/window.R | 4 + dev/.rat-excludes| 1 + 17 files changed, 676 insertions(+), 43 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c33e4b0d/R/check-cran.sh -- diff --git a/R/check-cran.sh b/R/check-cran.sh new file mode 100755 index 000..b3a6860 --- /dev/null +++ b/R/check-cran.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -o pipefail +set -e + +FWDIR="$(cd `dirname $0`; pwd)" +pushd $FWDIR > /dev/null + +if [ ! -z "$R_HOME" ] + then +R_SCRIPT_PATH="$R_HOME/bin" + else +# if system wide R_HOME is not found, then exit +if [ ! `command -v R` ]; then + echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed." + exit 1 +fi +R_SCRIPT_PATH="$(dirname $(which R))" +fi +echo "USING R_HOME = $R_HOME" + +# Build the latest docs +$FWDIR/create-docs.sh + +# Build a zip file containing the source package +"$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg + +# Run check as-cran. 
+# TODO(shivaram): Remove the skip tests once we figure out the install mechanism + +VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` + +"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz + +popd > /dev/null http://git-wip-us.apache.org/repos/asf/spark/blob/c33e4b0d/R/pkg/.Rbuildignore -- diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore new file mode 100644 index 000..544d203 --- /dev/null +++ b/R/pkg/.Rbuildignore @@ -0,0 +1,5 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.lintr$ +^src-native$ +^html$ http://git-wip-us.apache.org/repos/asf/spark/blob/c33e4b0d/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 963a1bb..ac73d6c 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,12 +1,10 @@ Package: SparkR Type: Package -Title: R frontend for Spark +Title: R Frontend for Apache Spark Version: 2.0.0 -Date: 2013-09-09 +Date: 2016-07-07 Author:
[1/2] spark git commit: [SPARK-16507][SPARKR] Add a CRAN checker, fix Rd aliases
Repository: spark Updated Branches: refs/heads/master 416730483 -> c33e4b0d9 http://git-wip-us.apache.org/repos/asf/spark/blob/c33e4b0d/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 8416e5c..e7444ac 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -51,7 +51,7 @@ setGeneric("collectPartition", standardGeneric("collectPartition") }) -# @rdname count +# @rdname nrow # @export setGeneric("count", function(x) { standardGeneric("count") }) @@ -395,7 +395,7 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) SparkDataFrame Methods -#' @rdname agg +#' @rdname summarize #' @export setGeneric("agg", function (x, ...) { standardGeneric("agg") }) @@ -654,7 +654,7 @@ setGeneric("showDF", function(x, ...) { standardGeneric("showDF") }) # @export setGeneric("subset", function(x, ...) { standardGeneric("subset") }) -#' @rdname agg +#' @rdname summarize #' @export setGeneric("summarize", function(x, ...) { standardGeneric("summarize") }) @@ -1022,7 +1022,7 @@ setGeneric("month", function(x) { standardGeneric("month") }) #' @export setGeneric("months_between", function(y, x) { standardGeneric("months_between") }) -#' @rdname count +#' @rdname nrow #' @export setGeneric("n", function(x) { standardGeneric("n") }) http://git-wip-us.apache.org/repos/asf/spark/blob/c33e4b0d/R/pkg/R/group.R -- diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 5ed7e8a..85348ae 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -47,6 +47,7 @@ groupedData <- function(sgd) { #' @rdname show +#' @aliases show,GroupedData-method #' @note show(GroupedData) since 1.4.0 setMethod("show", "GroupedData", function(object) { @@ -61,6 +62,7 @@ setMethod("show", "GroupedData", #' @param x a GroupedData #' @return a SparkDataFrame #' @rdname count +#' @aliases count,GroupedData-method #' @export #' @examples #' \dontrun{ @@ -84,6 +86,7 @@ setMethod("count", #' @param x a GroupedData #' @return a SparkDataFrame #' @rdname summarize +#' @aliases agg,GroupedData-method #' @name agg #' @family agg_funcs #' @export @@ -121,6 +124,7 @@ setMethod("agg", #' @rdname summarize #' @name summarize +#' @aliases summarize,GroupedData-method #' @note summarize since 1.4.0 setMethod("summarize", signature(x = "GroupedData"), @@ -146,6 +150,7 @@ methods <- c("avg", "max", "mean", "min", "sum") #' @param values A value or a list/vector of distinct values for the output columns. #' @return GroupedData object #' @rdname pivot +#' @aliases pivot,GroupedData,character-method #' @name pivot #' @export #' @examples @@ -198,6 +203,7 @@ createMethods() #' #' @param x A GroupedData #' @rdname gapply +#' @aliases gapply,GroupedData-method #' @name gapply #' @export #' @note gapply(GroupedData) since 2.0.0 @@ -212,6 +218,7 @@ setMethod("gapply", #' #' @param x A GroupedData #' @rdname gapplyCollect +#' @aliases gapplyCollect,GroupedData-method #' @name gapplyCollect #' @export #' @note gapplyCollect(GroupedData) since 2.0.0 @@ -243,4 +250,4 @@ gapplyInternal <- function(x, func, schema) { broadcastArr, if (class(schema) == "structType") { schema$jobj } else { NULL }) dataFrame(sdf) -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/spark/blob/c33e4b0d/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 94e1f65..50c601f 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -91,6 +91,7 @@ NULL #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. 
#' @param maxIter Integer giving the maximal number of IRLS iterations. +#' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm #' @name spark.glm @@ -306,6 +307,7 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' @param initMode The initialization algorithm choosen to fit the model #' @return \code{spark.kmeans} returns a fitted k-means model #' @rdname spark.kmeans +#' @aliases spark.kmeans,SparkDataFrame,formula-method #' @name spark.kmeans #' @export #' @examples @@ -418,6 +420,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' @param smoothing Smoothing parameter #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model #' @rdname spark.naiveBayes +#' @aliases spark.naiveBayes,SparkDataFrame,formula-method #' @name spark.naiveBayes #' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/} #' @export @@ -512,7 +515,6 @@ setMethod("write.ml", signature(object = "GeneralizedLinea
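For context, the alias convention this change applies, sketched on a single method: CRAN's documentation check wants each S4 method to declare a `name,signature-method` alias. The method below is a hypothetical rendering of the pattern, not a line from the diff:

```r
#' Returns the number of rows in a SparkDataFrame
#'
#' @rdname nrow
#' @aliases count,SparkDataFrame-method
#' @export
#' @note count since 1.4.0
setMethod("count",
          signature(x = "SparkDataFrame"),
          function(x) {
            # Delegate to the backing Scala DataFrame's count()
            callJMethod(x@sdf, "count")
          })
```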
[1/2] spark git commit: [SPARK-16507][SPARKR] Add a CRAN checker, fix Rd aliases
Repository: spark Updated Branches: refs/heads/branch-2.0 8c2ec443b -> c527e9ed4 http://git-wip-us.apache.org/repos/asf/spark/blob/c527e9ed/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 8416e5c..e7444ac 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -51,7 +51,7 @@ setGeneric("collectPartition", standardGeneric("collectPartition") }) -# @rdname count +# @rdname nrow # @export setGeneric("count", function(x) { standardGeneric("count") }) @@ -395,7 +395,7 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) SparkDataFrame Methods -#' @rdname agg +#' @rdname summarize #' @export setGeneric("agg", function (x, ...) { standardGeneric("agg") }) @@ -654,7 +654,7 @@ setGeneric("showDF", function(x, ...) { standardGeneric("showDF") }) # @export setGeneric("subset", function(x, ...) { standardGeneric("subset") }) -#' @rdname agg +#' @rdname summarize #' @export setGeneric("summarize", function(x, ...) { standardGeneric("summarize") }) @@ -1022,7 +1022,7 @@ setGeneric("month", function(x) { standardGeneric("month") }) #' @export setGeneric("months_between", function(y, x) { standardGeneric("months_between") }) -#' @rdname count +#' @rdname nrow #' @export setGeneric("n", function(x) { standardGeneric("n") }) http://git-wip-us.apache.org/repos/asf/spark/blob/c527e9ed/R/pkg/R/group.R -- diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 5ed7e8a..85348ae 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -47,6 +47,7 @@ groupedData <- function(sgd) { #' @rdname show +#' @aliases show,GroupedData-method #' @note show(GroupedData) since 1.4.0 setMethod("show", "GroupedData", function(object) { @@ -61,6 +62,7 @@ setMethod("show", "GroupedData", #' @param x a GroupedData #' @return a SparkDataFrame #' @rdname count +#' @aliases count,GroupedData-method #' @export #' @examples #' \dontrun{ @@ -84,6 +86,7 @@ setMethod("count", #' @param x a GroupedData #' @return a SparkDataFrame #' @rdname summarize +#' @aliases agg,GroupedData-method #' @name agg #' @family agg_funcs #' @export @@ -121,6 +124,7 @@ setMethod("agg", #' @rdname summarize #' @name summarize +#' @aliases summarize,GroupedData-method #' @note summarize since 1.4.0 setMethod("summarize", signature(x = "GroupedData"), @@ -146,6 +150,7 @@ methods <- c("avg", "max", "mean", "min", "sum") #' @param values A value or a list/vector of distinct values for the output columns. #' @return GroupedData object #' @rdname pivot +#' @aliases pivot,GroupedData,character-method #' @name pivot #' @export #' @examples @@ -198,6 +203,7 @@ createMethods() #' #' @param x A GroupedData #' @rdname gapply +#' @aliases gapply,GroupedData-method #' @name gapply #' @export #' @note gapply(GroupedData) since 2.0.0 @@ -212,6 +218,7 @@ setMethod("gapply", #' #' @param x A GroupedData #' @rdname gapplyCollect +#' @aliases gapplyCollect,GroupedData-method #' @name gapplyCollect #' @export #' @note gapplyCollect(GroupedData) since 2.0.0 @@ -243,4 +250,4 @@ gapplyInternal <- function(x, func, schema) { broadcastArr, if (class(schema) == "structType") { schema$jobj } else { NULL }) dataFrame(sdf) -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/spark/blob/c527e9ed/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 94e1f65..50c601f 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -91,6 +91,7 @@ NULL #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. 
#' @param maxIter Integer giving the maximal number of IRLS iterations. +#' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm #' @name spark.glm @@ -306,6 +307,7 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' @param initMode The initialization algorithm choosen to fit the model #' @return \code{spark.kmeans} returns a fitted k-means model #' @rdname spark.kmeans +#' @aliases spark.kmeans,SparkDataFrame,formula-method #' @name spark.kmeans #' @export #' @examples @@ -418,6 +420,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' @param smoothing Smoothing parameter #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model #' @rdname spark.naiveBayes +#' @aliases spark.naiveBayes,SparkDataFrame,formula-method #' @name spark.naiveBayes #' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/} #' @export @@ -512,7 +515,6 @@ setMethod("write.ml", signature(object = "GeneralizedL
[2/2] spark git commit: [SPARK-16507][SPARKR] Add a CRAN checker, fix Rd aliases
[SPARK-16507][SPARKR] Add a CRAN checker, fix Rd aliases

## What changes were proposed in this pull request?

Add a check-cran.sh script that runs `R CMD check` as CRAN. Also fixes a number of issues pointed out by the check. These include
- Updating `DESCRIPTION` to be appropriate
- Adding a .Rbuildignore to ignore lintr, src-native, html that are non-standard files / dirs
- Adding aliases to all S4 methods in DataFrame, Column, GroupedData etc. This is required as stated in https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Documenting-S4-classes-and-methods
- Other minor fixes

## How was this patch tested?

SparkR unit tests, running the above mentioned script

Author: Shivaram Venkataraman

Closes #14173 from shivaram/sparkr-cran-changes.

(cherry picked from commit c33e4b0d96d424568963c7e716c20f02949c72d1)
Signed-off-by: Shivaram Venkataraman

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c527e9ed
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c527e9ed
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c527e9ed

Branch: refs/heads/branch-2.0
Commit: c527e9ed4ae3d45e2df3c7bdb1a4d44afca652d7
Parents: 8c2ec44
Author: Shivaram Venkataraman
Authored: Sat Jul 16 17:06:44 2016 -0700
Committer: Shivaram Venkataraman
Committed: Sat Jul 16 17:07:38 2016 -0700

--
 R/check-cran.sh      |  52 ++
 R/pkg/.Rbuildignore  |   5 +
 R/pkg/DESCRIPTION    |   8 +-
 R/pkg/NAMESPACE      |   9 +
 R/pkg/R/DataFrame.R  | 105 ++-
 R/pkg/R/SQLContext.R |   3 +
 R/pkg/R/WindowSpec.R |   6 +
 R/pkg/R/column.R     |  27 ++-
 R/pkg/R/functions.R  | 436 +-
 R/pkg/R/generics.R   |   8 +-
 R/pkg/R/group.R      |   9 +-
 R/pkg/R/mllib.R      |   4 +-
 R/pkg/R/schema.R     |  16 +-
 R/pkg/R/stats.R      |   6 +
 R/pkg/R/utils.R      |  20 +--
 R/pkg/R/window.R     |   4 +
 dev/.rat-excludes    |   1 +
 17 files changed, 676 insertions(+), 43 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c527e9ed/R/check-cran.sh
--
diff --git a/R/check-cran.sh b/R/check-cran.sh
new file mode 100755
index 000..b3a6860
--- /dev/null
+++ b/R/check-cran.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -o pipefail
+set -e
+
+FWDIR="$(cd `dirname $0`; pwd)"
+pushd $FWDIR > /dev/null
+
+if [ ! -z "$R_HOME" ]
+  then
+    R_SCRIPT_PATH="$R_HOME/bin"
+  else
+    # if system wide R_HOME is not found, then exit
+    if [ ! `command -v R` ]; then
+      echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
+      exit 1
+    fi
+    R_SCRIPT_PATH="$(dirname $(which R))"
+fi
+echo "USING R_HOME = $R_HOME"
+
+# Build the latest docs
+$FWDIR/create-docs.sh
+
+# Build a zip file containing the source package
+"$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg
+
+# Run check as-cran.
+# TODO(shivaram): Remove the skip tests once we figure out the install mechanism
+
+VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`
+
+"$R_SCRIPT_PATH/"R CMD check --as-cran --no-tests SparkR_"$VERSION".tar.gz
+
+popd > /dev/null


http://git-wip-us.apache.org/repos/asf/spark/blob/c527e9ed/R/pkg/.Rbuildignore
--
diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore
new file mode 100644
index 000..544d203
--- /dev/null
+++ b/R/pkg/.Rbuildignore
@@ -0,0 +1,5 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^\.lintr$
+^src-native$
+^html$


http://git-wip-us.apache.org/repos/asf/spark/blob/c527e9ed/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 963a1bb..ac73d6c 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,12 +1,10 @@
 Package: SparkR
 Type: Package
-Title: R
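To reproduce the CRAN run locally, the intended entry point is the new script itself, with R on the PATH or R_HOME set (a sketch of the expected invocation, not output from an actual run):

  ./R/check-cran.sh

After building the docs via create-docs.sh, this amounts to roughly the following, executed from the R/ directory:

  R CMD build pkg
  R CMD check --as-cran --no-tests SparkR_<version>.tar.gz

where <version> is the Version field read out of pkg/DESCRIPTION, and --no-tests matches the temporary skip called out in the script's TODO.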