Repository: spark Updated Branches: refs/heads/master 93df3cd03 -> e9efb62e0
[SPARK-24187][R][SQL] Add array_join function to SparkR ## What changes were proposed in this pull request? This PR adds array_join function to SparkR ## How was this patch tested? Add unit test in test_sparkSQL.R Author: Huaxin Gao <huax...@us.ibm.com> Closes #21313 from huaxingao/spark-24187. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9efb62e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9efb62e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9efb62e Branch: refs/heads/master Commit: e9efb62e0795c8d5233b7e5bfc276d74953942b8 Parents: 93df3cd Author: Huaxin Gao <huax...@us.ibm.com> Authored: Wed Jun 6 08:31:35 2018 +0700 Committer: hyukjinkwon <gurwls...@apache.org> Committed: Wed Jun 6 08:31:35 2018 +0700 ---------------------------------------------------------------------- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 29 ++++++++++++++++++++++++++--- R/pkg/R/generics.R | 4 ++++ R/pkg/tests/fulltests/test_sparkSQL.R | 15 +++++++++++++++ 4 files changed, 46 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/e9efb62e/R/pkg/NAMESPACE ---------------------------------------------------------------------- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 73a33af..9696f69 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -201,6 +201,7 @@ exportMethods("%<=>%", "approxCountDistinct", "approxQuantile", "array_contains", + "array_join", "array_max", "array_min", "array_position", http://git-wip-us.apache.org/repos/asf/spark/blob/e9efb62e/R/pkg/R/functions.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index abc91ae..3bff633 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -221,7 +221,9 @@ NULL #' head(select(tmp3, element_at(tmp3$v3, "Valiant"))) #' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp)) #' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5))) -#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))} +#' head(select(tmp, concat(df$mpg, df$cyl, df$hp))) +#' tmp5 <- mutate(df, v6 = create_array(df$model, df$model)) +#' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))} NULL #' Window functions for Column operations @@ -3007,6 +3009,27 @@ setMethod("array_contains", }) #' @details +#' \code{array_join}: Concatenates the elements of column using the delimiter. +#' Null values are replaced with nullReplacement if set, otherwise they are ignored. +#' +#' @param delimiter a character string that is used to concatenate the elements of column. +#' @param nullReplacement an optional character string that is used to replace the Null values. +#' @rdname column_collection_functions +#' @aliases array_join array_join,Column-method +#' @note array_join since 2.4.0 +setMethod("array_join", + signature(x = "Column", delimiter = "character"), + function(x, delimiter, nullReplacement = NULL) { + jc <- if (is.null(nullReplacement)) { + callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter) + } else { + callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter, + as.character(nullReplacement)) + } + column(jc) + }) + +#' @details #' \code{array_max}: Returns the maximum value of the array. #' #' @rdname column_collection_functions @@ -3197,8 +3220,8 @@ setMethod("size", #' (or starting from the end if start is negative) with the specified length. #' #' @rdname column_collection_functions -#' @param start an index indicating the first element occuring in the result. -#' @param length a number of consecutive elements choosen to the result. +#' @param start an index indicating the first element occurring in the result. +#' @param length a number of consecutive elements chosen to the result. #' @aliases slice slice,Column-method #' @note slice since 2.4.0 setMethod("slice", http://git-wip-us.apache.org/repos/asf/spark/blob/e9efb62e/R/pkg/R/generics.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 8894cb1..9321bba 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -759,6 +759,10 @@ setGeneric("array_contains", function(x, value) { standardGeneric("array_contain #' @rdname column_collection_functions #' @name NULL +setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") }) + +#' @rdname column_collection_functions +#' @name NULL setGeneric("array_max", function(x) { standardGeneric("array_max") }) #' @rdname column_collection_functions http://git-wip-us.apache.org/repos/asf/spark/blob/e9efb62e/R/pkg/tests/fulltests/test_sparkSQL.R ---------------------------------------------------------------------- diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 16c1fd5..36e0f78 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1518,6 +1518,21 @@ test_that("column functions", { result <- collect(select(df, arrays_overlap(df[[1]], df[[2]])))[[1]] expect_equal(result, c(TRUE, FALSE, NA)) + # Test array_join() + df <- createDataFrame(list(list(list("Hello", "World!")))) + result <- collect(select(df, array_join(df[[1]], "#")))[[1]] + expect_equal(result, "Hello#World!") + df2 <- createDataFrame(list(list(list("Hello", NA, "World!")))) + result <- collect(select(df2, array_join(df2[[1]], "#", "Beautiful")))[[1]] + expect_equal(result, "Hello#Beautiful#World!") + result <- collect(select(df2, array_join(df2[[1]], "#")))[[1]] + expect_equal(result, "Hello#World!") + df3 <- createDataFrame(list(list(list("Hello", NULL, "World!")))) + result <- collect(select(df3, array_join(df3[[1]], "#", "Beautiful")))[[1]] + expect_equal(result, "Hello#Beautiful#World!") + result <- collect(select(df3, array_join(df3[[1]], "#")))[[1]] + expect_equal(result, "Hello#World!") + # Test array_sort() and sort_array() df <- createDataFrame(list(list(list(2L, 1L, 3L, NA)), list(list(NA, 6L, 5L, NA, 4L)))) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org