Repository: spark
Updated Branches:
  refs/heads/master 90264aced -> 8a272ddc9


[SPARK-20438][R] SparkR wrappers for split and repeat

## What changes were proposed in this pull request?

Add wrappers for `o.a.s.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <zero...@users.noreply.github.com>

Closes #17729 from zero323/SPARK-20438.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a272ddc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a272ddc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a272ddc

Branch: refs/heads/master
Commit: 8a272ddc9d2359a724aa89ae2f8de121a4aa7ac2
Parents: 90264ac
Author: zero323 <zero...@users.noreply.github.com>
Authored: Mon Apr 24 10:56:57 2017 -0700
Committer: Felix Cheung <felixche...@apache.org>
Committed: Mon Apr 24 10:56:57 2017 -0700

----------------------------------------------------------------------
 R/pkg/NAMESPACE                           |  2 +
 R/pkg/R/functions.R                       | 58 ++++++++++++++++++++++++++
 R/pkg/R/generics.R                        |  8 ++++
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 34 +++++++++++++++
 4 files changed, 102 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8a272ddc/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index e804e30..95d5cc6 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -300,6 +300,7 @@ exportMethods("%in%",
               "rank",
               "regexp_extract",
               "regexp_replace",
+              "repeat_string",
               "reverse",
               "rint",
               "rlike",
@@ -323,6 +324,7 @@ exportMethods("%in%",
               "sort_array",
               "soundex",
               "spark_partition_id",
+              "split_string",
               "stddev",
               "stddev_pop",
               "stddev_samp",

http://git-wip-us.apache.org/repos/asf/spark/blob/8a272ddc/R/pkg/R/functions.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e7decb9..752e4c5 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3745,3 +3745,61 @@ setMethod("collect_set",
             jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", 
x@jc)
             column(jc)
           })
+
+#' split_string
+#'
+#' Splits string on regular expression.
+#'
+#' Equivalent to \code{split} SQL function
+#'
+#' @param x Column to compute on
+#' @param pattern Java regular expression
+#'
+#' @rdname split_string
+#' @family string_funcs
+#' @aliases split_string,Column-method
+#' @export
+#' @examples \dontrun{
+#' df <- read.text("README.md")
+#'
+#' head(select(df, split_string(df$value, "\\s+")))
+#'
+#' # This is equivalent to the following SQL expression
+#' head(selectExpr(df, "split(value, '\\\\s+')"))
+#' }
+#' @note split_string 2.3.0
+setMethod("split_string",
+          signature(x = "Column", pattern = "character"),
+          function(x, pattern) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "split", x@jc, 
pattern)
+            column(jc)
+          })
+
+#' repeat_string
+#'
+#' Repeats string n times.
+#'
+#' Equivalent to \code{repeat} SQL function
+#'
+#' @param x Column to compute on
+#' @param n Number of repetitions
+#'
+#' @rdname repeat_string
+#' @family string_funcs
+#' @aliases repeat_string,Column-method
+#' @export
+#' @examples \dontrun{
+#' df <- read.text("README.md")
+#'
+#' first(select(df, repeat_string(df$value, 3)))
+#'
+#' # This is equivalent to the following SQL expression
+#' first(selectExpr(df, "repeat(value, 3)"))
+#' }
+#' @note repeat_string 2.3.0
+setMethod("repeat_string",
+          signature(x = "Column", n = "numeric"),
+          function(x, n) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "repeat", 
x@jc, numToInt(n))
+            column(jc)
+          })

http://git-wip-us.apache.org/repos/asf/spark/blob/8a272ddc/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 61d248e..5e7a1c6 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { 
standardGeneric("regexp
 setGeneric("regexp_replace",
            function(x, pattern, replacement) { 
standardGeneric("regexp_replace") })
 
+#' @rdname repeat_string
+#' @export
+setGeneric("repeat_string", function(x, n) { standardGeneric("repeat_string") 
})
+
 #' @rdname reverse
 #' @export
 setGeneric("reverse", function(x) { standardGeneric("reverse") })
@@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { 
standardGeneric("skewness") })
 #' @export
 setGeneric("sort_array", function(x, asc = TRUE) { 
standardGeneric("sort_array") })
 
+#' @rdname split_string
+#' @export
+setGeneric("split_string", function(x, pattern) { 
standardGeneric("split_string") })
+
 #' @rdname soundex
 #' @export
 setGeneric("soundex", function(x) { standardGeneric("soundex") })

http://git-wip-us.apache.org/repos/asf/spark/blob/8a272ddc/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index bf2093f..c21ba2f 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1546,6 +1546,40 @@ test_that("string operators", {
   expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], 
"a.b")
   expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], 
"b.c.d")
   expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], 
"a.1.2.d")
+
+  l4 <- list(list(a = "a.b@c.d   1\\b"))
+  df4 <- createDataFrame(l4)
+  expect_equal(
+    collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
+    list(list("a.b@c.d", "1\\b"))
+  )
+  expect_equal(
+    collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
+    list(list("a", "b@c", "d   1\\b"))
+  )
+  expect_equal(
+    collect(select(df4, split_string(df4$a, "@")))[1, 1],
+    list(list("a.b", "c.d   1\\b"))
+  )
+  expect_equal(
+    collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
+    list(list("a.b@c.d   1", "b"))
+  )
+
+  l5 <- list(list(a = "abc"))
+  df5 <- createDataFrame(l5)
+  expect_equal(
+    collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
+    "abc"
+  )
+  expect_equal(
+    collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
+    "abcabcabc"
+  )
+  expect_equal(
+    collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
+    ""
+  )
 })
 
 test_that("date functions on a DataFrame", {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to