Repository: spark
Updated Branches:
  refs/heads/branch-2.0 591bf7909 -> aeda9a153


[SPARK-16096][SPARKR] add union and deprecate unionAll

## What changes were proposed in this pull request?

Add `union` and deprecate `unionAll`; give `rbind` a separate roxygen2 doc (since their usage and parameter lists are quite different).

`explode` is also deprecated, but its replacement seems to be a combination of calls; not sure whether we should deprecate it in SparkR yet.
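
For readers skimming the diff below, the user-facing change boils down to the following (a minimal sketch; `df` and `df2` stand for any two SparkDataFrames with matching schemas, e.g. read with `read.json`):

```r
# Sketch of the new API surface, assuming an active SparkR session and two
# SparkDataFrames df and df2 with the same schema (hypothetical names).
u1 <- union(df, df2)                       # new name; keeps duplicates, like SQL UNION ALL
u2 <- suppressWarnings(unionAll(df, df2))  # still works, but now emits a deprecation warning
u3 <- rbind(df, df2, df2)                  # rbind accepts two or more SparkDataFrames
count(u1) == count(df) + count(df2)        # union does not de-duplicate
```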

## How was this patch tested?

Unit tests, and manual checks of the R docs.

Author: Felix Cheung <felixcheun...@hotmail.com>

Closes #13805 from felixcheung/runion.

(cherry picked from commit dbfdae4e41a900de01b48639d6554d32edbb2e0b)
Signed-off-by: Shivaram Venkataraman <shiva...@cs.berkeley.edu>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aeda9a15
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aeda9a15
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aeda9a15

Branch: refs/heads/branch-2.0
Commit: aeda9a153c117921e95cf204daab0df3202f1d95
Parents: 591bf79
Author: Felix Cheung <felixcheun...@hotmail.com>
Authored: Tue Jun 21 13:36:50 2016 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Tue Jun 21 13:36:58 2016 -0700

----------------------------------------------------------------------
 R/pkg/NAMESPACE                           |  1 +
 R/pkg/R/DataFrame.R                       | 43 ++++++++++++++++++++------
 R/pkg/R/generics.R                        |  6 +++-
 R/pkg/inst/tests/testthat/test_context.R  |  2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  8 +++--
 5 files changed, 47 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ea42888..2272d8b 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -107,6 +107,7 @@ exportMethods("arrange",
               "summary",
               "take",
               "transform",
+              "union",
               "unionAll",
               "unique",
               "unpersist",

http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ed0bb85..725cbf2 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2251,7 +2251,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
   cols
 }
 
-#' rbind
+#' Return a new SparkDataFrame containing the union of rows
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
 #' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL.
@@ -2261,39 +2261,64 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #' @param y A SparkDataFrame
 #' @return A SparkDataFrame containing the result of the union.
 #' @family SparkDataFrame functions
-#' @rdname rbind
-#' @name unionAll
+#' @rdname union
+#' @name union
+#' @seealso \link{rbind}
 #' @export
 #' @examples
 #'\dontrun{
 #' sparkR.session()
 #' df1 <- read.json(path)
 #' df2 <- read.json(path2)
-#' unioned <- unionAll(df, df2)
+#' unioned <- union(df, df2)
+#' unions <- rbind(df, df2, df3, df4)
 #' }
+#' @note union since 2.0.0
+setMethod("union",
+          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+          function(x, y) {
+            unioned <- callJMethod(x@sdf, "union", y@sdf)
+            dataFrame(unioned)
+          })
+
+#' unionAll is deprecated - use union instead
+#' @rdname union
+#' @name unionAll
+#' @export
 #' @note unionAll since 1.4.0
 setMethod("unionAll",
           signature(x = "SparkDataFrame", y = "SparkDataFrame"),
           function(x, y) {
-            unioned <- callJMethod(x@sdf, "unionAll", y@sdf)
-            dataFrame(unioned)
+            .Deprecated("union")
+            union(x, y)
           })
 
 #' Union two or more SparkDataFrames
 #'
-#' Returns a new SparkDataFrame containing rows of all parameters.
+#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL.
+#' Note that this does not remove duplicate rows across the two SparkDataFrames.
 #'
+#' @param x A SparkDataFrame
+#' @param ... Additional SparkDataFrame
+#' @return A SparkDataFrame containing the result of the union.
+#' @family SparkDataFrame functions
 #' @rdname rbind
 #' @name rbind
+#' @seealso \link{union}
 #' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' unions <- rbind(df, df2, df3, df4)
+#' }
 #' @note rbind since 1.5.0
 setMethod("rbind",
           signature(... = "SparkDataFrame"),
           function(x, ..., deparse.level = 1) {
             if (nargs() == 3) {
-              unionAll(x, ...)
+              union(x, ...)
             } else {
-              unionAll(x, Recall(..., deparse.level = 1))
+              union(x, Recall(..., deparse.level = 1))
             }
           })
 

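Aside (not part of the patch): the `rbind` method above folds an arbitrary number of frames into nested `union` calls via `Recall`. Here is a standalone sketch of the same fold-from-the-right pattern against plain `data.frame`s, so it runs without Spark; the argument counting differs slightly from the real method, which also carries a `deparse.level` parameter:

```r
# Fold rbind over any number of data.frames by recursing on the tail.
rbind_all <- function(x, ...) {
  if (nargs() == 2) {
    rbind(x, ...)           # exactly two frames left: combine them directly
  } else {
    rbind(x, Recall(...))   # otherwise fold the remaining frames first
  }
}

d1 <- data.frame(a = 1); d2 <- data.frame(a = 2); d3 <- data.frame(a = 3)
rbind_all(d1, d2, d3)       # rows 1, 2, 3 -- equivalent to rbind(d1, rbind(d2, d3))
```
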
http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 7b08a8e..27dfd67 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -662,7 +662,11 @@ setGeneric("toJSON", function(x) { standardGeneric("toJSON") })
 
 setGeneric("toRDD", function(x) { standardGeneric("toRDD") })
 
-#' @rdname rbind
+#' @rdname union
+#' @export
+setGeneric("union", function(x, y) { standardGeneric("union") })
+
+#' @rdname union
 #' @export
 setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
 

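Aside (not part of the patch): the `setGeneric` call added here is what lets the S4 method in DataFrame.R attach to the name `union`. A tiny self-contained sketch of that `setGeneric`/`setMethod` pairing, using a made-up class and generic name so it cannot collide with SparkR:

```r
# Minimal S4 generic/method pair, mirroring the split between generics.R
# (declare the generic) and DataFrame.R (register the method). Names are hypothetical.
setGeneric("concatRows", function(x, y) { standardGeneric("concatRows") })

setClass("ToyFrame", representation(rows = "list"))

setMethod("concatRows",
          signature(x = "ToyFrame", y = "ToyFrame"),
          function(x, y) {
            new("ToyFrame", rows = c(x@rows, y@rows))  # keep duplicates, like UNION ALL
          })

a <- new("ToyFrame", rows = list(1, 2))
b <- new("ToyFrame", rows = list(2, 3))
length(concatRows(a, b)@rows)  # 4
```
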
http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/inst/tests/testthat/test_context.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
index b149818..3d232df 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -24,7 +24,7 @@ test_that("Check masked functions", {
   namesOfMaskedCompletely <- c("cov", "filter", "sample")
   namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
                      "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
-                     "summary", "transform", "drop", "window", "as.data.frame")
+                     "summary", "transform", "drop", "window", "as.data.frame", "union")
   if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
     namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 7c192fb..9378c7a 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1590,7 +1590,7 @@ test_that("isLocal()", {
   expect_false(isLocal(df))
 })
 
-test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
+test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
   df <- read.json(jsonPath)
 
   lines <- c("{\"name\":\"Bob\", \"age\":24}",
@@ -1600,10 +1600,11 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
   writeLines(lines, jsonPath2)
   df2 <- read.df(jsonPath2, "json")
 
-  unioned <- arrange(unionAll(df, df2), df$age)
+  unioned <- arrange(union(df, df2), df$age)
   expect_is(unioned, "SparkDataFrame")
   expect_equal(count(unioned), 6)
   expect_equal(first(unioned)$name, "Michael")
+  expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
 
   unioned2 <- arrange(rbind(unioned, df, df2), df$age)
   expect_is(unioned2, "SparkDataFrame")
@@ -1620,6 +1621,9 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
   expect_equal(count(intersected), 1)
   expect_equal(first(intersected)$name, "Andy")
 
+  # Test base::union is working
+  expect_equal(union(c(1:3), c(3:5)), c(1:5))
+
   # Test base::rbind is working
   expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16)
 

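One subtlety the tests above guard against (an observation, not part of the patch): exporting a generic named `union` masks `base::union` once SparkR is attached, which is why test_context.R now lists "union" among the masked names and why the plain-vector check was added. A hedged way to reproduce that check interactively, assuming SparkR is installed and attached:

```r
# With SparkR attached, union() on plain vectors should still give the base result.
library(SparkR)
stopifnot(identical(union(c(1:3), c(3:5)), c(1:5)))  # 1 2 3 4 5
```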
