spark git commit: [SPARK-19818][SPARKR] rbind should check for name consistency of input data frames

felixcheung Mon, 06 Mar 2017 21:55:38 -0800

Repository: spark
Updated Branches:
  refs/heads/master 9909f6d36 -> 1f6c090c1



[SPARK-19818][SPARKR] rbind should check for name consistency of input data frames

## What changes were proposed in this pull request?
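Added checks for name consistency of input data frames in `rbind`. `union` is unchanged and still accepts inputs with different schemas; its documentation now states this.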

## How was this patch tested?
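Added a new test in `test_sparkSQL.R` that checks `rbind` raises an error when the input data frames have different column names.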

Author: actuaryzhang <actuaryzhan...@gmail.com>

Closes #17159 from actuaryzhang/sparkRUnion.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1f6c090c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1f6c090c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1f6c090c

Branch: refs/heads/master
Commit: 1f6c090c15f355a0c2aad736f8291fcdee5c556d
Parents: 9909f6d
Author: actuaryzhang <actuaryzhan...@gmail.com>
Authored: Mon Mar 6 21:55:11 2017 -0800
Committer: Felix Cheung <felixche...@apache.org>
Committed: Mon Mar 6 21:55:11 2017 -0800

----------------------------------------------------------------------
 R/pkg/R/DataFrame.R                       | 8 +++++++-
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 7 +++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1f6c090c/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index e33d0d8..97e0c9e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
 #' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
+#' Input SparkDataFrames can have different schemas (names and data types).
 #'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
@@ -2685,7 +2686,8 @@ setMethod("unionAll",
 
 #' Union two or more SparkDataFrames
 #'
-#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
+#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
+#' requires that the input SparkDataFrames have the same column names.
 #'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
@@ -2709,6 +2711,10 @@ setMethod("unionAll",
 setMethod("rbind",
           signature(... = "SparkDataFrame"),
           function(x, ..., deparse.level = 1) {
+            nm <- lapply(list(x, ...), names)
+            if (length(unique(nm)) != 1) {
+              stop("Names of input data frames are different.")
+            }
             if (nargs() == 3) {
               union(x, ...)
             } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/1f6c090c/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 7c09659..620b633 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1850,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
   expect_equal(count(unioned2), 12)
   expect_equal(first(unioned2)$name, "Michael")
 
+  df3 <- df2
+  names(df3)[1] <- "newName"
+  expect_error(rbind(df, df3),
+               "Names of input data frames are different.")
+  expect_error(rbind(df, df2, df3),
+               "Names of input data frames are different.")
+
   excepted <- arrange(except(df, df2), desc(df$age))
   expect_is(unioned, "SparkDataFrame")
   expect_equal(count(excepted), 2)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-19818][SPARKR] rbind should check for name consistency of input data frames

Reply via email to