spark git commit: [SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR

shivaram Thu, 16 Jun 2016 20:35:55 -0700

Repository: spark
Updated Branches:
  refs/heads/master 5fd20b66f -> 513a03e41



[SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR

## What changes were proposed in this pull request?

This PR adds varargs-type `dropDuplicates` function to SparkR for API parity.
Refer to https://issues.apache.org/jira/browse/SPARK-15807, too.

## How was this patch tested?

Pass the Jenkins tests with new testcases.

Author: Dongjoon Hyun <dongj...@apache.org>

Closes #13684 from dongjoon-hyun/SPARK-15908.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/513a03e4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/513a03e4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/513a03e4

Branch: refs/heads/master
Commit: 513a03e41e27d9c5f70911faccc5d3aecd8bdde9
Parents: 5fd20b6
Author: Dongjoon Hyun <dongj...@apache.org>
Authored: Thu Jun 16 20:35:17 2016 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Thu Jun 16 20:35:17 2016 -0700

----------------------------------------------------------------------
 R/pkg/R/DataFrame.R                       | 25 +++++++++++++++++++------
 R/pkg/R/generics.R                        |  7 ++-----
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  8 ++++++++
 3 files changed, 29 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/513a03e4/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index d72cbbd..c710bff 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1936,10 +1936,11 @@ setMethod("where",
 #' the subset of columns.
 #'
 #' @param x A SparkDataFrame.
-#' @param colnames A character vector of column names.
+#' @param ... A character vector of column names or string column names.
+#'            If the first argument contains a character vector, the 
followings are ignored.
 #' @return A SparkDataFrame with duplicate rows removed.
 #' @family SparkDataFrame functions
-#' @rdname dropduplicates
+#' @rdname dropDuplicates
 #' @name dropDuplicates
 #' @export
 #' @examples
@@ -1949,14 +1950,26 @@ setMethod("where",
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
 #' dropDuplicates(df)
+#' dropDuplicates(df, "col1", "col2")
 #' dropDuplicates(df, c("col1", "col2"))
 #' }
 setMethod("dropDuplicates",
           signature(x = "SparkDataFrame"),
-          function(x, colNames = columns(x)) {
-            stopifnot(class(colNames) == "character")
-
-            sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(colNames))
+          function(x, ...) {
+            cols <- list(...)
+            if (length(cols) == 0) {
+              sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(columns(x)))
+            } else {
+              if (!all(sapply(cols, function(c) { is.character(c) }))) {
+                stop("all columns names should be characters")
+              }
+              col <- cols[[1]]
+              if (length(col) > 1) {
+                sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(col))
+              } else {
+                sdf <- callJMethod(x@sdf, "dropDuplicates", cols)
+              }
+            }
             dataFrame(sdf)
           })
 

http://git-wip-us.apache.org/repos/asf/spark/blob/513a03e4/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 40a96d8..8164e77 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -466,12 +466,9 @@ setGeneric("describe", function(x, col, ...) { 
standardGeneric("describe") })
 #' @export
 setGeneric("drop", function(x, ...) { standardGeneric("drop") })
 
-#' @rdname dropduplicates
+#' @rdname dropDuplicates
 #' @export
-setGeneric("dropDuplicates",
-           function(x, colNames = columns(x)) {
-             standardGeneric("dropDuplicates")
-           })
+setGeneric("dropDuplicates", function(x, ...) { 
standardGeneric("dropDuplicates") })
 
 #' @rdname nafunctions
 #' @export

http://git-wip-us.apache.org/repos/asf/spark/blob/513a03e4/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index c11930a..11d6936 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -796,6 +796,14 @@ test_that("distinct(), unique() and dropDuplicates() on 
DataFrames", {
     result[order(result$key, result$value1, result$value2), ],
     expected)
 
+  result <- collect(dropDuplicates(df, "key", "value1"))
+  expected <- rbind.data.frame(
+    c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+  names(expected) <- c("key", "value1", "value2")
+  expect_equivalent(
+    result[order(result$key, result$value1, result$value2), ],
+    expected)
+
   result <- collect(dropDuplicates(df, "key"))
   expected <- rbind.data.frame(
     c(1, 1, 1), c(2, 1, 2))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR

Reply via email to