spark git commit: [SPARK-13389][SPARKR] SparkR support first/last with ignore NAs

shivaram Thu, 10 Mar 2016 17:32:22 -0800

Repository: spark
Updated Branches:
  refs/heads/master c3a6269ca -> 4d535d1f1



[SPARK-13389][SPARKR] SparkR support first/last with ignore NAs

## What changes were proposed in this pull request?

SparkR support first/last with ignore NAs

cc sun-rui felixcheung shivaram

## How was the this patch tested?

unit tests

Author: Yanbo Liang <yblia...@gmail.com>

Closes #11267 from yanboliang/spark-13389.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4d535d1f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4d535d1f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4d535d1f

Branch: refs/heads/master
Commit: 4d535d1f1c19faa43f96433aee8760e37b1690ea
Parents: c3a6269
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Mar 10 17:31:19 2016 -0800
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Thu Mar 10 17:31:19 2016 -0800

----------------------------------------------------------------------
 R/pkg/R/functions.R                       | 40 ++++++++++++++++++++------
 R/pkg/R/generics.R                        |  4 +--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 +++++++
 3 files changed, 45 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/4d535d1f/R/pkg/R/functions.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e5521f3..d9c10b4 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -536,15 +536,27 @@ setMethod("factorial",
 #'
 #' Aggregate function: returns the first value in a group.
 #'
+#' The function by default returns the first values it sees. It will return 
the first non-missing
+#' value it sees when na.rm is set to true. If all values are missing, then NA 
is returned.
+#'
 #' @rdname first
 #' @name first
 #' @family agg_funcs
 #' @export
-#' @examples \dontrun{first(df$c)}
+#' @examples
+#' \dontrun{
+#' first(df$c)
+#' first(df$c, TRUE)
+#' }
 setMethod("first",
-          signature(x = "Column"),
-          function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "first", x@jc)
+          signature(x = "characterOrColumn"),
+          function(x, na.rm = FALSE) {
+            col <- if (class(x) == "Column") {
+              x@jc
+            } else {
+              x
+            }
+            jc <- callJStatic("org.apache.spark.sql.functions", "first", col, 
na.rm)
             column(jc)
           })
 
@@ -663,15 +675,27 @@ setMethod("kurtosis",
 #'
 #' Aggregate function: returns the last value in a group.
 #'
+#' The function by default returns the last values it sees. It will return the 
last non-missing
+#' value it sees when na.rm is set to true. If all values are missing, then NA 
is returned.
+#'
 #' @rdname last
 #' @name last
 #' @family agg_funcs
 #' @export
-#' @examples \dontrun{last(df$c)}
+#' @examples
+#' \dontrun{
+#' last(df$c)
+#' last(df$c, TRUE)
+#' }
 setMethod("last",
-          signature(x = "Column"),
-          function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "last", x@jc)
+          signature(x = "characterOrColumn"),
+          function(x, na.rm = FALSE) {
+            col <- if (class(x) == "Column") {
+              x@jc
+            } else {
+              x
+            }
+            jc <- callJStatic("org.apache.spark.sql.functions", "last", col, 
na.rm)
             column(jc)
           })
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4d535d1f/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 3db72b5..ddfa617 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -84,7 +84,7 @@ setGeneric("filterRDD", function(x, f) { 
standardGeneric("filterRDD") })
 
 # @rdname first
 # @export
-setGeneric("first", function(x) { standardGeneric("first") })
+setGeneric("first", function(x, ...) { standardGeneric("first") })
 
 # @rdname flatMap
 # @export
@@ -889,7 +889,7 @@ setGeneric("lag", function(x, ...) { standardGeneric("lag") 
})
 
 #' @rdname last
 #' @export
-setGeneric("last", function(x) { standardGeneric("last") })
+setGeneric("last", function(x, ...) { standardGeneric("last") })
 
 #' @rdname last_day
 #' @export

http://git-wip-us.apache.org/repos/asf/spark/blob/4d535d1f/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index cad5766..11a8f12 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1076,6 +1076,17 @@ test_that("column functions", {
   result <- collect(select(df, encode(df$a, "utf-8"), decode(df$c, "utf-8")))
   expect_equal(result[[1]][[1]], bytes)
   expect_equal(result[[2]], markUtf8("å¤§åä¸ç"))
+
+  # Test first(), last()
+  df <- read.json(sqlContext, jsonPath)
+  expect_equal(collect(select(df, first(df$age)))[[1]], NA)
+  expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
+  expect_equal(collect(select(df, first("age")))[[1]], NA)
+  expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
+  expect_equal(collect(select(df, last(df$age)))[[1]], 19)
+  expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
+  expect_equal(collect(select(df, last("age")))[[1]], 19)
+  expect_equal(collect(select(df, last("age", TRUE)))[[1]], 19)
 })
 
 test_that("column binary mathfunctions", {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-13389][SPARKR] SparkR support first/last with ignore NAs

Reply via email to