Repository: spark
Updated Branches:
  refs/heads/branch-1.5 21a10a86d -> 5220db9e3


[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

Add support for
```
   df[df$name == "Smith", c(1,2)]
   df[df$age %in% c(19, 30), 1:2]
```
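
For comparison, a minimal sketch of the equivalent long-hand filter/select calls (assuming `df` has "name" and "age" as its first two columns, matching the positional indices above):
```
   # Equivalent filter()/select() chain; the column names are an assumption
   # corresponding to the numeric column indices used above.
   select(filter(df, df$name == "Smith"), c("name", "age"))
   select(filter(df, df$age %in% c(19, 30)), c("name", "age"))
```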

shivaram

Author: felixcheung <felixcheun...@hotmail.com>

Closes #8394 from felixcheung/rsubset.

(cherry picked from commit 75d4773aa50e24972c533e8b48697fde586429eb)
Signed-off-by: Shivaram Venkataraman <shiva...@cs.berkeley.edu>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5220db9e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5220db9e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5220db9e

Branch: refs/heads/branch-1.5
Commit: 5220db9e352b5d5eae59cead9478ca0a9f73f16b
Parents: 21a10a8
Author: felixcheung <felixcheun...@hotmail.com>
Authored: Tue Aug 25 23:48:16 2015 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Tue Aug 25 23:48:27 2015 -0700

----------------------------------------------------------------------
 R/pkg/R/DataFrame.R              | 22 +++++++++++++++++++++-
 R/pkg/inst/tests/test_sparkSQL.R | 27 +++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5220db9e/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 10f3c4e..1d870ec 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -954,9 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"),
             x
           })
 
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
 #' @rdname select
 #' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
           function(x, i) {
             if (is.numeric(i)) {
               cols <- columns(x)
@@ -979,6 +981,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
             select(x, j)
           })
 
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+          function(x, i, j, ...) {
+            # It could handle i as "character" but it seems confusing and not required
+            # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+            filtered <- filter(x, i)
+            if (!missing(j)) {
+              filtered[, j]
+            } else {
+              filtered
+            }
+          })
+
 #' Select
 #'
 #' Selects a set of columns with names or Column expressions.
@@ -997,8 +1013,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 #'   # Columns can also be selected using `[[` and `[`
 #'   df[[2]] == df[["age"]]
 #'   df[,2] == df[,"age"]
+#'   df[,c("name", "age")]
 #'   # Similar to R data frames columns can also be selected using `$`
 #'   df$age
+#'   # A DataFrame can also be subset on both rows and Columns
+#'   df[df$name == "Smith", c(1,2)]
+#'   df[df$age %in% c(19, 30), 1:2]
 #' }
 setMethod("select", signature(x = "DataFrame", col = "character"),
           function(x, col, ...) {

http://git-wip-us.apache.org/repos/asf/spark/blob/5220db9e/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5..ee48a3d 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
 })
 
+test_that("subsetting", {
+  # jsonFile returns columns in random order
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  filtered <- df[df$age > 20,]
+  expect_equal(count(filtered), 1)
+  expect_equal(columns(filtered), c("name", "age"))
+  expect_equal(collect(filtered)$name, "Andy")
+
+  df2 <- df[df$age == 19, 1]
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 1)
+  expect_equal(columns(df2), c("name"))
+  expect_equal(collect(df2)$name, "Justin")
+
+  df3 <- df[df$age > 20, 2]
+  expect_equal(count(df3), 1)
+  expect_equal(columns(df3), c("age"))
+
+  df4 <- df[df$age %in% c(19, 30), 1:2]
+  expect_equal(count(df4), 2)
+  expect_equal(columns(df4), c("name", "age"))
+  
+  df5 <- df[df$age %in% c(19), c(1,2)]
+  expect_equal(count(df5), 1)
+  expect_equal(columns(df5), c("name", "age"))
+})
+
 test_that("selectExpr() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")


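As with filter() and select(), the new `[` returns a lazily evaluated DataFrame. A short usage sketch, assuming a running sqlContext and the people.json file shipped with the Spark examples:
```
   df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
   adults <- df[df$age > 20, c("name", "age")]  # subsetting returns a DataFrame
   collect(adults)                              # materialize the filtered rows locally
```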