spark git commit: [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple

shivaram Wed, 12 Aug 2015 18:34:02 -0700

Repository: spark
Updated Branches:
  refs/heads/branch-1.5 62ab2a4c6 -> ca39c9e91



[SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are 
simple

This PR adds many expression functions to SparkR. It includes only functions
whose parameters are `(Column)` or `(Column, Column)`. I think we also need to
improve how these functions are tested, but that would be better handled in a
separate issue.

## Diff Summary

- Add lots of functions in `functions.R` and their generics in `generics.R`
- Add aliases for `ceiling` and `sign`
- Move expression functions from `column.R` to `functions.R`
- Modify `rdname` from `column` to `functions`

I haven't supported the `not` function, because its name collides with the
`testthat` package. I couldn't think of a way to define it.

## New Supported Functions

```
approxCountDistinct
ascii
base64
bin
bitwiseNOT
ceil (alias: ceiling)
crc32
dayofmonth
dayofyear
explode
factorial
hex
hour
initcap
isNaN
last_day
length
log2
ltrim
md5
minute
month
negate
quarter
reverse
round
rtrim
second
sha1
signum (alias: sign)
size
soundex
to_date
trim
unbase64
unhex
weekofyear
year

datediff
levenshtein
months_between
nanvl
pmod
```

## JIRA
[[SPARK-9855] Add expression functions into SparkR whose params are simple - 
ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9855)

Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>

Closes #8123 from yu-iskw/SPARK-9855.

(cherry picked from commit f4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38)
Signed-off-by: Shivaram Venkataraman <shiva...@cs.berkeley.edu>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca39c9e9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca39c9e9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca39c9e9

Branch: refs/heads/branch-1.5
Commit: ca39c9e91602223f5665ab6942b917c4900bd996
Parents: 62ab2a4
Author: Yu ISHIKAWA <yuu.ishik...@gmail.com>
Authored: Wed Aug 12 18:33:27 2015 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Wed Aug 12 18:33:35 2015 -0700

----------------------------------------------------------------------
 R/pkg/DESCRIPTION                |   1 +
 R/pkg/R/column.R                 |  81 ---------------
 R/pkg/R/functions.R              | 123 ++++++++++++++++++++++
 R/pkg/R/generics.R               | 185 +++++++++++++++++++++++++++++++---
 R/pkg/inst/tests/test_sparkSQL.R |  21 ++--
 5 files changed, 309 insertions(+), 102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/DESCRIPTION
----------------------------------------------------------------------
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 4949d86..83e6489 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -29,6 +29,7 @@ Collate:
     'client.R'
     'context.R'
     'deserialize.R'
+    'functions.R'
     'mllib.R'
     'serialize.R'
     'sparkR.R'

http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/R/column.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index eeaf9f1..328f595 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -60,12 +60,6 @@ operators <- list(
 )
 column_functions1 <- c("asc", "desc", "isNull", "isNotNull")
 column_functions2 <- c("like", "rlike", "startsWith", "endsWith", "getField", 
"getItem", "contains")
-functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt",
-               "first", "last", "lower", "upper", "sumDistinct",
-               "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp",
-               "expm1", "floor", "log", "log10", "log1p", "rint", "sign",
-               "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians")
-binary_mathfunctions <- c("atan2", "hypot")
 
 createOperator <- function(op) {
   setMethod(op,
@@ -111,33 +105,6 @@ createColumnFunction2 <- function(name) {
             })
 }
 
-createStaticFunction <- function(name) {
-  setMethod(name,
-            signature(x = "Column"),
-            function(x) {
-              if (name == "ceiling") {
-                  name <- "ceil"
-              }
-              if (name == "sign") {
-                  name <- "signum"
-              }
-              jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc)
-              column(jc)
-            })
-}
-
-createBinaryMathfunctions <- function(name) {
-  setMethod(name,
-            signature(y = "Column"),
-            function(y, x) {
-              if (class(x) == "Column") {
-                x <- x@jc
-              }
-              jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, 
x)
-              column(jc)
-            })
-}
-
 createMethods <- function() {
   for (op in names(operators)) {
     createOperator(op)
@@ -148,12 +115,6 @@ createMethods <- function() {
   for (name in column_functions2) {
     createColumnFunction2(name)
   }
-  for (x in functions) {
-    createStaticFunction(x)
-  }
-  for (name in binary_mathfunctions) {
-    createBinaryMathfunctions(name)
-  }
 }
 
 createMethods()
@@ -242,45 +203,3 @@ setMethod("%in%",
             jc <- callJMethod(x@jc, "in", table)
             return(column(jc))
           })
-
-#' Approx Count Distinct
-#'
-#' @rdname column
-#' @return the approximate number of distinct items in a group.
-setMethod("approxCountDistinct",
-          signature(x = "Column"),
-          function(x, rsd = 0.95) {
-            jc <- callJStatic("org.apache.spark.sql.functions", 
"approxCountDistinct", x@jc, rsd)
-            column(jc)
-          })
-
-#' Count Distinct
-#'
-#' @rdname column
-#' @return the number of distinct items in a group.
-setMethod("countDistinct",
-          signature(x = "Column"),
-          function(x, ...) {
-            jcol <- lapply(list(...), function (x) {
-              x@jc
-            })
-            jc <- callJStatic("org.apache.spark.sql.functions", 
"countDistinct", x@jc,
-                              listToSeq(jcol))
-            column(jc)
-          })
-
-#' @rdname column
-#' @aliases countDistinct
-setMethod("n_distinct",
-          signature(x = "Column"),
-          function(x, ...) {
-            countDistinct(x, ...)
-          })
-
-#' @rdname column
-#' @aliases count
-setMethod("n",
-          signature(x = "Column"),
-          function(x) {
-            count(x)
-          })

http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/R/functions.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
new file mode 100644
index 0000000..a15d2d5
--- /dev/null
+++ b/R/pkg/R/functions.R
@@ -0,0 +1,123 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#' @include generics.R column.R
+NULL
+
+#' @title S4 expression functions for DataFrame column(s)
+#' @description These are expression functions on DataFrame columns
+
+functions1 <- c(
+  "abs", "acos", "approxCountDistinct", "ascii", "asin", "atan",
+  "avg", "base64", "bin", "bitwiseNOT", "cbrt", "ceil", "cos", "cosh", "count",
+  "crc32", "dayofmonth", "dayofyear", "exp", "explode", "expm1", "factorial",
+  "first", "floor", "hex", "hour", "initcap", "isNaN", "last", "last_day",
+  "length", "log", "log10", "log1p", "log2", "lower", "ltrim", "max", "md5",
+  "mean", "min", "minute", "month", "negate", "quarter", "reverse",
+  "rint", "round", "rtrim", "second", "sha1", "signum", "sin", "sinh", "size",
+  "soundex", "sqrt", "sum", "sumDistinct", "tan", "tanh", "toDegrees",
+  "toRadians", "to_date", "trim", "unbase64", "unhex", "upper", "weekofyear",
+  "year")
+functions2 <- c(
+  "atan2", "datediff", "hypot", "levenshtein", "months_between", "nanvl", 
"pmod")
+
+createFunction1 <- function(name) {
+  setMethod(name,
+            signature(x = "Column"),
+            function(x) {
+              jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc)
+              column(jc)
+            })
+}
+
+createFunction2 <- function(name) {
+  setMethod(name,
+            signature(y = "Column"),
+            function(y, x) {
+              if (class(x) == "Column") {
+                x <- x@jc
+              }
+              jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, 
x)
+              column(jc)
+            })
+}
+
+createFunctions <- function() {
+  for (name in functions1) {
+    createFunction1(name)
+  }
+  for (name in functions2) {
+    createFunction2(name)
+  }
+}
+
+createFunctions()
+
+#' Approx Count Distinct
+#'
+#' @rdname functions
+#' @return the approximate number of distinct items in a group.
+setMethod("approxCountDistinct",
+          signature(x = "Column"),
+          function(x, rsd = 0.95) {
+            jc <- callJStatic("org.apache.spark.sql.functions", 
"approxCountDistinct", x@jc, rsd)
+            column(jc)
+          })
+
+#' Count Distinct
+#'
+#' @rdname functions
+#' @return the number of distinct items in a group.
+setMethod("countDistinct",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcol <- lapply(list(...), function (x) {
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", 
"countDistinct", x@jc,
+                              listToSeq(jcol))
+            column(jc)
+          })
+
+#' @rdname functions
+#' @aliases ceil
+setMethod("ceiling",
+          signature(x = "Column"),
+          function(x) {
+            ceil(x)
+          })
+
+#' @rdname functions
+#' @aliases signum
+setMethod("sign", signature(x = "Column"),
+          function(x) {
+            signum(x)
+          })
+
+#' @rdname functions
+#' @aliases countDistinct
+setMethod("n_distinct", signature(x = "Column"),
+          function(x, ...) {
+            countDistinct(x, ...)
+          })
+
+#' @rdname functions
+#' @aliases count
+setMethod("n", signature(x = "Column"),
+          function(x) {
+            count(x)
+          })

http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 379a78b..f11e7fc 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -577,10 +577,6 @@ setGeneric("asc", function(x) { standardGeneric("asc") })
 
 #' @rdname column
 #' @export
-setGeneric("avg", function(x, ...) { standardGeneric("avg") })
-
-#' @rdname column
-#' @export
 setGeneric("between", function(x, bounds) { standardGeneric("between") })
 
 #' @rdname column
@@ -589,11 +585,8 @@ setGeneric("cast", function(x, dataType) { 
standardGeneric("cast") })
 
 #' @rdname column
 #' @export
-setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
-
-#' @rdname column
-#' @export
 setGeneric("contains", function(x, ...) { standardGeneric("contains") })
+
 #' @rdname column
 #' @export
 setGeneric("countDistinct", function(x, ...) { 
standardGeneric("countDistinct") })
@@ -658,22 +651,190 @@ setGeneric("rlike", function(x, ...) { 
standardGeneric("rlike") })
 #' @export
 setGeneric("startsWith", function(x, ...) { standardGeneric("startsWith") })
 
-#' @rdname column
+
+###################### Expression Function Methods ##########################
+
+#' @rdname functions
+#' @export
+setGeneric("ascii", function(x) { standardGeneric("ascii") })
+
+#' @rdname functions
+#' @export
+setGeneric("avg", function(x, ...) { standardGeneric("avg") })
+
+#' @rdname functions
+#' @export
+setGeneric("base64", function(x) { standardGeneric("base64") })
+
+#' @rdname functions
+#' @export
+setGeneric("bin", function(x) { standardGeneric("bin") })
+
+#' @rdname functions
+#' @export
+setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
+
+#' @rdname functions
+#' @export
+setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
+
+#' @rdname functions
+#' @export
+setGeneric("ceil", function(x) { standardGeneric("ceil") })
+
+#' @rdname functions
+#' @export
+setGeneric("crc32", function(x) { standardGeneric("crc32") })
+
+#' @rdname functions
+#' @export
+setGeneric("datediff", function(y, x) { standardGeneric("datediff") })
+
+#' @rdname functions
+#' @export
+setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") })
+
+#' @rdname functions
+#' @export
+setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") })
+
+#' @rdname functions
+#' @export
+setGeneric("explode", function(x) { standardGeneric("explode") })
+
+#' @rdname functions
+#' @export
+setGeneric("hex", function(x) { standardGeneric("hex") })
+
+#' @rdname functions
+#' @export
+setGeneric("hour", function(x) { standardGeneric("hour") })
+
+#' @rdname functions
+#' @export
+setGeneric("initcap", function(x) { standardGeneric("initcap") })
+
+#' @rdname functions
+#' @export
+setGeneric("isNaN", function(x) { standardGeneric("isNaN") })
+
+#' @rdname functions
+#' @export
+setGeneric("last_day", function(x) { standardGeneric("last_day") })
+
+#' @rdname functions
+#' @export
+setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") })
+
+#' @rdname functions
+#' @export
+setGeneric("lower", function(x) { standardGeneric("lower") })
+
+#' @rdname functions
+#' @export
+setGeneric("ltrim", function(x) { standardGeneric("ltrim") })
+
+#' @rdname functions
+#' @export
+setGeneric("md5", function(x) { standardGeneric("md5") })
+
+#' @rdname functions
+#' @export
+setGeneric("minute", function(x) { standardGeneric("minute") })
+
+#' @rdname functions
+#' @export
+setGeneric("month", function(x) { standardGeneric("month") })
+
+#' @rdname functions
+#' @export
+setGeneric("months_between", function(y, x) { 
standardGeneric("months_between") })
+
+#' @rdname functions
+#' @export
+setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") })
+
+#' @rdname functions
+#' @export
+setGeneric("negate", function(x) { standardGeneric("negate") })
+
+#' @rdname functions
+#' @export
+setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
+
+#' @rdname functions
+#' @export
+setGeneric("quarter", function(x) { standardGeneric("quarter") })
+
+#' @rdname functions
+#' @export
+setGeneric("reverse", function(x) { standardGeneric("reverse") })
+
+#' @rdname functions
+#' @export
+setGeneric("rtrim", function(x) { standardGeneric("rtrim") })
+
+#' @rdname functions
+#' @export
+setGeneric("second", function(x) { standardGeneric("second") })
+
+#' @rdname functions
+#' @export
+setGeneric("sha1", function(x) { standardGeneric("sha1") })
+
+#' @rdname functions
+#' @export
+setGeneric("signum", function(x) { standardGeneric("signum") })
+
+#' @rdname functions
+#' @export
+setGeneric("size", function(x) { standardGeneric("size") })
+
+#' @rdname functions
+#' @export
+setGeneric("soundex", function(x) { standardGeneric("soundex") })
+
+#' @rdname functions
 #' @export
 setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })
 
-#' @rdname column
+#' @rdname functions
 #' @export
 setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") })
 
-#' @rdname column
+#' @rdname functions
 #' @export
 setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
 
-#' @rdname column
+#' @rdname functions
+#' @export
+setGeneric("to_date", function(x) { standardGeneric("to_date") })
+
+#' @rdname functions
+#' @export
+setGeneric("trim", function(x) { standardGeneric("trim") })
+
+#' @rdname functions
+#' @export
+setGeneric("unbase64", function(x) { standardGeneric("unbase64") })
+
+#' @rdname functions
+#' @export
+setGeneric("unhex", function(x) { standardGeneric("unhex") })
+
+#' @rdname functions
 #' @export
 setGeneric("upper", function(x) { standardGeneric("upper") })
 
+#' @rdname functions
+#' @export
+setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
+
+#' @rdname functions
+#' @export
+setGeneric("year", function(x) { standardGeneric("year") })
+
+
 #' @rdname glm
 #' @export
 setGeneric("glm")

http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 7377fc8..e6d3b21 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -640,15 +640,18 @@ test_that("column operators", {
 
 test_that("column functions", {
   c <- SparkR:::col("a")
-  c2 <- min(c) + max(c) + sum(c) + avg(c) + count(c) + abs(c) + sqrt(c)
-  c3 <- lower(c) + upper(c) + first(c) + last(c)
-  c4 <- approxCountDistinct(c) + countDistinct(c) + cast(c, "string")
-  c5 <- n(c) + n_distinct(c)
-  c5 <- acos(c) + asin(c) + atan(c) + cbrt(c)
-  c6 <- ceiling(c) + cos(c) + cosh(c) + exp(c) + expm1(c)
-  c7 <- floor(c) + log(c) + log10(c) + log1p(c) + rint(c)
-  c8 <- sign(c) + sin(c) + sinh(c) + tan(c) + tanh(c)
-  c9 <- toDegrees(c) + toRadians(c)
+  c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + 
atan(c)
+  c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + 
cos(c)
+  c3 <- cosh(c) + count(c) + crc32(c) + dayofmonth(c) + dayofyear(c) + exp(c)
+  c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
+  c5 <- hour(c) + initcap(c) + isNaN(c) + last(c) + last_day(c) + length(c)
+  c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + 
md5(c)
+  c7 <- mean(c) + min(c) + minute(c) + month(c) + negate(c) + quarter(c)
+  c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + second(c) + sha1(c)
+  c9 <- signum(c) + sin(c) + sinh(c) + size(c) + soundex(c) + sqrt(c) + sum(c)
+  c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
+  c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) + 
weekofyear(c)
+  c12 <- year(c)
 
   df <- jsonFile(sqlContext, jsonPath)
   df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple

Reply via email to