Repository: spark Updated Branches: refs/heads/master 8fb2a02e2 -> d1f6c64c4
[SPARK-19828][R] Support array type in from_json in R ## What changes were proposed in this pull request? Since we could not directly define the array type in R, this PR proposes to support array types in R as string types that are used in `structField` as below: ```R jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]" df <- as.DataFrame(list(list("people" = jsonArr))) collect(select(df, alias(from_json(df$people, "array<struct<name:string>>"), "arrcol"))) ``` prints ```R arrcol 1 Bob, Alice ``` ## How was this patch tested? Unit tests in `test_sparkSQL.R`. Author: hyukjinkwon <gurwls...@gmail.com> Closes #17178 from HyukjinKwon/SPARK-19828. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d1f6c64c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d1f6c64c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d1f6c64c Branch: refs/heads/master Commit: d1f6c64c4b763c05d6d79ae5497f298dc3835f3e Parents: 8fb2a02 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Tue Mar 14 19:51:25 2017 -0700 Committer: Felix Cheung <felixche...@apache.org> Committed: Tue Mar 14 19:51:25 2017 -0700 ---------------------------------------------------------------------- R/pkg/R/functions.R | 12 ++++++++++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 12 ++++++++++++ .../scala/org/apache/spark/sql/api/r/SQLUtils.scala | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/d1f6c64c/R/pkg/R/functions.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index edf2bcf..9867f2d 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2437,6 +2437,7 @@ setMethod("date_format", signature(y = "Column", x = "character"), #' #' @param x Column containing the JSON string. 
#' @param schema a structType object to use as the schema when parsing the JSON string. +#' @param asJsonArray indicating if the input string is a JSON array of objects or a single object. #' @param ... additional named properties to control how the json is parsed, accepts the same #' options as the JSON data source. #' @@ -2452,11 +2453,18 @@ setMethod("date_format", signature(y = "Column", x = "character"), #'} #' @note from_json since 2.2.0 setMethod("from_json", signature(x = "Column", schema = "structType"), - function(x, schema, ...) { + function(x, schema, asJsonArray = FALSE, ...) { + if (asJsonArray) { + jschema <- callJStatic("org.apache.spark.sql.types.DataTypes", + "createArrayType", + schema$jobj) + } else { + jschema <- schema$jobj + } options <- varargsToStrEnv(...) jc <- callJStatic("org.apache.spark.sql.functions", "from_json", - x@jc, schema$jobj, options) + x@jc, jschema, options) column(jc) }) http://git-wip-us.apache.org/repos/asf/spark/blob/d1f6c64c/R/pkg/inst/tests/testthat/test_sparkSQL.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 9735fe3..f7081cb 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1364,6 +1364,18 @@ test_that("column functions", { # check for unparseable df <- as.DataFrame(list(list("a" = ""))) expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA) + + # check if array type in string is correctly supported. 
+ jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]" + df <- as.DataFrame(list(list("people" = jsonArr))) + schema <- structType(structField("name", "string")) + arr <- collect(select(df, alias(from_json(df$people, schema, asJsonArray = TRUE), "arrcol"))) + expect_equal(ncol(arr), 1) + expect_equal(nrow(arr), 1) + expect_is(arr[[1]][[1]], "list") + expect_equal(length(arr$arrcol[[1]]), 2) + expect_equal(arr$arrcol[[1]][[1]]$name, "Bob") + expect_equal(arr$arrcol[[1]][[2]]$name, "Alice") }) test_that("column binary mathfunctions", { http://git-wip-us.apache.org/repos/asf/spark/blob/d1f6c64c/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index a4c5bf7..c773286 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -81,7 +81,7 @@ private[sql] object SQLUtils extends Logging { new JavaSparkContext(spark.sparkContext) } - def createStructType(fields : Seq[StructField]): StructType = { + def createStructType(fields: Seq[StructField]): StructType = { StructType(fields) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org