spark git commit: [SPARK-16051][R] Add `read.orc/write.orc` to SparkR

2016-06-20 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 5b22e34e9 -> ead872e49


[SPARK-16051][R] Add `read.orc/write.orc` to SparkR

## What changes were proposed in this pull request?

This issue adds `read.orc/write.orc` to SparkR for API parity.

## How was this patch tested?

Pass the Jenkins tests (with new testcases).

Author: Dongjoon Hyun 

Closes #13763 from dongjoon-hyun/SPARK-16051.

(cherry picked from commit c44bf137c7ca649e0c504229eb3e6ff7955e9a53)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ead872e4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ead872e4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ead872e4

Branch: refs/heads/branch-2.0
Commit: ead872e4996ad0c0b02debd1ab829ff67b79abfb
Parents: 5b22e34
Author: Dongjoon Hyun 
Authored: Mon Jun 20 11:30:26 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Mon Jun 20 11:30:36 2016 -0700

--
 R/pkg/NAMESPACE   |  2 ++
 R/pkg/R/DataFrame.R   | 27 ++
 R/pkg/R/SQLContext.R  | 21 +++-
 R/pkg/R/generics.R|  4 
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 
 5 files changed, 74 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ead872e4/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index cc129a7..aaeab66 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -117,6 +117,7 @@ exportMethods("arrange",
   "write.df",
   "write.jdbc",
   "write.json",
+  "write.orc",
   "write.parquet",
   "write.text",
   "write.ml")
@@ -306,6 +307,7 @@ export("as.DataFrame",
"read.df",
"read.jdbc",
"read.json",
+   "read.orc",
"read.parquet",
"read.text",
"spark.lapply",

http://git-wip-us.apache.org/repos/asf/spark/blob/ead872e4/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ea091c8..f3a3eff 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -701,6 +701,33 @@ setMethod("write.json",
 invisible(callJMethod(write, "json", path))
   })
 
+#' Save the contents of SparkDataFrame as an ORC file, preserving the schema.
+#'
+#' Save the contents of a SparkDataFrame as an ORC file, preserving the 
schema. Files written out
+#' with this method can be read back in as a SparkDataFrame using read.orc().
+#'
+#' @param x A SparkDataFrame
+#' @param path The directory where the file is saved
+#'
+#' @family SparkDataFrame functions
+#' @rdname write.orc
+#' @name write.orc
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' write.orc(df, "/tmp/sparkr-tmp1/")
+#' }
+#' @note write.orc since 2.0.0
+setMethod("write.orc",
+  signature(x = "SparkDataFrame", path = "character"),
+  function(x, path) {
+write <- callJMethod(x@sdf, "write")
+invisible(callJMethod(write, "orc", path))
+  })
+
 #' Save the contents of SparkDataFrame as a Parquet file, preserving the 
schema.
 #'
 #' Save the contents of a SparkDataFrame as a Parquet file, preserving the 
schema. Files written out

http://git-wip-us.apache.org/repos/asf/spark/blob/ead872e4/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index b0ccc42..b7e1c06 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -330,6 +330,25 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, 
samplingRatio = 1.0) {
   }
 }
 
+#' Create a SparkDataFrame from an ORC file.
+#'
+#' Loads an ORC file, returning the result as a SparkDataFrame.
+#'
+#' @param path Path of file to read.
+#' @return SparkDataFrame
+#' @rdname read.orc
+#' @export
+#' @name read.orc
+#' @note read.orc since 2.0.0
+read.orc <- function(path) {
+  sparkSession <- getSparkSession()
+  # Allow the user to have a more flexible definition of the ORC file path
+  path <- suppressWarnings(normalizePath(path))
+  read <- callJMethod(sparkSession, "read")
+  sdf <- callJMethod(read, "orc", path)
+  dataFrame(sdf)
+}
+
 #' Create a SparkDataFrame from a Parquet file.
 #'
 #' Loads a Parquet file, returning the result as a SparkDataFrame.
@@ -343,7 +362,7 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, 
samplingRatio = 1.0) {
 
 read.parquet.default <- function(path) {
   sparkSession <- getSparkSession()
-  # Allow the user to have a more flexible 

spark git commit: [SPARK-16051][R] Add `read.orc/write.orc` to SparkR

2016-06-20 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 36e812d4b -> c44bf137c


[SPARK-16051][R] Add `read.orc/write.orc` to SparkR

## What changes were proposed in this pull request?

This issue adds `read.orc/write.orc` to SparkR for API parity.

## How was this patch tested?

Pass the Jenkins tests (with new testcases).

Author: Dongjoon Hyun 

Closes #13763 from dongjoon-hyun/SPARK-16051.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c44bf137
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c44bf137
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c44bf137

Branch: refs/heads/master
Commit: c44bf137c7ca649e0c504229eb3e6ff7955e9a53
Parents: 36e812d
Author: Dongjoon Hyun 
Authored: Mon Jun 20 11:30:26 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Mon Jun 20 11:30:26 2016 -0700

--
 R/pkg/NAMESPACE   |  2 ++
 R/pkg/R/DataFrame.R   | 27 ++
 R/pkg/R/SQLContext.R  | 21 +++-
 R/pkg/R/generics.R|  4 
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 
 5 files changed, 74 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c44bf137/R/pkg/NAMESPACE
--
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index cc129a7..aaeab66 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -117,6 +117,7 @@ exportMethods("arrange",
   "write.df",
   "write.jdbc",
   "write.json",
+  "write.orc",
   "write.parquet",
   "write.text",
   "write.ml")
@@ -306,6 +307,7 @@ export("as.DataFrame",
"read.df",
"read.jdbc",
"read.json",
+   "read.orc",
"read.parquet",
"read.text",
"spark.lapply",

http://git-wip-us.apache.org/repos/asf/spark/blob/c44bf137/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ea091c8..f3a3eff 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -701,6 +701,33 @@ setMethod("write.json",
 invisible(callJMethod(write, "json", path))
   })
 
+#' Save the contents of SparkDataFrame as an ORC file, preserving the schema.
+#'
+#' Save the contents of a SparkDataFrame as an ORC file, preserving the 
schema. Files written out
+#' with this method can be read back in as a SparkDataFrame using read.orc().
+#'
+#' @param x A SparkDataFrame
+#' @param path The directory where the file is saved
+#'
+#' @family SparkDataFrame functions
+#' @rdname write.orc
+#' @name write.orc
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' write.orc(df, "/tmp/sparkr-tmp1/")
+#' }
+#' @note write.orc since 2.0.0
+setMethod("write.orc",
+  signature(x = "SparkDataFrame", path = "character"),
+  function(x, path) {
+write <- callJMethod(x@sdf, "write")
+invisible(callJMethod(write, "orc", path))
+  })
+
 #' Save the contents of SparkDataFrame as a Parquet file, preserving the 
schema.
 #'
 #' Save the contents of a SparkDataFrame as a Parquet file, preserving the 
schema. Files written out

http://git-wip-us.apache.org/repos/asf/spark/blob/c44bf137/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index b0ccc42..b7e1c06 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -330,6 +330,25 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, 
samplingRatio = 1.0) {
   }
 }
 
+#' Create a SparkDataFrame from an ORC file.
+#'
+#' Loads an ORC file, returning the result as a SparkDataFrame.
+#'
+#' @param path Path of file to read.
+#' @return SparkDataFrame
+#' @rdname read.orc
+#' @export
+#' @name read.orc
+#' @note read.orc since 2.0.0
+read.orc <- function(path) {
+  sparkSession <- getSparkSession()
+  # Allow the user to have a more flexible definition of the ORC file path
+  path <- suppressWarnings(normalizePath(path))
+  read <- callJMethod(sparkSession, "read")
+  sdf <- callJMethod(read, "orc", path)
+  dataFrame(sdf)
+}
+
 #' Create a SparkDataFrame from a Parquet file.
 #'
 #' Loads a Parquet file, returning the result as a SparkDataFrame.
@@ -343,7 +362,7 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, 
samplingRatio = 1.0) {
 
 read.parquet.default <- function(path) {
   sparkSession <- getSparkSession()
-  # Allow the user to have a more flexible definition of the text file path
+  # Allow the user to have a more flexible definition of the Parquet file path
   pa