spark git commit: [SPARK-16310][SPARKR] R na.string-like default for csv source
Repository: spark Updated Branches: refs/heads/master 28710b42b -> f4767bcc7 [SPARK-16310][SPARKR] R na.string-like default for csv source ## What changes were proposed in this pull request? Apply default "NA" as null string for R, like R read.csv na.string parameter. https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html na.strings = "NA" A user passing a csv file with NA value should get the same behavior with SparkR read.df(... source = "csv") (couldn't open JIRA, will do that later) ## How was this patch tested? unit tests shivaram Author: Felix Cheung Closes #13984 from felixcheung/rcsvnastring. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4767bcc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4767bcc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4767bcc Branch: refs/heads/master Commit: f4767bcc7a9d1bdd301f054776aa45e7c9f344a7 Parents: 28710b4 Author: Felix Cheung Authored: Thu Jul 7 15:21:57 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 7 15:21:57 2016 -0700 -- R/pkg/R/SQLContext.R | 10 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 32 +- 2 files changed, 34 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4767bcc/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 8df73db..bc0daa2 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -714,11 +714,14 @@ dropTempView <- function(viewName) { #' #' The data source is specified by the `source` and a set of options(...). #' If `source` is not specified, the default data source configured by -#' "spark.sql.sources.default" will be used. +#' "spark.sql.sources.default" will be used. \cr +#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" will be interpreted +#' as NA.
#' #' @param path The path of files to load #' @param source The name of external data source #' @param schema The data schema defined in structType +#' @param na.strings Default string value for NA when source is "csv" #' @return SparkDataFrame #' @rdname read.df #' @name read.df @@ -735,7 +738,7 @@ dropTempView <- function(viewName) { #' @name read.df #' @method read.df default #' @note read.df since 1.4.0 -read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) { +read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) { sparkSession <- getSparkSession() options <- varargsToEnv(...) if (!is.null(path)) { @@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) { if (is.null(source)) { source <- getDefaultSqlSource() } + if (source == "csv" && is.null(options[["nullValue"]])) { +options[["nullValue"]] <- na.strings + } if (!is.null(schema)) { stopifnot(class(schema) == "structType") sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, source, http://git-wip-us.apache.org/repos/asf/spark/blob/f4767bcc/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index a3aa26d..a0ab719 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -213,15 +213,35 @@ test_that("read csv as DataFrame", { mockLinesCsv <- c("year,make,model,comment,blank", "\"2012\",\"Tesla\",\"S\",\"No comment\",", "1997,Ford,E350,\"Go get one now they are going fast\",", - "2015,Chevy,Volt") + "2015,Chevy,Volt", + "NA,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - # default "header" is false - df <- read.df(csvPath, "csv", header = "true") - expect_equal(count(df), 3) + # default "header" is false, inferSchema to handle "year" as "int" + df <- read.df(csvPath, "csv", header = "true", inferSchema = "true") + 
expect_equal(count(df), 4) expect_equal(columns(df), c("year", "make", "model", "comment", "blank")) - expect_equal(sort(unlist(collect(where(df, df$year == "2015", - sort(unlist(list(year = "2015", make = "Chevy", model = "Volt" + expect_equal(sort(unlist(collect(where(df, df$year == 2015, + sort(unlist(list(year = 2015, make = "Chevy", model = "Volt" + + # since "year" is "int", let's skip the NA values + withoutna <- na.omit(df, how = "any", cols = "year") +
spark git commit: [SPARK-16310][SPARKR] R na.string-like default for csv source
Repository: spark Updated Branches: refs/heads/branch-2.0 30cb3f1d3 -> 5828da41c [SPARK-16310][SPARKR] R na.string-like default for csv source ## What changes were proposed in this pull request? Apply default "NA" as null string for R, like R read.csv na.string parameter. https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html na.strings = "NA" A user passing a csv file with NA value should get the same behavior with SparkR read.df(... source = "csv") (couldn't open JIRA, will do that later) ## How was this patch tested? unit tests shivaram Author: Felix Cheung Closes #13984 from felixcheung/rcsvnastring. (cherry picked from commit f4767bcc7a9d1bdd301f054776aa45e7c9f344a7) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5828da41 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5828da41 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5828da41 Branch: refs/heads/branch-2.0 Commit: 5828da41cb2d815708191bd9a5cf3bd82795aa41 Parents: 30cb3f1 Author: Felix Cheung Authored: Thu Jul 7 15:21:57 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 7 15:22:06 2016 -0700 -- R/pkg/R/SQLContext.R | 10 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 32 +- 2 files changed, 34 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5828da41/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 8df73db..bc0daa2 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -714,11 +714,14 @@ dropTempView <- function(viewName) { #' #' The data source is specified by the `source` and a set of options(...). #' If `source` is not specified, the default data source configured by -#' "spark.sql.sources.default" will be used. +#' "spark.sql.sources.default" will be used. \cr +#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" will be interpreted +#' as NA.
#' #' @param path The path of files to load #' @param source The name of external data source #' @param schema The data schema defined in structType +#' @param na.strings Default string value for NA when source is "csv" #' @return SparkDataFrame #' @rdname read.df #' @name read.df @@ -735,7 +738,7 @@ dropTempView <- function(viewName) { #' @name read.df #' @method read.df default #' @note read.df since 1.4.0 -read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) { +read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) { sparkSession <- getSparkSession() options <- varargsToEnv(...) if (!is.null(path)) { @@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) { if (is.null(source)) { source <- getDefaultSqlSource() } + if (source == "csv" && is.null(options[["nullValue"]])) { +options[["nullValue"]] <- na.strings + } if (!is.null(schema)) { stopifnot(class(schema) == "structType") sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, source, http://git-wip-us.apache.org/repos/asf/spark/blob/5828da41/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index d22baf6..003fcce 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -213,15 +213,35 @@ test_that("read csv as DataFrame", { mockLinesCsv <- c("year,make,model,comment,blank", "\"2012\",\"Tesla\",\"S\",\"No comment\",", "1997,Ford,E350,\"Go get one now they are going fast\",", - "2015,Chevy,Volt") + "2015,Chevy,Volt", + "NA,Dummy,Placeholder") writeLines(mockLinesCsv, csvPath) - # default "header" is false - df <- read.df(csvPath, "csv", header = "true") - expect_equal(count(df), 3) + # default "header" is false, inferSchema to handle "year" as "int" + df <- read.df(csvPath, "csv", header = "true", inferSchema = "true") + 
expect_equal(count(df), 4) expect_equal(columns(df), c("year", "make", "model", "comment", "blank")) - expect_equal(sort(unlist(collect(where(df, df$year == "2015", - sort(unlist(list(year = "2015", make = "Chevy", model = "Volt" + expect_equal(sort(unlist(collect(where(df, df$year == 2015, + sort(unlist(list(year = 2015, make = "Chevy", model =