This is an automated email from the ASF dual-hosted git repository. thisisnic pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 878d5cac09 ARROW-15470: [R] Set null value in CSV writer (#14679) 878d5cac09 is described below commit 878d5cac09073cee72de000a7e8418ce8a3a31b8 Author: Will Jones <willjones...@gmail.com> AuthorDate: Mon Jan 9 04:19:15 2023 -0800 ARROW-15470: [R] Set null value in CSV writer (#14679) Authored-by: Will Jones <willjones...@gmail.com> Signed-off-by: Nic Crane <thisis...@gmail.com> --- r/R/csv.R | 29 +++++++++++++++++------- r/R/dataset-format.R | 32 +++++++++++++++++++++----- r/man/CsvReadOptions.Rd | 2 ++ r/man/write_csv_arrow.Rd | 4 ++++ r/src/csv.cpp | 1 + r/tests/testthat/_snaps/dataset-write.md | 2 +- r/tests/testthat/test-csv.R | 39 ++++++++++++++++++++++++++++++++ r/tests/testthat/test-dataset-csv.R | 22 ++++++++++++++++++ 8 files changed, 116 insertions(+), 15 deletions(-) diff --git a/r/R/csv.R b/r/R/csv.R index fef8723fb2..6f53a060f5 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -418,6 +418,8 @@ CsvTableReader$create <- function(file, #' The `CsvWriteOptions$create()` factory method takes the following arguments: #' - `include_header` Whether to write an initial header line with column names #' - `batch_size` Maximum number of rows processed at a time. Default is 1024. +#' - `null_string` The string to be written for null values. Must not contain +#' quotation marks. Default is an empty string (`""`). #' #' @section Active bindings: #' @@ -455,25 +457,32 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(), options } -readr_to_csv_write_options <- function(include_header, - batch_size = 1024L) { - assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0) - assert_that(is.logical(include_header)) +readr_to_csv_write_options <- function(include_header = TRUE, + batch_size = 1024L, + na = "") { CsvWriteOptions$create( include_header = include_header, - batch_size = as.integer(batch_size) + batch_size = batch_size, + null_string = na ) } #' @rdname CsvReadOptions #' @export CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject) -CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) { +CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, null_string = "") { assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0) + assert_that(is.logical(include_header)) + assert_that(is.character(null_string)) + assert_that(!is.na(null_string)) + assert_that(length(null_string) == 1) + assert_that(!grepl('"', null_string), msg = "na argument must not contain quote characters.") + csv___WriteOptions__initialize( list( include_header = include_header, - batch_size = as.integer(batch_size) + batch_size = as.integer(batch_size), + null_string = as.character(null_string) ) ) } @@ -665,6 +674,8 @@ readr_to_csv_convert_options <- function(na, #' @param col_names identical to `include_header`. Specify this or #' `include_headers`, not both. #' @param batch_size Maximum number of rows processed at a time. Default is 1024. +#' @param na value to write for NA values. Must not contain quote marks. Default +#' is `""`. #' @param write_options see [file reader options][CsvWriteOptions] #' @param ... additional parameters #' @@ -682,6 +693,7 @@ write_csv_arrow <- function(x, include_header = TRUE, col_names = NULL, batch_size = 1024L, + na = "", write_options = NULL, ...) { unsupported_passed_args <- names(list(...)) @@ -723,7 +735,8 @@ write_csv_arrow <- function(x, if (is.null(write_options)) { write_options <- readr_to_csv_write_options( include_header = include_header, - batch_size = batch_size + batch_size = batch_size, + na = na ) } diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index aacde187c4..c1d2730bb6 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -452,7 +452,10 @@ FileWriteOptions <- R6Class("FileWriteOptions", "null_fallback" ) } else if (format == "csv") { - supported_args <- names(formals(CsvWriteOptions$create)) + supported_args <- c( + names(formals(CsvWriteOptions$create)), + names(formals(readr_to_csv_write_options)) + ) } unsupported_passed_args <- setdiff(passed_args, supported_args) @@ -470,7 +473,7 @@ FileWriteOptions <- R6Class("FileWriteOptions", err_info <- NULL arg_info <- paste0( "Supported arguments: ", - oxford_paste(supported_args, quote_symbol = "`"), + oxford_paste(unique(supported_args), quote_symbol = "`"), "." ) if ("compression" %in% unsupported_passed_args) { @@ -505,10 +508,27 @@ FileWriteOptions <- R6Class("FileWriteOptions", ) } } else if (self$type == "csv") { - dataset___CsvFileWriteOptions__update( - self, - CsvWriteOptions$create(...) - ) + arrow_opts <- names(formals(CsvWriteOptions$create)) + readr_opts <- names(formals(readr_to_csv_write_options)) + readr_only_opts <- setdiff(readr_opts, arrow_opts) + + is_arrow_opt <- !is.na(pmatch(names(args), arrow_opts)) + is_readr_opt <- !is.na(pmatch(names(args), readr_opts)) + is_readr_only_opt <- !is.na(pmatch(names(args), readr_only_opts)) + + # These option names aren't mutually exclusive, so only use readr path + # if we have at least one readr-specific option. + if (sum(is_readr_only_opt)) { + dataset___CsvFileWriteOptions__update( + self, + do.call(readr_to_csv_write_options, args[is_readr_opt]) + ) + } else { + dataset___CsvFileWriteOptions__update( + self, + do.call(CsvWriteOptions$create, args[is_arrow_opt]) + ) + } } invisible(self) } diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd index a3cf2073ee..270d522b83 100644 --- a/r/man/CsvReadOptions.Rd +++ b/r/man/CsvReadOptions.Rd @@ -96,6 +96,8 @@ The \code{CsvWriteOptions$create()} factory method takes the following arguments \itemize{ \item \code{include_header} Whether to write an initial header line with column names \item \code{batch_size} Maximum number of rows processed at a time. Default is 1024. +\item \code{null_string} The string to be written for null values. Must not contain +quotation marks. Default is an empty string (\code{""}). } } diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd index c93c94fd8f..2b0d09ba74 100644 --- a/r/man/write_csv_arrow.Rd +++ b/r/man/write_csv_arrow.Rd @@ -11,6 +11,7 @@ write_csv_arrow( include_header = TRUE, col_names = NULL, batch_size = 1024L, + na = "", write_options = NULL, ... ) @@ -30,6 +31,9 @@ system (\code{SubTreeFileSystem})} \item{batch_size}{Maximum number of rows processed at a time. Default is 1024.} +\item{na}{value to write for NA values. Must not contain quote marks. Default +is \code{""}.} + \item{write_options}{see \link[=CsvWriteOptions]{file reader options}} \item{...}{additional parameters} diff --git a/r/src/csv.cpp b/r/src/csv.cpp index 7747369300..3f880cae16 100644 --- a/r/src/csv.cpp +++ b/r/src/csv.cpp @@ -32,6 +32,7 @@ std::shared_ptr<arrow::csv::WriteOptions> csv___WriteOptions__initialize( res->include_header = cpp11::as_cpp<bool>(options["include_header"]); res->batch_size = cpp11::as_cpp<int>(options["batch_size"]); res->io_context = MainRThread::GetInstance().CancellableIOContext(); + res->null_string = cpp11::as_cpp<std::string>(options["null_string"]); return res; } diff --git a/r/tests/testthat/_snaps/dataset-write.md b/r/tests/testthat/_snaps/dataset-write.md index cea2a30a29..e302d8463d 100644 --- a/r/tests/testthat/_snaps/dataset-write.md +++ b/r/tests/testthat/_snaps/dataset-write.md @@ -42,7 +42,7 @@ Condition Error in `check_additional_args()`: ! `nonsensical_arg` is not a valid argument for your chosen `format`. - i Supported arguments: `include_header` and `batch_size`. + i Supported arguments: `include_header`, `batch_size`, `null_string`, and `na`. --- diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index 6033253517..ee290e3241 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -422,6 +422,45 @@ test_that("Write a CSV file with invalid batch size", { ) }) +test_that("Write a CSV with custom NA value", { + tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, na = "NULL_VALUE") + expect_true(file.exists(csv_file)) + expect_identical(tbl_out1, tbl_no_dates) + + csv_contents <- readLines(csv_file) + expect_true(any(grepl("NULL_VALUE", csv_contents))) + + tbl_in1 <- read_csv_arrow(csv_file, na = "NULL_VALUE") + expect_identical(tbl_in1, tbl_no_dates) + + # Also can use null_value in CsvWriteOptions + tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, + write_options = CsvWriteOptions$create(null_string = "another_null") + ) + csv_contents <- readLines(csv_file) + expect_true(any(grepl("another_null", csv_contents))) + + tbl_in1 <- read_csv_arrow(csv_file, na = "another_null") + expect_identical(tbl_in1, tbl_no_dates) + + # Also can use empty string + write_csv_arrow(tbl_no_dates, csv_file, na = "") + expect_true(file.exists(csv_file)) + + csv_contents <- readLines(csv_file) + expect_true(any(grepl(",,", csv_contents))) + + tbl_in1 <- read_csv_arrow(csv_file) + expect_identical(tbl_in1, tbl_no_dates) +}) + +test_that("Write a CSV file with invalid null value", { + expect_error( + write_csv_arrow(tbl_no_dates, csv_file, na = "MY\"VAL"), + regexp = "must not contain quote characters" + ) +}) + test_that("time mapping work as expected (ARROW-13624)", { tbl <- tibble::tibble( dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"), diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 29e95f509c..3f85c89e45 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -273,6 +273,28 @@ test_that("readr parse options", { ) }) +test_that("Can set null string values", { + dst_dir <- make_temp_dir() + df <- tibble(x = c(1, NA, 3)) + write_dataset(df, dst_dir, null_string = "NULL_VALUE", format = "csv") + + csv_contents <- readLines(list.files(dst_dir, full.names = TRUE)[1]) + expect_equal(csv_contents, c("\"x\"", "1", "NULL_VALUE", "3")) + + back <- open_dataset(dst_dir, null_values = "NULL_VALUE", format = "csv") %>% collect() + expect_equal(df, back) + + # Also works with `na` parameter + dst_dir <- make_temp_dir() + write_dataset(df, dst_dir, na = "another_null", format = "csv") + + csv_contents <- readLines(list.files(dst_dir, full.names = TRUE)[1]) + expect_equal(csv_contents, c("\"x\"", "1", "another_null", "3")) + + back <- open_dataset(dst_dir, null_values = "another_null", format = "csv") %>% collect() + expect_equal(df, back) +}) + # see https://issues.apache.org/jira/browse/ARROW-12791 test_that("Error if no format specified and files are not parquet", { expect_error(