[arrow] branch main updated: GH-37994: [R] Create wrapper functions for the CSV*Options classes (#37995)

thisisnic Mon, 09 Oct 2023 01:57:33 -0700

This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 293819c48b GH-37994: [R] Create wrapper functions for the CSV*Options classes  (#37995)
293819c48b is described below

commit 293819c48b06c840ce57da25836ef911c5786398
Author: Nic Crane <[email protected]>
AuthorDate: Mon Oct 9 10:56:53 2023 +0200

    GH-37994: [R] Create wrapper functions for the CSV*Options classes  (#37995)
    
    ### Rationale for this change
    
    It's hard to find the docs for the R6 objects for the CSV reading/writing etc options classes
    
    ### What changes are included in this PR?
    
    Create wrapper functions, which are more easily documented
    
    ### Are these changes tested?
    
    Yep, I've swapped some existing tests to using the wrappers
    
    ### Are there any user-facing changes?
    
    Yes
    * Closes: #37994
    
    Authored-by: Nic Crane <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 r/NAMESPACE                         |   4 +
 r/R/csv.R                           | 264 ++++++++++++++++++++++++++----------
 r/R/dataset-format.R                |  46 +++----
 r/R/dataset.R                       |  12 +-
 r/_pkgdown.yml                      |   5 +
 r/man/CsvReadOptions.Rd             |   5 +
 r/man/csv_convert_options.Rd        |  69 ++++++++++
 r/man/csv_parse_options.Rd          |  46 +++++++
 r/man/csv_read_options.Rd           |  51 +++++++
 r/man/csv_write_options.Rd          |  38 ++++++
 r/man/open_delim_dataset.Rd         |  17 ++-
 r/man/read_delim_arrow.Rd           |   6 +-
 r/man/write_csv_arrow.Rd            |   2 +-
 r/tests/testthat/test-csv.R         |  10 +-
 r/tests/testthat/test-dataset-csv.R |  22 ++-
 15 files changed, 475 insertions(+), 122 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index d49255f781..b675952d01 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -308,6 +308,10 @@ export(contains)
 export(copy_files)
 export(cpu_count)
 export(create_package_with_all_dependencies)
+export(csv_convert_options)
+export(csv_parse_options)
+export(csv_read_options)
+export(csv_write_options)
 export(dataset_factory)
 export(date32)
 export(date64)
diff --git a/r/R/csv.R b/r/R/csv.R
index 116c620f83..e68a05720d 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -122,11 +122,11 @@
 #'  - `NULL`: the default, which uses the ISO-8601 parser
 #'  - a character vector of [strptime][base::strptime()] parse strings
 #'  - a list of [TimestampParser] objects
-#' @param parse_options see [file reader options][CsvReadOptions].
+#' @param parse_options see [CSV parsing options][csv_parse_options()].
 #' If given, this overrides any
 #' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
-#' @param convert_options see [file reader options][CsvReadOptions]
-#' @param read_options see [file reader options][CsvReadOptions]
+#' @param convert_options see [CSV conversion options][csv_convert_options()]
+#' @param read_options see [CSV reading options][csv_read_options()]
 #' @param as_data_frame Should the function return a `tibble` (default) or
 #' an Arrow [Table]?
 #'
@@ -337,22 +337,22 @@ CsvTableReader <- R6Class("CsvTableReader",
   )
 )
 CsvTableReader$create <- function(file,
-                                  read_options = CsvReadOptions$create(),
-                                  parse_options = CsvParseOptions$create(),
-                                  convert_options = CsvConvertOptions$create(),
+                                  read_options = csv_read_options(),
+                                  parse_options = csv_parse_options(),
+                                  convert_options = csv_convert_options(),
                                   ...) {
   assert_is(file, "InputStream")
 
   if (is.list(read_options)) {
-    read_options <- do.call(CsvReadOptions$create, read_options)
+    read_options <- do.call(csv_read_options, read_options)
   }
 
   if (is.list(parse_options)) {
-    parse_options <- do.call(CsvParseOptions$create, parse_options)
+    parse_options <- do.call(csv_parse_options, parse_options)
   }
 
   if (is.list(convert_options)) {
-    convert_options <- do.call(CsvConvertOptions$create, convert_options)
+    convert_options <- do.call(csv_convert_options, convert_options)
   }
 
   if (!(tolower(read_options$encoding) %in% c("utf-8", "utf8"))) {
@@ -362,6 +362,58 @@ CsvTableReader$create <- function(file,
   csv___TableReader__Make(file, read_options, parse_options, convert_options)
 }
 
+#' CSV Reading Options
+#'
+#' @param use_threads Whether to use the global CPU thread pool
+#' @param block_size Block size we request from the IO layer; also determines
+#'  the size of chunks when use_threads is `TRUE`.
+#' @param skip_rows Number of lines to skip before reading data (default 0).
+#' @param column_names Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names, unless `autogenerate_column_names` is `TRUE`.
+#' @param autogenerate_column_names Logical: generate column names instead of
+#' using the first non-skipped row (the default)? If `TRUE`, column names will
+#' be "f0", "f1", ..., "fN".
+#' @param encoding The file encoding. (default `"UTF-8"`)
+#' @param skip_rows_after_names Number of lines to skip after the column names (default 0).
+#'    This number can be larger than the number of rows in one block, and empty rows are counted.
+#'    The order of application is as follows:
+#'      - `skip_rows` is applied (if non-zero);
+#'      - column names are read (unless `column_names` is set);
+#'      - `skip_rows_after_names` is applied (if non-zero).
+#'
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines("my file has a non-data header\nx\n1\n2", tf)
+#' read_csv_arrow(tf, read_options = csv_read_options(skip_rows = 1))
+#' open_csv_dataset(tf, read_options = csv_read_options(skip_rows = 1))
+#' @export
+csv_read_options <- function(use_threads = option_use_threads(),
+                             block_size = 1048576L,
+                             skip_rows = 0L,
+                             column_names = character(0),
+                             autogenerate_column_names = FALSE,
+                             encoding = "UTF-8",
+                             skip_rows_after_names = 0L) {
+  assert_that(is.string(encoding))
+
+  options <- csv___ReadOptions__initialize(
+    list(
+      use_threads = use_threads,
+      block_size = block_size,
+      skip_rows = skip_rows,
+      skip_rows_after_names = skip_rows_after_names,
+      column_names = column_names,
+      autogenerate_column_names = autogenerate_column_names
+    )
+  )
+
+  options$encoding <- encoding
+
+  options
+}
+
 #' @title File reader options
 #' @rdname CsvReadOptions
 #' @name CsvReadOptions
@@ -455,6 +507,11 @@ CsvTableReader$create <- function(file,
 #' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
 #' - `null_string` The string to be written for null values. Must not contain
 #'   quotation marks. Default is an empty string (`""`).
+#' - `eol` The end of line character to use for ending rows.
+#' - `delimiter` Field delimiter
+#' - `quoting_style` Quoting style: "Needed" (Only enclose values in quotes which need them, because their CSV
+#'    rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+#'    quotes), or "None" (Do not enclose any values in quotes).
 #'
 #' @section Active bindings:
 #'
@@ -485,30 +542,8 @@ CsvReadOptions <- R6Class("CsvReadOptions",
     skip_rows_after_names = function() csv___ReadOptions__skip_rows_after_names(self)
   )
 )
-CsvReadOptions$create <- function(use_threads = option_use_threads(),
-                                  block_size = 1048576L,
-                                  skip_rows = 0L,
-                                  column_names = character(0),
-                                  autogenerate_column_names = FALSE,
-                                  encoding = "UTF-8",
-                                  skip_rows_after_names = 0L) {
-  assert_that(is.string(encoding))
 
-  options <- csv___ReadOptions__initialize(
-    list(
-      use_threads = use_threads,
-      block_size = block_size,
-      skip_rows = skip_rows,
-      skip_rows_after_names = skip_rows_after_names,
-      column_names = column_names,
-      autogenerate_column_names = autogenerate_column_names
-    )
-  )
-
-  options$encoding <- encoding
-
-  options
-}
+CsvReadOptions$create <- csv_read_options
 
 readr_to_csv_write_options <- function(col_names = TRUE,
                                        batch_size = 1024L,
@@ -520,7 +555,7 @@ readr_to_csv_write_options <- function(col_names = TRUE,
   quote <- match(match.arg(quote), c("needed", "all", "none"))
   quote <- quoting_style_arrow_opts[quote]
 
-  CsvWriteOptions$create(
+  csv_write_options(
     include_header = col_names,
     batch_size = batch_size,
     delimiter = delim,
@@ -530,15 +565,28 @@ readr_to_csv_write_options <- function(col_names = TRUE,
   )
 }
 
-#' @rdname CsvReadOptions
+#' CSV Writing Options
+#'
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time.
+#' @param null_string The string to be written for null values. Must not contain quotation marks.
+#' @param delimiter Field delimiter
+#' @param eol The end of line character to use for ending rows
+#' @param quoting_style How to handle quotes. "Needed" (Only enclose values in quotes which need them, because their CSV
+#'    rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+#'    quotes), or "None" (Do not enclose any values in quotes).
+#'
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(airquality, tf, write_options = csv_write_options(null_string = "-99"))
 #' @export
-CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE,
-                                   batch_size = 1024L,
-                                   null_string = "",
-                                   delimiter = ",",
-                                   eol = "\n",
-                                   quoting_style = c("Needed", "AllValid", "None")) {
+csv_write_options <- function(include_header = TRUE,
+                              batch_size = 1024L,
+                              null_string = "",
+                              delimiter = ",",
+                              eol = "\n",
+                              quoting_style = c("Needed", "AllValid", "None")) {
   quoting_style <- match.arg(quoting_style)
   quoting_style_opts <- c("Needed", "AllValid", "None")
   quoting_style <- match(quoting_style, quoting_style_opts) - 1L
@@ -564,32 +612,50 @@ CsvWriteOptions$create <- function(include_header = TRUE,
   )
 }
 
+#' @rdname CsvReadOptions
+#' @export
+CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
+CsvWriteOptions$create <- csv_write_options
+
 readr_to_csv_read_options <- function(skip = 0, col_names = TRUE) {
   if (isTRUE(col_names)) {
     # C++ default to parse is 0-length string array
     col_names <- character(0)
   }
   if (identical(col_names, FALSE)) {
-    CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE)
+    csv_read_options(skip_rows = skip, autogenerate_column_names = TRUE)
   } else {
-    CsvReadOptions$create(skip_rows = skip, column_names = col_names)
+    csv_read_options(skip_rows = skip, column_names = col_names)
   }
 }
 
-#' @rdname CsvReadOptions
-#' @usage NULL
-#' @format NULL
-#' @docType class
+#' CSV Parsing Options
+#'
+#' @param delimiter Field delimiting character
+#' @param quoting Logical: are strings quoted?
+#' @param quote_char Quoting character, if `quoting` is `TRUE`
+#' @param double_quote Logical: are quotes inside values double-quoted?
+#' @param escaping Logical: whether escaping is used
+#' @param escape_char Escaping character, if `escaping` is `TRUE`
+#' @param newlines_in_values Logical: are values allowed to contain CR (`0x0d`)
+#'    and LF (`0x0a`) characters?
+#' @param ignore_empty_lines Logical: should empty lines be ignored (default) or
+#'    generate a row of missing values (if `FALSE`)?
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines("x\n1\n\n2", tf)
+#' read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+#' open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
 #' @export
-CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
-CsvParseOptions$create <- function(delimiter = ",",
-                                   quoting = TRUE,
-                                   quote_char = '"',
-                                   double_quote = TRUE,
-                                   escaping = FALSE,
-                                   escape_char = "\\",
-                                   newlines_in_values = FALSE,
-                                   ignore_empty_lines = TRUE) {
+csv_parse_options <- function(delimiter = ",",
+                              quoting = TRUE,
+                              quote_char = '"',
+                              double_quote = TRUE,
+                              escaping = FALSE,
+                              escape_char = "\\",
+                              newlines_in_values = FALSE,
+                              ignore_empty_lines = TRUE) {
   csv___ParseOptions__initialize(
     list(
       delimiter = delimiter,
@@ -604,6 +670,14 @@ CsvParseOptions$create <- function(delimiter = ",",
   )
 }
 
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
+CsvParseOptions$create <- csv_parse_options
+
 readr_to_csv_parse_options <- function(delim = ",",
                                        quote = '"',
                                        escape_double = TRUE,
@@ -611,7 +685,7 @@ readr_to_csv_parse_options <- function(delim = ",",
                                        skip_empty_rows = TRUE) {
   # This function translates from the readr argument list to the arrow arg names
   # TODO: validate inputs
-  CsvParseOptions$create(
+  csv_parse_options(
     delimiter = delim,
     quoting = nzchar(quote),
     quote_char = quote,
@@ -643,23 +717,55 @@ TimestampParser$create <- function(format = NULL) {
   }
 }
 
-#' @rdname CsvReadOptions
-#' @usage NULL
-#' @format NULL
-#' @docType class
+
+#' CSV Convert Options
+#'
+#' @param check_utf8 Logical: check UTF8 validity of string columns?
+#' @param null_values Character vector of recognized spellings for null values.
+#'    Analogous to the `na.strings` argument to
+#'    [`read.csv()`][utils::read.csv()] or `na` in [readr::read_csv()].
+#' @param strings_can_be_null Logical: can string / binary columns have
+#'    null values? Similar to the `quoted_na` argument to [readr::read_csv()]
+#' @param true_values Character vector of recognized spellings for `TRUE` values
+#' @param false_values Character vector of recognized spellings for `FALSE` values
+#' @param col_types A `Schema` or `NULL` to infer types
+#' @param auto_dict_encode Logical: Whether to try to automatically
+#'    dictionary-encode string / binary data (think `stringsAsFactors`).
+#'    This setting is ignored for non-inferred columns (those in `col_types`).
+#' @param auto_dict_max_cardinality If `auto_dict_encode`, string/binary columns
+#'    are dictionary-encoded up to this number of unique values (default 50),
+#'    after which it switches to regular encoding.
+#' @param include_columns If non-empty, indicates the names of columns from the
+#'    CSV file that should be actually read and converted (in the vector's order).
+#' @param include_missing_columns Logical: if `include_columns` is provided, should
+#'    columns named in it but not found in the data be included as a column of
+#'    type `null()`? The default (`FALSE`) means that the reader will instead
+#'    raise an error.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#'    parser is specified, the CSV conversion logic will try parsing values
+#'    starting from the beginning of this vector. Possible values are
+#'    (a) `NULL`, the default, which uses the ISO-8601 parser;
+#'    (b) a character vector of [strptime][base::strptime()] parse strings; or
+#'    (c) a list of [TimestampParser] objects.
+#'
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines("x\n1\nNULL\n2\nNA", tf)
+#' read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+#' open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
 #' @export
-CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
-CsvConvertOptions$create <- function(check_utf8 = TRUE,
-                                     null_values = c("", "NA"),
-                                     true_values = c("T", "true", "TRUE"),
-                                     false_values = c("F", "false", "FALSE"),
-                                     strings_can_be_null = FALSE,
-                                     col_types = NULL,
-                                     auto_dict_encode = FALSE,
-                                     auto_dict_max_cardinality = 50L,
-                                     include_columns = character(),
-                                     include_missing_columns = FALSE,
-                                     timestamp_parsers = NULL) {
+csv_convert_options <- function(check_utf8 = TRUE,
+                                null_values = c("", "NA"),
+                                true_values = c("T", "true", "TRUE"),
+                                false_values = c("F", "false", "FALSE"),
+                                strings_can_be_null = FALSE,
+                                col_types = NULL,
+                                auto_dict_encode = FALSE,
+                                auto_dict_max_cardinality = 50L,
+                                include_columns = character(),
+                                include_missing_columns = FALSE,
+                                timestamp_parsers = NULL) {
   if (!is.null(col_types) && !inherits(col_types, "Schema")) {
     abort(c(
       "Unsupported `col_types` specification.",
@@ -684,6 +790,14 @@ CsvConvertOptions$create <- function(check_utf8 = TRUE,
   )
 }
 
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
+CsvConvertOptions$create <- csv_convert_options
+
 readr_to_csv_convert_options <- function(na,
                                          quoted_na,
                                          col_types = NULL,
@@ -732,7 +846,7 @@ readr_to_csv_convert_options <- function(na,
       include_columns <- setdiff(col_names, names(col_types)[nulls])
     }
   }
-  CsvConvertOptions$create(
+  csv_convert_options(
     null_values = na,
     strings_can_be_null = quoted_na,
     col_types = col_types,
@@ -753,7 +867,7 @@ readr_to_csv_convert_options <- function(na,
 #' @param batch_size Maximum number of rows processed at a time. Default is 1024.
 #' @param na value to write for NA values. Must not contain quote marks. Default
 #'     is `""`.
-#' @param write_options see [file reader options][CsvWriteOptions]
+#' @param write_options see [CSV write options][csv_write_options]
 #' @param ... additional parameters
 #'
 #' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index cdaaf08827..c25a505f89 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -217,19 +217,19 @@ check_csv_file_format_args <- function(args, partitioning = NULL) {
   if (is.null(args$parse_options)) {
     options$parse_options <- do.call(csv_file_format_parse_opts, args)
   } else if (is.list(args$parse_options)) {
-    options$parse_options <- do.call(CsvParseOptions$create, args$parse_options)
+    options$parse_options <- do.call(csv_parse_options, args$parse_options)
   }
 
   if (is.null(args$convert_options)) {
     options$convert_options <- do.call(csv_file_format_convert_opts, args)
   } else if (is.list(args$convert_options)) {
-    options$convert_options <- do.call(CsvConvertOptions$create, args$convert_options)
+    options$convert_options <- do.call(csv_convert_options, args$convert_options)
   }
 
   if (is.null(args$read_options)) {
     options$read_options <- do.call(csv_file_format_read_opts, c(args, list(partitioning = partitioning)))
   } else if (is.list(args$read_options)) {
-    options$read_options <- do.call(CsvReadOptions$create, args$read_options)
+    options$read_options <- do.call(csv_read_options, args$read_options)
   }
 
   options
@@ -239,16 +239,16 @@ check_unsupported_args <- function(args) {
   opt_names <- get_opt_names(args)
 
   # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
-  supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", 
"quoted_na")
+  supported_convert_opts <- c(names(formals(csv_convert_options)), "na", 
"quoted_na")
 
   supported_read_opts <- c(
-    names(formals(CsvReadOptions$create)),
+    names(formals(csv_read_options)),
     names(formals(readr_to_csv_read_options))
   )
 
   # We only currently support all of the readr options for parseoptions
   supported_parse_opts <- c(
-    names(formals(CsvParseOptions$create)),
+    names(formals(csv_parse_options)),
     names(formals(readr_to_csv_parse_options))
   )
 
@@ -303,9 +303,9 @@ check_unrecognised_args <- function(opts) {
   opt_names <- get_opt_names(opts)
 
   arrow_opts <- c(
-    names(formals(CsvParseOptions$create)),
-    names(formals(CsvReadOptions$create)),
-    names(formals(CsvConvertOptions$create)),
+    names(formals(csv_parse_options)),
+    names(formals(csv_read_options)),
+    names(formals(csv_convert_options)),
     "schema"
   )
 
@@ -395,9 +395,9 @@ check_schema <- function(schema, partitioning, 
column_names) {
 csv_file_format_parse_opts <- function(...) {
   opts <- list(...)
   # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
-  convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", 
"quoted_na", "convert_options")
+  convert_opts <- c(names(formals(csv_convert_options)), "na", "quoted_na", 
"convert_options")
   read_opts <- c(
-    names(formals(CsvReadOptions$create)),
+    names(formals(csv_read_options)),
     names(formals(readr_to_csv_read_options)),
     "read_options"
   )
@@ -407,7 +407,7 @@ csv_file_format_parse_opts <- function(...) {
   opts[["parse_options"]] <- NULL
   opt_names <- get_opt_names(opts)
 
-  arrow_opts <- c(names(formals(CsvParseOptions$create)))
+  arrow_opts <- c(names(formals(csv_parse_options)))
   readr_opts <- c(names(formals(readr_to_csv_parse_options)))
 
   is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
@@ -427,17 +427,17 @@ csv_file_format_parse_opts <- function(...) {
     }
     do.call(readr_to_csv_parse_options, opts) # all options have readr-style 
names
   } else {
-    do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names
+    do.call(csv_parse_options, opts) # all options have Arrow C++ names
   }
 }
 
 csv_file_format_convert_opts <- function(...) {
   opts <- list(...)
   # Filter out arguments meant for CsvParseOptions/CsvReadOptions
-  arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
+  arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
   readr_opts <- names(formals(readr_to_csv_parse_options))
   read_opts <- c(
-    names(formals(CsvReadOptions$create)),
+    names(formals(csv_read_options)),
     names(formals(readr_to_csv_read_options)),
     "read_options"
   )
@@ -458,23 +458,23 @@ csv_file_format_convert_opts <- function(...) {
     opts[["quoted_na"]] <- NULL
   }
 
-  do.call(CsvConvertOptions$create, opts)
+  do.call(csv_convert_options, opts)
 }
 
 csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...) 
{
 
   opts <- list(...)
   # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
-  arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
+  arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
   readr_opts <- names(formals(readr_to_csv_parse_options))
-  convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", 
"quoted_na", "convert_options")
+  convert_opts <- c(names(formals(csv_convert_options)), "na", "quoted_na", 
"convert_options")
   opts[arrow_opts] <- NULL
   opts[readr_opts] <- NULL
   opts[convert_opts] <- NULL
   opts[["read_options"]] <- NULL
 
   opt_names <- names(opts)
-  arrow_opts <- c(names(formals(CsvReadOptions$create)))
+  arrow_opts <- c(names(formals(csv_read_options)))
   readr_opts <- c(names(formals(readr_to_csv_read_options)))
 
   is_arrow_opt <- !is.na(match(opt_names, arrow_opts))
@@ -505,7 +505,7 @@ csv_file_format_read_opts <- function(schema = NULL, 
partitioning = NULL, ...) {
     }
     do.call(readr_to_csv_read_options, opts) # all options have readr-style 
names
   } else {
-    do.call(CsvReadOptions$create, opts) # all options have Arrow C++ names
+    do.call(csv_read_options, opts) # all options have Arrow C++ names
   }
 }
 
@@ -648,7 +648,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
           )
         } else if (format %in% c("csv", "tsv", "txt", "text")) {
           supported_args <- c(
-            names(formals(CsvWriteOptions$create)),
+            names(formals(csv_write_options)),
             names(formals(readr_to_csv_write_options))
           )
         }
@@ -703,7 +703,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
           )
         }
       } else if (self$type %in% c("csv", "tsv", "txt", "text")) {
-        arrow_opts <- names(formals(CsvWriteOptions$create))
+        arrow_opts <- names(formals(csv_write_options))
         readr_opts <- names(formals(readr_to_csv_write_options))
         readr_only_opts <- setdiff(readr_opts, arrow_opts)
         arrow_only_opts <- setdiff(arrow_opts, readr_opts)
@@ -727,7 +727,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
         } else {
           dataset___CsvFileWriteOptions__update(
             self,
-            do.call(CsvWriteOptions$create, args[is_arrow_opt])
+            do.call(csv_write_options, args[is_arrow_opt])
           )
         }
       }
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 9d91839c22..90e6516927 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -276,7 +276,8 @@ open_delim_dataset <- function(sources,
                                convert_options = NULL,
                                read_options = NULL,
                                timestamp_parsers = NULL,
-                               quoted_na = TRUE) {
+                               quoted_na = TRUE,
+                               parse_options = NULL) {
   open_dataset(
     sources = sources,
     schema = schema,
@@ -297,7 +298,8 @@ open_delim_dataset <- function(sources,
     convert_options = convert_options,
     read_options = read_options,
     timestamp_parsers = timestamp_parsers,
-    quoted_na = quoted_na
+    quoted_na = quoted_na,
+    parse_options = parse_options
   )
 }
 
@@ -320,7 +322,8 @@ open_csv_dataset <- function(sources,
                              convert_options = NULL,
                              read_options = NULL,
                              timestamp_parsers = NULL,
-                             quoted_na = TRUE) {
+                             quoted_na = TRUE,
+                             parse_options = NULL) {
   mc <- match.call()
   mc$delim <- ","
   mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
@@ -346,7 +349,8 @@ open_tsv_dataset <- function(sources,
                              convert_options = NULL,
                              read_options = NULL,
                              timestamp_parsers = NULL,
-                             quoted_na = TRUE) {
+                             quoted_na = TRUE,
+                             parse_options = NULL) {
   mc <- match.call()
   mc$delim <- "\t"
   mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 57c21e3c0f..84111e599c 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -146,6 +146,10 @@ reference:
   - open_delim_dataset
   - open_csv_dataset
   - open_tsv_dataset
+  - csv_read_options
+  - csv_parse_options
+  - csv_convert_options
+
 
 - title: Write datasets
   desc: >
@@ -155,6 +159,7 @@ reference:
   - write_delim_dataset
   - write_csv_dataset
   - write_tsv_dataset
+  - csv_write_options
 
 - title: Read files
   desc: >
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 6ebb235518..32742280cc 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -106,6 +106,11 @@ The \code{CsvWriteOptions$create()} factory method takes 
the following arguments
 \item \code{batch_size} Maximum number of rows processed at a time. Default is 
1024.
 \item \code{null_string} The string to be written for null values. Must not 
contain
 quotation marks. Default is an empty string (\code{""}).
+\item \code{eol} The end of line character to use for ending rows.
+\item \code{delimiter} Field delimiter
+\item \code{quoting_style} Quoting style: "Needed" (Only enclose values in 
quotes which need them, because their CSV
+rendering can contain quotes itself (e.g. strings or binary values)), 
"AllValid" (Enclose all valid values in
+quotes), or "None" (Do not enclose any values in quotes).
 }
 }
 
diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd
new file mode 100644
index 0000000000..4fd6eac1c3
--- /dev/null
+++ b/r/man/csv_convert_options.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_convert_options}
+\alias{csv_convert_options}
+\title{CSV Convert Options}
+\usage{
+csv_convert_options(
+  check_utf8 = TRUE,
+  null_values = c("", "NA"),
+  true_values = c("T", "true", "TRUE"),
+  false_values = c("F", "false", "FALSE"),
+  strings_can_be_null = FALSE,
+  col_types = NULL,
+  auto_dict_encode = FALSE,
+  auto_dict_max_cardinality = 50L,
+  include_columns = character(),
+  include_missing_columns = FALSE,
+  timestamp_parsers = NULL
+)
+}
+\arguments{
+\item{check_utf8}{Logical: check UTF8 validity of string columns?}
+
+\item{null_values}{Character vector of recognized spellings for null values.
+Analogous to the \code{na.strings} argument to
+\code{\link[utils:read.table]{read.csv()}} or \code{na} in 
\code{\link[readr:read_delim]{readr::read_csv()}}.}
+
+\item{true_values}{Character vector of recognized spellings for \code{TRUE} 
values}
+
+\item{false_values}{Character vector of recognized spellings for \code{FALSE} 
values}
+
+\item{strings_can_be_null}{Logical: can string / binary columns have
+null values? Similar to the \code{quoted_na} argument to 
\code{\link[readr:read_delim]{readr::read_csv()}}}
+
+\item{col_types}{A \code{Schema} or \code{NULL} to infer types}
+
+\item{auto_dict_encode}{Logical: Whether to try to automatically
+dictionary-encode string / binary data (think \code{stringsAsFactors}).
+This setting is ignored for non-inferred columns (those in \code{col_types}).}
+
+\item{auto_dict_max_cardinality}{If \code{auto_dict_encode}, string/binary 
columns
+are dictionary-encoded up to this number of unique values (default 50),
+after which it switches to regular encoding.}
+
+\item{include_columns}{If non-empty, indicates the names of columns from the
+CSV file that should be actually read and converted (in the vector's order).}
+
+\item{include_missing_columns}{Logical: if \code{include_columns} is provided, 
should
+columns named in it but not found in the data be included as a column of
+type \code{null()}? The default (\code{FALSE}) means that the reader will 
instead
+raise an error.}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are
+(a) \code{NULL}, the default, which uses the ISO-8601 parser;
+(b) a character vector of \link[base:strptime]{strptime} parse strings; or
+(c) a list of \link{TimestampParser} objects.}
+}
+\description{
+CSV Convert Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('x\n1\nNULL\n2\nNA', tf)
+read_csv_arrow(tf, convert_options = csv_convert_options(null_values =  c("", 
"NA", "NULL")))
+open_csv_dataset(tf, convert_options = csv_convert_options(null_values =  
c("", "NA", "NULL")))
+}
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
new file mode 100644
index 0000000000..e2e8fd19a5
--- /dev/null
+++ b/r/man/csv_parse_options.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_parse_options}
+\alias{csv_parse_options}
+\title{CSV Parsing Options}
+\usage{
+csv_parse_options(
+  delimiter = ",",
+  quoting = TRUE,
+  quote_char = "\\"",
+  double_quote = TRUE,
+  escaping = FALSE,
+  escape_char = "\\\\",
+  newlines_in_values = FALSE,
+  ignore_empty_lines = TRUE
+)
+}
+\arguments{
+\item{delimiter}{Field delimiting character}
+
+\item{quoting}{Logical: are strings quoted?}
+
+\item{quote_char}{Quoting character, if \code{quoting} is \code{TRUE}}
+
+\item{double_quote}{Logical: are quotes inside values double-quoted?}
+
+\item{escaping}{Logical: whether escaping is used}
+
+\item{escape_char}{Escaping character, if \code{escaping} is \code{TRUE}}
+
+\item{newlines_in_values}{Logical: are values allowed to contain CR (\code{0x0d})
+and LF (\code{0x0a}) characters?}
+
+\item{ignore_empty_lines}{Logical: should empty lines be ignored (default) or
+generate a row of missing values (if \code{FALSE})?}
+}
+\description{
+CSV Parsing Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('x\n1\n\n2', tf)
+read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines =  FALSE))
+open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines =  FALSE))
+}
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
new file mode 100644
index 0000000000..ed2436f316
--- /dev/null
+++ b/r/man/csv_read_options.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_read_options}
+\alias{csv_read_options}
+\title{CSV Reading Options}
+\usage{
+csv_read_options(
+  use_threads = option_use_threads(),
+  block_size = 1048576L,
+  skip_rows = 0L,
+  column_names = character(0),
+  autogenerate_column_names = FALSE,
+  encoding = "UTF-8",
+  skip_rows_after_names = 0L
+)
+}
+\arguments{
+\item{use_threads}{Whether to use the global CPU thread pool}
+
+\item{block_size}{Block size we request from the IO layer; also determines
+the size of chunks when use_threads is \code{TRUE}.}
+
+\item{skip_rows}{Number of lines to skip before reading data (default 0).}
+
+\item{column_names}{Character vector to supply column names. If length-0
+(the default), the first non-skipped row will be parsed to generate column
+names, unless \code{autogenerate_column_names} is \code{TRUE}.}
+
+\item{autogenerate_column_names}{Logical: generate column names instead of
+using the first non-skipped row (the default)? If \code{TRUE}, column names will
+be "f0", "f1", ..., "fN".}
+
+\item{encoding}{The file encoding. (default \code{"UTF-8"})}
+
+\item{skip_rows_after_names}{Number of lines to skip after the column names (default 0).
+This number can be larger than the number of rows in one block, and empty rows are counted.
+The order of application is as follows:
+- \code{skip_rows} is applied (if non-zero);
+- column names are read (unless \code{column_names} is set);
+- \code{skip_rows_after_names} is applied (if non-zero).}
+}
+\description{
+CSV Reading Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('my file has a non-data header\nx\n1\n2', tf)
+read_csv_arrow(tf, read_options = csv_read_options(skip_rows = 1))
+open_csv_dataset(tf, read_options = csv_read_options(skip_rows = 1))
+}
diff --git a/r/man/csv_write_options.Rd b/r/man/csv_write_options.Rd
new file mode 100644
index 0000000000..a42905dd18
--- /dev/null
+++ b/r/man/csv_write_options.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_write_options}
+\alias{csv_write_options}
+\title{CSV Writing Options}
+\usage{
+csv_write_options(
+  include_header = TRUE,
+  batch_size = 1024L,
+  null_string = "",
+  delimiter = ",",
+  eol = "\\n",
+  quoting_style = c("Needed", "AllValid", "None")
+)
+}
+\arguments{
+\item{include_header}{Whether to write an initial header line with column names}
+
+\item{batch_size}{Maximum number of rows processed at a time.}
+
+\item{null_string}{The string to be written for null values. Must not contain quotation marks.}
+
+\item{delimiter}{Field delimiter}
+
+\item{eol}{The end of line character to use for ending rows}
+
+\item{quoting_style}{How to handle quotes. "Needed" (Only enclose values in quotes which need them, because their CSV
+rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+quotes), or "None" (Do not enclose any values in quotes).}
+}
+\description{
+CSV Writing Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+write_csv_arrow(airquality, tf, write_options = csv_write_options(null_string = "-99"))
+}
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
index cf08302cc6..7b81f0033e 100644
--- a/r/man/open_delim_dataset.Rd
+++ b/r/man/open_delim_dataset.Rd
@@ -25,7 +25,8 @@ open_delim_dataset(
   convert_options = NULL,
   read_options = NULL,
   timestamp_parsers = NULL,
-  quoted_na = TRUE
+  quoted_na = TRUE,
+  parse_options = NULL
 )
 
 open_csv_dataset(
@@ -46,7 +47,8 @@ open_csv_dataset(
   convert_options = NULL,
   read_options = NULL,
   timestamp_parsers = NULL,
-  quoted_na = TRUE
+  quoted_na = TRUE,
+  parse_options = NULL
 )
 
 open_tsv_dataset(
@@ -67,7 +69,8 @@ open_tsv_dataset(
   convert_options = NULL,
   read_options = NULL,
   timestamp_parsers = NULL,
-  quoted_na = TRUE
+  quoted_na = TRUE,
+  parse_options = NULL
 )
 }
 \arguments{
@@ -169,9 +172,9 @@ filled with missings.}
 
 \item{skip}{Number of lines to skip before reading data.}
 
-\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{convert_options}{see \link[=csv_convert_options]{CSV conversion options}}
 
-\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{read_options}{see \link[=csv_read_options]{CSV reading options}}
 
 \item{timestamp_parsers}{User-defined timestamp parsers. If more than one
 parser is specified, the CSV conversion logic will try parsing values
@@ -186,6 +189,10 @@ starting from the beginning of this vector. Possible values are:
 values (the default) or strings. (Note that this is different from the
 the Arrow C++ default for the corresponding convert option,
 \code{strings_can_be_null}.)}
+
+\item{parse_options}{see \link[=csv_parse_options]{CSV parsing options}.
+If given, this overrides any
+parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}
 }
 \description{
 A wrapper around \link{open_dataset} which explicitly includes parameters mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}},
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index cd07c8ad07..09c20fa013 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -120,13 +120,13 @@ filled with missings.}
 
 \item{skip}{Number of lines to skip before reading data.}
 
-\item{parse_options}{see \link[=CsvReadOptions]{file reader options}.
+\item{parse_options}{see \link[=csv_parse_options]{CSV parsing options}.
 If given, this overrides any
 parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}
 
-\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{convert_options}{see \link[=csv_convert_options]{CSV conversion options}}
 
-\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{read_options}{see \link[=csv_read_options]{CSV reading options}}
 
 \item{as_data_frame}{Should the function return a \code{tibble} (default) or
 an Arrow \link{Table}?}
diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd
index 2b0d09ba74..9fcca49fad 100644
--- a/r/man/write_csv_arrow.Rd
+++ b/r/man/write_csv_arrow.Rd
@@ -34,7 +34,7 @@ system (\code{SubTreeFileSystem})}
 \item{na}{value to write for NA values. Must not contain quote marks. Default
 is \code{""}.}
 
-\item{write_options}{see \link[=CsvWriteOptions]{file reader options}}
+\item{write_options}{see \link[=csv_write_options]{CSV write options}}
 
 \item{...}{additional parameters}
 }
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 38f79c6397..22ccc7950f 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -336,7 +336,7 @@ test_that("CSV reader works on files with non-UTF-8 encoding", {
   fs <- LocalFileSystem$create()
   reader <- CsvTableReader$create(
     fs$OpenInputStream(tf),
-    read_options = CsvReadOptions$create(encoding = "UTF-16LE")
+    read_options = csv_read_options(encoding = "UTF-16LE")
   )
 
   table <- reader$Read()
@@ -435,7 +435,7 @@ test_that("Write a CSV with custom NA value", {
 
   # Also can use null_value in CsvWriteOptions
   tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file,
-    write_options = CsvWriteOptions$create(null_string = "another_null")
+    write_options = csv_write_options(null_string = "another_null")
   )
   csv_contents <- readLines(csv_file)
   expect_true(any(grepl("another_null", csv_contents)))
@@ -680,9 +680,9 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", {
 
   tab2 <- read_csv_arrow(
     tf,
-    convert_options = CsvConvertOptions$create(null_values = c(NA, "NA", "NULL"), strings_can_be_null = TRUE),
-    parse_options = CsvParseOptions$create(ignore_empty_lines = FALSE),
-    read_options = CsvReadOptions$create(skip_rows = 1L)
+    convert_options = csv_convert_options(null_values = c(NA, "NA", "NULL"), strings_can_be_null = TRUE),
+    parse_options = csv_parse_options(ignore_empty_lines = FALSE),
+    read_options = csv_read_options(skip_rows = 1L)
   )
 
   expect_equal(tab1, tab2)
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index e8e7c61fc8..bee701e5ef 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -206,7 +206,7 @@ test_that("Other text delimited dataset", {
 })
 
 test_that("readr parse options", {
-  arrow_opts <- names(formals(CsvParseOptions$create))
+  arrow_opts <- names(formals(csv_parse_options))
   readr_opts <- names(formals(readr_to_csv_parse_options))
 
   # Arrow and readr parse options must be mutually exclusive, or else the code
@@ -469,8 +469,8 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", {
   ds2 <- open_dataset(
     tf,
     format = "csv",
-    convert_options = CsvConvertOptions$create(null_values = c(NA, "NA", "NULL"), strings_can_be_null = TRUE),
-    read_options = CsvReadOptions$create(skip_rows = 1L)
+    convert_options = csv_convert_options(null_values = c(NA, "NA", "NULL"), strings_can_be_null = TRUE),
+    read_options = csv_read_options(skip_rows = 1L)
   ) %>%
     collect()
 
@@ -571,6 +571,16 @@ test_that("open_delim_dataset params passed through to open_dataset", {
   ds <- open_csv_dataset(dst_dir, quoted_na = FALSE) %>% collect()
   expect_equal(ds$text, c("one", "two", "", "four"))
 
+  # parse_options
+  dst_dir <- make_temp_dir()
+  dst_file <- file.path(dst_dir, "data.csv")
+  writeLines("x\n\n1\n\n\n2\n\n3", dst_file)
+  ds <- open_csv_dataset(
+    dst_dir,
+    parse_options = csv_parse_options(ignore_empty_lines = FALSE)
+  ) %>% collect()
+  expect_equal(ds$x, c(NA, 1L, NA, NA, 2L, NA, 3L))
+
   # timestamp_parsers
   skip("GH-33708: timestamp_parsers don't appear to be working properly")
 
@@ -586,15 +596,15 @@ test_that("open_delim_dataset params passed through to open_dataset", {
 })
 
 test_that("CSVReadOptions printing", {
-  default_read_options <- CsvReadOptions$create()
-  custom_read_options <- CsvReadOptions$create(skip_rows = 102)
+  default_read_options <- csv_read_options()
+  custom_read_options <- csv_read_options(skip_rows = 102)
 
   expect_output(print(default_read_options), "skip_rows: 0")
   expect_output(print(custom_read_options), "skip_rows: 102")
 })
 
 test_that("CSVReadOptions field access", {
-  options <- CsvReadOptions$create()
+  options <- csv_read_options()
   expect_equal(options$skip_rows, 0)
   expect_equal(options$autogenerate_column_names, FALSE)
   expect_equal(options$skip_rows_after_names, 0)

[arrow] branch main updated: GH-37994: [R] Create wrapper functions for the CSV*Options classes (#37995)

Reply via email to