This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 293819c48b GH-37994: [R] Create wrapper functions for the CSV*Options classes (#37995)
293819c48b is described below
commit 293819c48b06c840ce57da25836ef911c5786398
Author: Nic Crane <[email protected]>
AuthorDate: Mon Oct 9 10:56:53 2023 +0200
GH-37994: [R] Create wrapper functions for the CSV*Options classes (#37995)
### Rationale for this change
It's hard to find the docs for the R6 objects for the CSV reading/writing
etc options classes
### What changes are included in this PR?
Create wrapper functions, which are more easily documented
### Are these changes tested?
Yep, I've swapped some existing tests to using the wrappers
### Are there any user-facing changes?
Yes
* Closes: #37994
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/NAMESPACE | 4 +
r/R/csv.R | 264 ++++++++++++++++++++++++++----------
r/R/dataset-format.R | 46 +++----
r/R/dataset.R | 12 +-
r/_pkgdown.yml | 5 +
r/man/CsvReadOptions.Rd | 5 +
r/man/csv_convert_options.Rd | 69 ++++++++++
r/man/csv_parse_options.Rd | 46 +++++++
r/man/csv_read_options.Rd | 51 +++++++
r/man/csv_write_options.Rd | 38 ++++++
r/man/open_delim_dataset.Rd | 17 ++-
r/man/read_delim_arrow.Rd | 6 +-
r/man/write_csv_arrow.Rd | 2 +-
r/tests/testthat/test-csv.R | 10 +-
r/tests/testthat/test-dataset-csv.R | 22 ++-
15 files changed, 475 insertions(+), 122 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index d49255f781..b675952d01 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -308,6 +308,10 @@ export(contains)
export(copy_files)
export(cpu_count)
export(create_package_with_all_dependencies)
+export(csv_convert_options)
+export(csv_parse_options)
+export(csv_read_options)
+export(csv_write_options)
export(dataset_factory)
export(date32)
export(date64)
diff --git a/r/R/csv.R b/r/R/csv.R
index 116c620f83..e68a05720d 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -122,11 +122,11 @@
#' - `NULL`: the default, which uses the ISO-8601 parser
#' - a character vector of [strptime][base::strptime()] parse strings
#' - a list of [TimestampParser] objects
-#' @param parse_options see [file reader options][CsvReadOptions].
+#' @param parse_options see [CSV parsing options][csv_parse_options()].
#' If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
-#' @param convert_options see [file reader options][CsvReadOptions]
-#' @param read_options see [file reader options][CsvReadOptions]
+#' @param convert_options see [CSV conversion options][csv_convert_options()]
+#' @param read_options see [CSV reading options][csv_read_options()]
#' @param as_data_frame Should the function return a `tibble` (default) or
#' an Arrow [Table]?
#'
@@ -337,22 +337,22 @@ CsvTableReader <- R6Class("CsvTableReader",
)
)
CsvTableReader$create <- function(file,
- read_options = CsvReadOptions$create(),
- parse_options = CsvParseOptions$create(),
- convert_options = CsvConvertOptions$create(),
+ read_options = csv_read_options(),
+ parse_options = csv_parse_options(),
+ convert_options = csv_convert_options(),
...) {
assert_is(file, "InputStream")
if (is.list(read_options)) {
- read_options <- do.call(CsvReadOptions$create, read_options)
+ read_options <- do.call(csv_read_options, read_options)
}
if (is.list(parse_options)) {
- parse_options <- do.call(CsvParseOptions$create, parse_options)
+ parse_options <- do.call(csv_parse_options, parse_options)
}
if (is.list(convert_options)) {
- convert_options <- do.call(CsvConvertOptions$create, convert_options)
+ convert_options <- do.call(csv_convert_options, convert_options)
}
if (!(tolower(read_options$encoding) %in% c("utf-8", "utf8"))) {
@@ -362,6 +362,58 @@ CsvTableReader$create <- function(file,
csv___TableReader__Make(file, read_options, parse_options, convert_options)
}
+#' CSV Reading Options
+#'
+#' @param use_threads Whether to use the global CPU thread pool
+#' @param block_size Block size we request from the IO layer; also determines
+#' the size of chunks when use_threads is `TRUE`.
+#' @param skip_rows Number of lines to skip before reading data (default 0).
+#' @param column_names Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names, unless `autogenerate_column_names` is `TRUE`.
+#' @param autogenerate_column_names Logical: generate column names instead of
+#' using the first non-skipped row (the default)? If `TRUE`, column names will
+#' be "f0", "f1", ..., "fN".
+#' @param encoding The file encoding. (default `"UTF-8"`)
+#' @param skip_rows_after_names Number of lines to skip after the column names (default 0).
+#' This number can be larger than the number of rows in one block, and empty rows are counted.
+#' The order of application is as follows:
+#' - `skip_rows` is applied (if non-zero);
+#' - column names are read (unless `column_names` is set);
+#' - `skip_rows_after_names` is applied (if non-zero).
+#'
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines("my file has a non-data header\nx\n1\n2", tf)
+#' read_csv_arrow(tf, read_options = csv_read_options(skip_rows = 1))
+#' open_csv_dataset(tf, read_options = csv_read_options(skip_rows = 1))
+#' @export
+csv_read_options <- function(use_threads = option_use_threads(),
+ block_size = 1048576L,
+ skip_rows = 0L,
+ column_names = character(0),
+ autogenerate_column_names = FALSE,
+ encoding = "UTF-8",
+ skip_rows_after_names = 0L) {
+ assert_that(is.string(encoding))
+
+ options <- csv___ReadOptions__initialize(
+ list(
+ use_threads = use_threads,
+ block_size = block_size,
+ skip_rows = skip_rows,
+ skip_rows_after_names = skip_rows_after_names,
+ column_names = column_names,
+ autogenerate_column_names = autogenerate_column_names
+ )
+ )
+
+ options$encoding <- encoding
+
+ options
+}
+
#' @title File reader options
#' @rdname CsvReadOptions
#' @name CsvReadOptions
@@ -455,6 +507,11 @@ CsvTableReader$create <- function(file,
#' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
#' - `null_string` The string to be written for null values. Must not contain
#' quotation marks. Default is an empty string (`""`).
+#' - `eol` The end of line character to use for ending rows.
+#' - `delimiter` Field delimiter
+#' - `quoting_style` Quoting style: "Needed" (Only enclose values in quotes which need them, because their CSV
+#' rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+#' quotes), or "None" (Do not enclose any values in quotes).
#'
#' @section Active bindings:
#'
@@ -485,30 +542,8 @@ CsvReadOptions <- R6Class("CsvReadOptions",
skip_rows_after_names = function() csv___ReadOptions__skip_rows_after_names(self)
)
)
-CsvReadOptions$create <- function(use_threads = option_use_threads(),
- block_size = 1048576L,
- skip_rows = 0L,
- column_names = character(0),
- autogenerate_column_names = FALSE,
- encoding = "UTF-8",
- skip_rows_after_names = 0L) {
- assert_that(is.string(encoding))
- options <- csv___ReadOptions__initialize(
- list(
- use_threads = use_threads,
- block_size = block_size,
- skip_rows = skip_rows,
- skip_rows_after_names = skip_rows_after_names,
- column_names = column_names,
- autogenerate_column_names = autogenerate_column_names
- )
- )
-
- options$encoding <- encoding
-
- options
-}
+CsvReadOptions$create <- csv_read_options
readr_to_csv_write_options <- function(col_names = TRUE,
batch_size = 1024L,
@@ -520,7 +555,7 @@ readr_to_csv_write_options <- function(col_names = TRUE,
quote <- match(match.arg(quote), c("needed", "all", "none"))
quote <- quoting_style_arrow_opts[quote]
- CsvWriteOptions$create(
+ csv_write_options(
include_header = col_names,
batch_size = batch_size,
delimiter = delim,
@@ -530,15 +565,28 @@ readr_to_csv_write_options <- function(col_names = TRUE,
)
}
-#' @rdname CsvReadOptions
+#' CSV Writing Options
+#'
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time.
+#' @param null_string The string to be written for null values. Must not contain quotation marks.
+#' @param delimiter Field delimiter
+#' @param eol The end of line character to use for ending rows
+#' @param quoting_style How to handle quotes. "Needed" (Only enclose values in quotes which need them, because their CSV
+#' rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+#' quotes), or "None" (Do not enclose any values in quotes).
+#'
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(airquality, tf, write_options = csv_write_options(null_string = "-99"))
#' @export
-CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE,
- batch_size = 1024L,
- null_string = "",
- delimiter = ",",
- eol = "\n",
- quoting_style = c("Needed", "AllValid", "None")) {
+csv_write_options <- function(include_header = TRUE,
+ batch_size = 1024L,
+ null_string = "",
+ delimiter = ",",
+ eol = "\n",
+ quoting_style = c("Needed", "AllValid", "None")) {
quoting_style <- match.arg(quoting_style)
quoting_style_opts <- c("Needed", "AllValid", "None")
quoting_style <- match(quoting_style, quoting_style_opts) - 1L
@@ -564,32 +612,50 @@ CsvWriteOptions$create <- function(include_header = TRUE,
)
}
+#' @rdname CsvReadOptions
+#' @export
+CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
+CsvWriteOptions$create <- csv_write_options
+
readr_to_csv_read_options <- function(skip = 0, col_names = TRUE) {
if (isTRUE(col_names)) {
# C++ default to parse is 0-length string array
col_names <- character(0)
}
if (identical(col_names, FALSE)) {
- CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE)
+ csv_read_options(skip_rows = skip, autogenerate_column_names = TRUE)
} else {
- CsvReadOptions$create(skip_rows = skip, column_names = col_names)
+ csv_read_options(skip_rows = skip, column_names = col_names)
}
}
-#' @rdname CsvReadOptions
-#' @usage NULL
-#' @format NULL
-#' @docType class
+#' CSV Parsing Options
+#'
+#' @param delimiter Field delimiting character
+#' @param quoting Logical: are strings quoted?
+#' @param quote_char Quoting character, if `quoting` is `TRUE`
+#' @param double_quote Logical: are quotes inside values double-quoted?
+#' @param escaping Logical: whether escaping is used
+#' @param escape_char Escaping character, if `escaping` is `TRUE`
+#' @param newlines_in_values Logical: are values allowed to contain CR (`0x0d`)
+#' and LF (`0x0a`) characters?
+#' @param ignore_empty_lines Logical: should empty lines be ignored (default) or
+#' generate a row of missing values (if `FALSE`)?
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines("x\n1\n\n2", tf)
+#' read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+#' open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
#' @export
-CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
-CsvParseOptions$create <- function(delimiter = ",",
- quoting = TRUE,
- quote_char = '"',
- double_quote = TRUE,
- escaping = FALSE,
- escape_char = "\\",
- newlines_in_values = FALSE,
- ignore_empty_lines = TRUE) {
+csv_parse_options <- function(delimiter = ",",
+ quoting = TRUE,
+ quote_char = '"',
+ double_quote = TRUE,
+ escaping = FALSE,
+ escape_char = "\\",
+ newlines_in_values = FALSE,
+ ignore_empty_lines = TRUE) {
csv___ParseOptions__initialize(
list(
delimiter = delimiter,
@@ -604,6 +670,14 @@ CsvParseOptions$create <- function(delimiter = ",",
)
}
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
+CsvParseOptions$create <- csv_parse_options
+
readr_to_csv_parse_options <- function(delim = ",",
quote = '"',
escape_double = TRUE,
@@ -611,7 +685,7 @@ readr_to_csv_parse_options <- function(delim = ",",
skip_empty_rows = TRUE) {
# This function translates from the readr argument list to the arrow arg names
# TODO: validate inputs
- CsvParseOptions$create(
+ csv_parse_options(
delimiter = delim,
quoting = nzchar(quote),
quote_char = quote,
@@ -643,23 +717,55 @@ TimestampParser$create <- function(format = NULL) {
}
}
-#' @rdname CsvReadOptions
-#' @usage NULL
-#' @format NULL
-#' @docType class
+
+#' CSV Convert Options
+#'
+#' @param check_utf8 Logical: check UTF8 validity of string columns?
+#' @param null_values Character vector of recognized spellings for null values.
+#' Analogous to the `na.strings` argument to
+#' [`read.csv()`][utils::read.csv()] or `na` in [readr::read_csv()].
+#' @param strings_can_be_null Logical: can string / binary columns have
+#' null values? Similar to the `quoted_na` argument to [readr::read_csv()]
+#' @param true_values Character vector of recognized spellings for `TRUE` values
+#' @param false_values Character vector of recognized spellings for `FALSE` values
+#' @param col_types A `Schema` or `NULL` to infer types
+#' @param auto_dict_encode Logical: Whether to try to automatically
+#' dictionary-encode string / binary data (think `stringsAsFactors`).
+#' This setting is ignored for non-inferred columns (those in `col_types`).
+#' @param auto_dict_max_cardinality If `auto_dict_encode`, string/binary columns
+#' are dictionary-encoded up to this number of unique values (default 50),
+#' after which it switches to regular encoding.
+#' @param include_columns If non-empty, indicates the names of columns from the
+#' CSV file that should be actually read and converted (in the vector's order).
+#' @param include_missing_columns Logical: if `include_columns` is provided, should
+#' columns named in it but not found in the data be included as a column of
+#' type `null()`? The default (`FALSE`) means that the reader will instead
+#' raise an error.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are
+#' (a) `NULL`, the default, which uses the ISO-8601 parser;
+#' (b) a character vector of [strptime][base::strptime()] parse strings; or
+#' (c) a list of [TimestampParser] objects.
+#'
+#' @examples
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines("x\n1\nNULL\n2\nNA", tf)
+#' read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+#' open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
#' @export
-CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
-CsvConvertOptions$create <- function(check_utf8 = TRUE,
- null_values = c("", "NA"),
- true_values = c("T", "true", "TRUE"),
- false_values = c("F", "false", "FALSE"),
- strings_can_be_null = FALSE,
- col_types = NULL,
- auto_dict_encode = FALSE,
- auto_dict_max_cardinality = 50L,
- include_columns = character(),
- include_missing_columns = FALSE,
- timestamp_parsers = NULL) {
+csv_convert_options <- function(check_utf8 = TRUE,
+ null_values = c("", "NA"),
+ true_values = c("T", "true", "TRUE"),
+ false_values = c("F", "false", "FALSE"),
+ strings_can_be_null = FALSE,
+ col_types = NULL,
+ auto_dict_encode = FALSE,
+ auto_dict_max_cardinality = 50L,
+ include_columns = character(),
+ include_missing_columns = FALSE,
+ timestamp_parsers = NULL) {
if (!is.null(col_types) && !inherits(col_types, "Schema")) {
abort(c(
"Unsupported `col_types` specification.",
@@ -684,6 +790,14 @@ CsvConvertOptions$create <- function(check_utf8 = TRUE,
)
}
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
+CsvConvertOptions$create <- csv_convert_options
+
readr_to_csv_convert_options <- function(na,
quoted_na,
col_types = NULL,
@@ -732,7 +846,7 @@ readr_to_csv_convert_options <- function(na,
include_columns <- setdiff(col_names, names(col_types)[nulls])
}
}
- CsvConvertOptions$create(
+ csv_convert_options(
null_values = na,
strings_can_be_null = quoted_na,
col_types = col_types,
@@ -753,7 +867,7 @@ readr_to_csv_convert_options <- function(na,
#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
#' @param na value to write for NA values. Must not contain quote marks. Default
#' is `""`.
-#' @param write_options see [file reader options][CsvWriteOptions]
+#' @param write_options see [CSV write options][csv_write_options]
#' @param ... additional parameters
#'
#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index cdaaf08827..c25a505f89 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -217,19 +217,19 @@ check_csv_file_format_args <- function(args, partitioning = NULL) {
if (is.null(args$parse_options)) {
options$parse_options <- do.call(csv_file_format_parse_opts, args)
} else if (is.list(args$parse_options)) {
- options$parse_options <- do.call(CsvParseOptions$create, args$parse_options)
+ options$parse_options <- do.call(csv_parse_options, args$parse_options)
}
if (is.null(args$convert_options)) {
options$convert_options <- do.call(csv_file_format_convert_opts, args)
} else if (is.list(args$convert_options)) {
- options$convert_options <- do.call(CsvConvertOptions$create, args$convert_options)
+ options$convert_options <- do.call(csv_convert_options, args$convert_options)
}
if (is.null(args$read_options)) {
options$read_options <- do.call(csv_file_format_read_opts, c(args, list(partitioning = partitioning)))
} else if (is.list(args$read_options)) {
- options$read_options <- do.call(CsvReadOptions$create, args$read_options)
+ options$read_options <- do.call(csv_read_options, args$read_options)
}
options
@@ -239,16 +239,16 @@ check_unsupported_args <- function(args) {
opt_names <- get_opt_names(args)
# Filter out arguments meant for CsvConvertOptions/CsvReadOptions
- supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "quoted_na")
+ supported_convert_opts <- c(names(formals(csv_convert_options)), "na", "quoted_na")
supported_read_opts <- c(
- names(formals(CsvReadOptions$create)),
+ names(formals(csv_read_options)),
names(formals(readr_to_csv_read_options))
)
# We only currently support all of the readr options for parseoptions
supported_parse_opts <- c(
- names(formals(CsvParseOptions$create)),
+ names(formals(csv_parse_options)),
names(formals(readr_to_csv_parse_options))
)
@@ -303,9 +303,9 @@ check_unrecognised_args <- function(opts) {
opt_names <- get_opt_names(opts)
arrow_opts <- c(
- names(formals(CsvParseOptions$create)),
- names(formals(CsvReadOptions$create)),
- names(formals(CsvConvertOptions$create)),
+ names(formals(csv_parse_options)),
+ names(formals(csv_read_options)),
+ names(formals(csv_convert_options)),
"schema"
)
@@ -395,9 +395,9 @@ check_schema <- function(schema, partitioning, column_names) {
csv_file_format_parse_opts <- function(...) {
opts <- list(...)
# Filter out arguments meant for CsvConvertOptions/CsvReadOptions
- convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "quoted_na", "convert_options")
+ convert_opts <- c(names(formals(csv_convert_options)), "na", "quoted_na", "convert_options")
read_opts <- c(
- names(formals(CsvReadOptions$create)),
+ names(formals(csv_read_options)),
names(formals(readr_to_csv_read_options)),
"read_options"
)
@@ -407,7 +407,7 @@ csv_file_format_parse_opts <- function(...) {
opts[["parse_options"]] <- NULL
opt_names <- get_opt_names(opts)
- arrow_opts <- c(names(formals(CsvParseOptions$create)))
+ arrow_opts <- c(names(formals(csv_parse_options)))
readr_opts <- c(names(formals(readr_to_csv_parse_options)))
is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
@@ -427,17 +427,17 @@ csv_file_format_parse_opts <- function(...) {
}
do.call(readr_to_csv_parse_options, opts) # all options have readr-style names
} else {
- do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names
+ do.call(csv_parse_options, opts) # all options have Arrow C++ names
}
}
csv_file_format_convert_opts <- function(...) {
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvReadOptions
- arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
+ arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
readr_opts <- names(formals(readr_to_csv_parse_options))
read_opts <- c(
- names(formals(CsvReadOptions$create)),
+ names(formals(csv_read_options)),
names(formals(readr_to_csv_read_options)),
"read_options"
)
@@ -458,23 +458,23 @@ csv_file_format_convert_opts <- function(...) {
opts[["quoted_na"]] <- NULL
}
- do.call(CsvConvertOptions$create, opts)
+ do.call(csv_convert_options, opts)
}
csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...) {
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
- arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
+ arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
readr_opts <- names(formals(readr_to_csv_parse_options))
- convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "quoted_na", "convert_options")
+ convert_opts <- c(names(formals(csv_convert_options)), "na", "quoted_na", "convert_options")
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[convert_opts] <- NULL
opts[["read_options"]] <- NULL
opt_names <- names(opts)
- arrow_opts <- c(names(formals(CsvReadOptions$create)))
+ arrow_opts <- c(names(formals(csv_read_options)))
readr_opts <- c(names(formals(readr_to_csv_read_options)))
is_arrow_opt <- !is.na(match(opt_names, arrow_opts))
@@ -505,7 +505,7 @@ csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...) {
}
do.call(readr_to_csv_read_options, opts) # all options have readr-style names
} else {
- do.call(CsvReadOptions$create, opts) # all options have Arrow C++ names
+ do.call(csv_read_options, opts) # all options have Arrow C++ names
}
}
@@ -648,7 +648,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
)
} else if (format %in% c("csv", "tsv", "txt", "text")) {
supported_args <- c(
- names(formals(CsvWriteOptions$create)),
+ names(formals(csv_write_options)),
names(formals(readr_to_csv_write_options))
)
}
@@ -703,7 +703,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
)
}
} else if (self$type %in% c("csv", "tsv", "txt", "text")) {
- arrow_opts <- names(formals(CsvWriteOptions$create))
+ arrow_opts <- names(formals(csv_write_options))
readr_opts <- names(formals(readr_to_csv_write_options))
readr_only_opts <- setdiff(readr_opts, arrow_opts)
arrow_only_opts <- setdiff(arrow_opts, readr_opts)
@@ -727,7 +727,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
} else {
dataset___CsvFileWriteOptions__update(
self,
- do.call(CsvWriteOptions$create, args[is_arrow_opt])
+ do.call(csv_write_options, args[is_arrow_opt])
)
}
}
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 9d91839c22..90e6516927 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -276,7 +276,8 @@ open_delim_dataset <- function(sources,
convert_options = NULL,
read_options = NULL,
timestamp_parsers = NULL,
- quoted_na = TRUE) {
+ quoted_na = TRUE,
+ parse_options = NULL) {
open_dataset(
sources = sources,
schema = schema,
@@ -297,7 +298,8 @@ open_delim_dataset <- function(sources,
convert_options = convert_options,
read_options = read_options,
timestamp_parsers = timestamp_parsers,
- quoted_na = quoted_na
+ quoted_na = quoted_na,
+ parse_options = parse_options
)
}
@@ -320,7 +322,8 @@ open_csv_dataset <- function(sources,
convert_options = NULL,
read_options = NULL,
timestamp_parsers = NULL,
- quoted_na = TRUE) {
+ quoted_na = TRUE,
+ parse_options = NULL) {
mc <- match.call()
mc$delim <- ","
mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
@@ -346,7 +349,8 @@ open_tsv_dataset <- function(sources,
convert_options = NULL,
read_options = NULL,
timestamp_parsers = NULL,
- quoted_na = TRUE) {
+ quoted_na = TRUE,
+ parse_options = NULL) {
mc <- match.call()
mc$delim <- "\t"
mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 57c21e3c0f..84111e599c 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -146,6 +146,10 @@ reference:
- open_delim_dataset
- open_csv_dataset
- open_tsv_dataset
+ - csv_read_options
+ - csv_parse_options
+ - csv_convert_options
+
- title: Write datasets
desc: >
@@ -155,6 +159,7 @@ reference:
- write_delim_dataset
- write_csv_dataset
- write_tsv_dataset
+ - csv_write_options
- title: Read files
desc: >
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 6ebb235518..32742280cc 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -106,6 +106,11 @@ The \code{CsvWriteOptions$create()} factory method takes the following arguments
\item \code{batch_size} Maximum number of rows processed at a time. Default is 1024.
\item \code{null_string} The string to be written for null values. Must not contain
quotation marks. Default is an empty string (\code{""}).
+\item \code{eol} The end of line character to use for ending rows.
+\item \code{delimiter} Field delimiter
+\item \code{quoting_style} Quoting style: "Needed" (Only enclose values in quotes which need them, because their CSV
+rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+quotes), or "None" (Do not enclose any values in quotes).
}
}
diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd
new file mode 100644
index 0000000000..4fd6eac1c3
--- /dev/null
+++ b/r/man/csv_convert_options.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_convert_options}
+\alias{csv_convert_options}
+\title{CSV Convert Options}
+\usage{
+csv_convert_options(
+ check_utf8 = TRUE,
+ null_values = c("", "NA"),
+ true_values = c("T", "true", "TRUE"),
+ false_values = c("F", "false", "FALSE"),
+ strings_can_be_null = FALSE,
+ col_types = NULL,
+ auto_dict_encode = FALSE,
+ auto_dict_max_cardinality = 50L,
+ include_columns = character(),
+ include_missing_columns = FALSE,
+ timestamp_parsers = NULL
+)
+}
+\arguments{
+\item{check_utf8}{Logical: check UTF8 validity of string columns?}
+
+\item{null_values}{Character vector of recognized spellings for null values.
+Analogous to the \code{na.strings} argument to
+\code{\link[utils:read.table]{read.csv()}} or \code{na} in \code{\link[readr:read_delim]{readr::read_csv()}}.}
+
+\item{true_values}{Character vector of recognized spellings for \code{TRUE} values}
+
+\item{false_values}{Character vector of recognized spellings for \code{FALSE} values}
+
+\item{strings_can_be_null}{Logical: can string / binary columns have
+null values? Similar to the \code{quoted_na} argument to \code{\link[readr:read_delim]{readr::read_csv()}}}
+
+\item{col_types}{A \code{Schema} or \code{NULL} to infer types}
+
+\item{auto_dict_encode}{Logical: Whether to try to automatically
+dictionary-encode string / binary data (think \code{stringsAsFactors}).
+This setting is ignored for non-inferred columns (those in \code{col_types}).}
+
+\item{auto_dict_max_cardinality}{If \code{auto_dict_encode}, string/binary columns
+are dictionary-encoded up to this number of unique values (default 50),
+after which it switches to regular encoding.}
+
+\item{include_columns}{If non-empty, indicates the names of columns from the
+CSV file that should be actually read and converted (in the vector's order).}
+
+\item{include_missing_columns}{Logical: if \code{include_columns} is provided, should
+columns named in it but not found in the data be included as a column of
+type \code{null()}? The default (\code{FALSE}) means that the reader will instead
+raise an error.}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are
+(a) \code{NULL}, the default, which uses the ISO-8601 parser;
+(b) a character vector of \link[base:strptime]{strptime} parse strings; or
+(c) a list of \link{TimestampParser} objects.}
+}
+\description{
+CSV Convert Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('x\n1\nNULL\n2\nNA', tf)
+read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+}
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
new file mode 100644
index 0000000000..e2e8fd19a5
--- /dev/null
+++ b/r/man/csv_parse_options.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_parse_options}
+\alias{csv_parse_options}
+\title{CSV Parsing Options}
+\usage{
+csv_parse_options(
+ delimiter = ",",
+ quoting = TRUE,
+ quote_char = "\\"",
+ double_quote = TRUE,
+ escaping = FALSE,
+ escape_char = "\\\\",
+ newlines_in_values = FALSE,
+ ignore_empty_lines = TRUE
+)
+}
+\arguments{
+\item{delimiter}{Field delimiting character}
+
+\item{quoting}{Logical: are strings quoted?}
+
+\item{quote_char}{Quoting character, if \code{quoting} is \code{TRUE}}
+
+\item{double_quote}{Logical: are quotes inside values double-quoted?}
+
+\item{escaping}{Logical: whether escaping is used}
+
+\item{escape_char}{Escaping character, if \code{escaping} is \code{TRUE}}
+
+\item{newlines_in_values}{Logical: are values allowed to contain CR (\code{0x0d})
+and LF (\code{0x0a}) characters?}
+
+\item{ignore_empty_lines}{Logical: should empty lines be ignored (default) or
+generate a row of missing values (if \code{FALSE})?}
+}
+\description{
+CSV Parsing Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('x\n1\n\n2', tf)
+read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+}
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
new file mode 100644
index 0000000000..ed2436f316
--- /dev/null
+++ b/r/man/csv_read_options.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_read_options}
+\alias{csv_read_options}
+\title{CSV Reading Options}
+\usage{
+csv_read_options(
+ use_threads = option_use_threads(),
+ block_size = 1048576L,
+ skip_rows = 0L,
+ column_names = character(0),
+ autogenerate_column_names = FALSE,
+ encoding = "UTF-8",
+ skip_rows_after_names = 0L
+)
+}
+\arguments{
+\item{use_threads}{Whether to use the global CPU thread pool}
+
+\item{block_size}{Block size we request from the IO layer; also determines
+the size of chunks when use_threads is \code{TRUE}.}
+
+\item{skip_rows}{Number of lines to skip before reading data (default 0).}
+
+\item{column_names}{Character vector to supply column names. If length-0
+(the default), the first non-skipped row will be parsed to generate column
+names, unless \code{autogenerate_column_names} is \code{TRUE}.}
+
+\item{autogenerate_column_names}{Logical: generate column names instead of
+using the first non-skipped row (the default)? If \code{TRUE}, column names will
+be "f0", "f1", ..., "fN".}
+
+\item{encoding}{The file encoding. (default \code{"UTF-8"})}
+
+\item{skip_rows_after_names}{Number of lines to skip after the column names (default 0).
+This number can be larger than the number of rows in one block, and empty rows are counted.
+The order of application is as follows:
+- \code{skip_rows} is applied (if non-zero);
+- column names are read (unless \code{column_names} is set);
+- \code{skip_rows_after_names} is applied (if non-zero).}
+}
+\description{
+CSV Reading Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('my file has a non-data header\nx\n1\n2', tf)
+read_csv_arrow(tf, read_options = csv_read_options(skip_rows = 1))
+open_csv_dataset(tf, read_options = csv_read_options(skip_rows = 1))
+}
diff --git a/r/man/csv_write_options.Rd b/r/man/csv_write_options.Rd
new file mode 100644
index 0000000000..a42905dd18
--- /dev/null
+++ b/r/man/csv_write_options.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{csv_write_options}
+\alias{csv_write_options}
+\title{CSV Writing Options}
+\usage{
+csv_write_options(
+ include_header = TRUE,
+ batch_size = 1024L,
+ null_string = "",
+ delimiter = ",",
+ eol = "\\n",
+ quoting_style = c("Needed", "AllValid", "None")
+)
+}
+\arguments{
+\item{include_header}{Whether to write an initial header line with column names}
+
+\item{batch_size}{Maximum number of rows processed at a time.}
+
+\item{null_string}{The string to be written for null values. Must not contain quotation marks.}
+
+\item{delimiter}{Field delimiter}
+
+\item{eol}{The end of line character to use for ending rows}
+
+\item{quoting_style}{How to handle quotes. "Needed" (Only enclose values in quotes which need them, because their CSV
+rendering can contain quotes itself (e.g. strings or binary values)), "AllValid" (Enclose all valid values in
+quotes), or "None" (Do not enclose any values in quotes).}
+}
+\description{
+CSV Writing Options
+}
+\examples{
+tf <- tempfile()
+on.exit(unlink(tf))
+write_csv_arrow(airquality, tf, write_options = csv_write_options(null_string
= "-99"))
+}
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
index cf08302cc6..7b81f0033e 100644
--- a/r/man/open_delim_dataset.Rd
+++ b/r/man/open_delim_dataset.Rd
@@ -25,7 +25,8 @@ open_delim_dataset(
convert_options = NULL,
read_options = NULL,
timestamp_parsers = NULL,
- quoted_na = TRUE
+ quoted_na = TRUE,
+ parse_options = NULL
)
open_csv_dataset(
@@ -46,7 +47,8 @@ open_csv_dataset(
convert_options = NULL,
read_options = NULL,
timestamp_parsers = NULL,
- quoted_na = TRUE
+ quoted_na = TRUE,
+ parse_options = NULL
)
open_tsv_dataset(
@@ -67,7 +69,8 @@ open_tsv_dataset(
convert_options = NULL,
read_options = NULL,
timestamp_parsers = NULL,
- quoted_na = TRUE
+ quoted_na = TRUE,
+ parse_options = NULL
)
}
\arguments{
@@ -169,9 +172,9 @@ filled with missings.}
\item{skip}{Number of lines to skip before reading data.}
-\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{convert_options}{see \link[=csv_convert_options]{CSV conversion options}}
-\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{read_options}{see \link[=csv_read_options]{CSV reading options}}
\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
parser is specified, the CSV conversion logic will try parsing values
@@ -186,6 +189,10 @@ starting from the beginning of this vector. Possible values are:
values (the default) or strings. (Note that this is different from
the Arrow C++ default for the corresponding convert option,
\code{strings_can_be_null}.)}
+
+\item{parse_options}{see \link[=csv_parse_options]{CSV parsing options}.
+If given, this overrides any
+parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}
}
\description{
A wrapper around \link{open_dataset} which explicitly includes parameters
mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}},
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index cd07c8ad07..09c20fa013 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -120,13 +120,13 @@ filled with missings.}
\item{skip}{Number of lines to skip before reading data.}
-\item{parse_options}{see \link[=CsvReadOptions]{file reader options}.
+\item{parse_options}{see \link[=csv_parse_options]{CSV parsing options}.
If given, this overrides any
parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}
-\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{convert_options}{see \link[=csv_convert_options]{CSV conversion options}}
-\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+\item{read_options}{see \link[=csv_read_options]{CSV reading options}}
\item{as_data_frame}{Should the function return a \code{tibble} (default) or
an Arrow \link{Table}?}
diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd
index 2b0d09ba74..9fcca49fad 100644
--- a/r/man/write_csv_arrow.Rd
+++ b/r/man/write_csv_arrow.Rd
@@ -34,7 +34,7 @@ system (\code{SubTreeFileSystem})}
\item{na}{value to write for NA values. Must not contain quote marks. Default
is \code{""}.}
-\item{write_options}{see \link[=CsvWriteOptions]{file reader options}}
+\item{write_options}{see \link[=csv_write_options]{CSV writing options}}
\item{...}{additional parameters}
}
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 38f79c6397..22ccc7950f 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -336,7 +336,7 @@ test_that("CSV reader works on files with non-UTF-8 encoding", {
fs <- LocalFileSystem$create()
reader <- CsvTableReader$create(
fs$OpenInputStream(tf),
- read_options = CsvReadOptions$create(encoding = "UTF-16LE")
+ read_options = csv_read_options(encoding = "UTF-16LE")
)
table <- reader$Read()
@@ -435,7 +435,7 @@ test_that("Write a CSV with custom NA value", {
# Also can use null_string in CsvWriteOptions
tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file,
- write_options = CsvWriteOptions$create(null_string = "another_null")
+ write_options = csv_write_options(null_string = "another_null")
)
csv_contents <- readLines(csv_file)
expect_true(any(grepl("another_null", csv_contents)))
@@ -680,9 +680,9 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", {
tab2 <- read_csv_arrow(
tf,
- convert_options = CsvConvertOptions$create(null_values = c(NA, "NA",
"NULL"), strings_can_be_null = TRUE),
- parse_options = CsvParseOptions$create(ignore_empty_lines = FALSE),
- read_options = CsvReadOptions$create(skip_rows = 1L)
+ convert_options = csv_convert_options(null_values = c(NA, "NA", "NULL"),
strings_can_be_null = TRUE),
+ parse_options = csv_parse_options(ignore_empty_lines = FALSE),
+ read_options = csv_read_options(skip_rows = 1L)
)
expect_equal(tab1, tab2)
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index e8e7c61fc8..bee701e5ef 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -206,7 +206,7 @@ test_that("Other text delimited dataset", {
})
test_that("readr parse options", {
- arrow_opts <- names(formals(CsvParseOptions$create))
+ arrow_opts <- names(formals(csv_parse_options))
readr_opts <- names(formals(readr_to_csv_parse_options))
# Arrow and readr parse options must be mutually exclusive, or else the code
@@ -469,8 +469,8 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", {
ds2 <- open_dataset(
tf,
format = "csv",
- convert_options = CsvConvertOptions$create(null_values = c(NA, "NA",
"NULL"), strings_can_be_null = TRUE),
- read_options = CsvReadOptions$create(skip_rows = 1L)
+ convert_options = csv_convert_options(null_values = c(NA, "NA", "NULL"),
strings_can_be_null = TRUE),
+ read_options = csv_read_options(skip_rows = 1L)
) %>%
collect()
@@ -571,6 +571,16 @@ test_that("open_delim_dataset params passed through to open_dataset", {
ds <- open_csv_dataset(dst_dir, quoted_na = FALSE) %>% collect()
expect_equal(ds$text, c("one", "two", "", "four"))
+ # parse_options
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv")
+ writeLines("x\n\n1\n\n\n2\n\n3", dst_file)
+ ds <- open_csv_dataset(
+ dst_dir,
+ parse_options = csv_parse_options(ignore_empty_lines = FALSE)
+ ) %>% collect()
+ expect_equal(ds$x, c(NA, 1L, NA, NA, 2L, NA, 3L))
+
# timestamp_parsers
skip("GH-33708: timestamp_parsers don't appear to be working properly")
@@ -586,15 +596,15 @@ test_that("open_delim_dataset params passed through to open_dataset", {
})
test_that("CSVReadOptions printing", {
- default_read_options <- CsvReadOptions$create()
- custom_read_options <- CsvReadOptions$create(skip_rows = 102)
+ default_read_options <- csv_read_options()
+ custom_read_options <- csv_read_options(skip_rows = 102)
expect_output(print(default_read_options), "skip_rows: 0")
expect_output(print(custom_read_options), "skip_rows: 102")
})
test_that("CSVReadOptions field access", {
- options <- CsvReadOptions$create()
+ options <- csv_read_options()
expect_equal(options$skip_rows, 0)
expect_equal(options$autogenerate_column_names, FALSE)
expect_equal(options$skip_rows_after_names, 0)