This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 18e255ed75 GH-34339: [R] Add `skip_rows_after_names` option to
`read_csv_arrow`'s options (#34340)
18e255ed75 is described below
commit 18e255ed758f3e953196ed2cdbfc7424b9861afd
Author: eitsupi <[email protected]>
AuthorDate: Wed Mar 1 00:05:47 2023 +0900
GH-34339: [R] Add `skip_rows_after_names` option to `read_csv_arrow`'s
options (#34340)
### Rationale for this change
The `skip_rows_after_names` option implemented in C++ was not exposed to R.
### What changes are included in this PR?
The new `skip_rows_after_names` option is now available for the
`read_csv_arrow` function.
``` r
> csv <- "a,b\n1,2\n3,4"
> arrow::read_csv_arrow(I(csv), read_options = list(skip_rows_after_names = 1))
# A tibble: 1 × 2
      a     b
  <int> <int>
1     3     4
```
### Are these changes tested?
Tests are added for the new option.
### Are there any user-facing changes?
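Yes. `CsvReadOptions$create()` gains a `skip_rows_after_names` argument
(default `0L`), which can also be supplied through the `read_options` list of
`read_csv_arrow()`. Existing calls are unaffected.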
* Closes: #34339
Lead-authored-by: SHIMA Tatsuya <[email protected]>
Co-authored-by: eitsupi <[email protected]>
Co-authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/csv.R | 12 ++++++++++--
r/man/CsvReadOptions.Rd | 10 +++++++++-
r/src/csv.cpp | 1 +
r/tests/testthat/test-csv.R | 26 ++++++++++++++++++++++++++
4 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index 03e3e0e8d4..c3ebd6e41f 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -385,7 +385,7 @@ CsvTableReader$create <- function(file,
#'
#' `CsvReadOptions$create()` further accepts these additional arguments:
#'
-#' - `skip_rows` Number of lines to skip before reading data (default 0)
+#' - `skip_rows` Number of lines to skip before reading data (default 0).
#' - `column_names` Character vector to supply column names. If length-0
#' (the default), the first non-skipped row will be parsed to generate column
#' names, unless `autogenerate_column_names` is `TRUE`.
@@ -393,6 +393,12 @@ CsvTableReader$create <- function(file,
#' using the first non-skipped row (the default)? If `TRUE`, column names will
#' be "f0", "f1", ..., "fN".
#' - `encoding` The file encoding. (default `"UTF-8"`)
+#' - `skip_rows_after_names` Number of lines to skip after the column names (default 0).
+#' This number can be larger than the number of rows in one block, and empty rows are counted.
+#' The order of application is as follows:
+#' - `skip_rows` is applied (if non-zero);
+#' - column names are read (unless `column_names` is set);
+#' - `skip_rows_after_names` is applied (if non-zero).
#'
#' `CsvParseOptions$create()` takes the following arguments:
#'
@@ -469,7 +475,8 @@ CsvReadOptions$create <- function(use_threads =
option_use_threads(),
skip_rows = 0L,
column_names = character(0),
autogenerate_column_names = FALSE,
- encoding = "UTF-8") {
+ encoding = "UTF-8",
+ skip_rows_after_names = 0L) {
assert_that(is.string(encoding))
options <- csv___ReadOptions__initialize(
@@ -477,6 +484,7 @@ CsvReadOptions$create <- function(use_threads =
option_use_threads(),
use_threads = use_threads,
block_size = block_size,
skip_rows = skip_rows,
+ skip_rows_after_names = skip_rows_after_names,
column_names = column_names,
autogenerate_column_names = autogenerate_column_names
)
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 270d522b83..a18ff959ce 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -30,7 +30,7 @@ must end with an empty line.
\code{CsvReadOptions$create()} further accepts these additional arguments:
\itemize{
-\item \code{skip_rows} Number of lines to skip before reading data (default 0)
+\item \code{skip_rows} Number of lines to skip before reading data (default 0).
\item \code{column_names} Character vector to supply column names. If length-0
(the default), the first non-skipped row will be parsed to generate column
names, unless \code{autogenerate_column_names} is \code{TRUE}.
@@ -38,6 +38,14 @@ names, unless \code{autogenerate_column_names} is
\code{TRUE}.
using the first non-skipped row (the default)? If \code{TRUE}, column names
will
be "f0", "f1", ..., "fN".
\item \code{encoding} The file encoding. (default \code{"UTF-8"})
+\item \code{skip_rows_after_names} Number of lines to skip after the column names (default 0).
+This number can be larger than the number of rows in one block, and empty rows are counted.
+The order of application is as follows:
+\itemize{
+\item \code{skip_rows} is applied (if non-zero);
+\item column names are read (unless \code{column_names} is set);
+\item \code{skip_rows_after_names} is applied (if non-zero).
+}
}
\code{CsvParseOptions$create()} takes the following arguments:
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index 3f880cae16..2eb22d7d81 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -44,6 +44,7 @@ std::shared_ptr<arrow::csv::ReadOptions>
csv___ReadOptions__initialize(
res->use_threads = cpp11::as_cpp<bool>(options["use_threads"]);
res->block_size = cpp11::as_cpp<int>(options["block_size"]);
res->skip_rows = cpp11::as_cpp<int>(options["skip_rows"]);
+ res->skip_rows_after_names =
cpp11::as_cpp<int>(options["skip_rows_after_names"]);
res->column_names =
cpp11::as_cpp<std::vector<std::string>>(options["column_names"]);
res->autogenerate_column_names =
cpp11::as_cpp<bool>(options["autogenerate_column_names"]);
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index b1a3f843b9..e806a66801 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -698,3 +698,29 @@ test_that("Read literal data directly", {
expect_identical(read_csv_arrow(I(charToRaw("x,y\n1,2\n3,4"))), expected)
expect_identical(read_csv_arrow(I(c("x,y", "1,2", "3,4"))), expected)
})
+
+test_that("skip_rows and skip_rows_after_names option", {
+  txt_raw <- charToRaw(paste0(c("a", 1:4), collapse = "\n")) # header + 4 rows
+
+  expect_identical(
+    read_csv_arrow(
+      txt_raw,
+      read_options = list(skip_rows_after_names = 1)
+    ),
+    tibble::tibble(a = 2:4) # first data row is dropped
+  )
+  expect_identical(
+    read_csv_arrow(
+      txt_raw,
+      read_options = list(skip_rows_after_names = 10)
+    ),
+    tibble::tibble(a = vctrs::unspecified()) # skipping past the end: no rows
+  )
+  expect_identical(
+    read_csv_arrow(
+      txt_raw,
+      read_options = list(skip_rows = 1, skip_rows_after_names = 1)
+    ),
+    tibble::tibble(`1` = 3:4) # "a" skipped, "1" is the header, "2" dropped
+  )
+})