This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f35dd92 ARROW-5747: [C++] Improve CSV header and column names options
f35dd92 is described below
commit f35dd92ecb5178a798d747d8728ee6343385a565
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jul 24 14:26:36 2019 +0200
ARROW-5747: [C++] Improve CSV header and column names options
The `header_rows` option is removed (it seemed untested and probably didn't
work well)
and replaced with a `skip_rows` option that dictates how many rows are
skipped at
the start of the CSV file (default 0).
Furthermore, a `column_names` option is added to hardcode the column names.
If empty (default), they are read from the first non-skipped row in the CSV
file.
Closes #4898 from pitrou/ARROW-5747-csv-column-names and squashes the
following commits:
f42840f00 <Antoine Pitrou> Fix downcast
d1381b3bd <Antoine Pitrou> Implement skip_rows + column_names in R
36f2743c5 <Antoine Pitrou> Fix expected semantics for skip_rows
410805254 <Antoine Pitrou> ARROW-5747: Improve header and column names
options
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
c_glib/arrow-glib/reader.cpp | 27 ---------
cpp/src/arrow/csv/options.h | 9 +--
cpp/src/arrow/csv/parser-test.cc | 92 ++++++++++++++++++++++++++++++
cpp/src/arrow/csv/parser.cc | 26 +++++++++
cpp/src/arrow/csv/parser.h | 22 ++++++++
cpp/src/arrow/csv/reader.cc | 66 ++++++++++++++--------
python/pyarrow/_csv.pyx | 56 ++++++++++++------
python/pyarrow/includes/libarrow.pxd | 3 +-
python/pyarrow/tests/test_csv.py | 106 ++++++++++++++++++++++++++++++++---
r/R/csv.R | 87 ++++++++++++++--------------
r/README.md | 8 ---
r/man/csv_parse_options.Rd | 4 +-
r/man/csv_read_options.Rd | 12 +++-
r/man/read_delim_arrow.Rd | 27 +++++----
r/src/csv.cpp | 3 +-
r/tests/testthat/test-arrow-csv.R | 22 ++++++--
16 files changed, 417 insertions(+), 153 deletions(-)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 7783362..fdc8ffe 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -902,7 +902,6 @@ enum {
PROP_ESCAPE_CHARACTER,
PROP_ALLOW_NEWLINES_IN_VALUES,
PROP_IGNORE_EMPTY_LINES,
- PROP_N_HEADER_ROWS,
PROP_CHECK_UTF8,
PROP_ALLOW_NULL_STRINGS
};
@@ -955,9 +954,6 @@ garrow_csv_read_options_set_property(GObject *object,
case PROP_IGNORE_EMPTY_LINES:
priv->parse_options.ignore_empty_lines = g_value_get_boolean(value);
break;
- case PROP_N_HEADER_ROWS:
- priv->parse_options.header_rows = g_value_get_uint(value);
- break;
case PROP_CHECK_UTF8:
priv->convert_options.check_utf8 = g_value_get_boolean(value);
break;
@@ -1009,9 +1005,6 @@ garrow_csv_read_options_get_property(GObject *object,
case PROP_IGNORE_EMPTY_LINES:
g_value_set_boolean(value, priv->parse_options.ignore_empty_lines);
break;
- case PROP_N_HEADER_ROWS:
- g_value_set_uint(value, priv->parse_options.header_rows);
- break;
case PROP_CHECK_UTF8:
g_value_set_boolean(value, priv->convert_options.check_utf8);
break;
@@ -1210,26 +1203,6 @@
garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
PROP_IGNORE_EMPTY_LINES,
spec);
- /**
- * GArrowCSVReadOptions:n-header-rows:
- *
- * The number of header rows to skip (including the first row
- * containing column names)
- *
- * Since: 0.12.0
- */
- spec = g_param_spec_uint("n-header-rows",
- "N header rows",
- "The number of header rows to skip "
- "(including the first row containing column names",
- 0,
- G_MAXUINT,
- parse_options.header_rows,
- static_cast<GParamFlags>(G_PARAM_READWRITE));
- g_object_class_install_property(gobject_class,
- PROP_N_HEADER_ROWS,
- spec);
-
auto convert_options = arrow::csv::ConvertOptions::Defaults();
/**
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 9cd312a..21d0ab2 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -53,10 +53,6 @@ struct ARROW_EXPORT ParseOptions {
// a single empty value (assuming a one-column CSV file).
bool ignore_empty_lines = true;
- // XXX Should this be in ReadOptions?
- // Number of header rows to skip (including the first row containing column
names)
- int32_t header_rows = 1;
-
static ParseOptions Defaults();
};
@@ -89,6 +85,11 @@ struct ARROW_EXPORT ReadOptions {
// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB
+ // Number of header rows to skip (not including the row of column names, if
any)
+ int32_t skip_rows = 0;
+ // Column names (if empty, will be read from first row after `skip_rows`)
+ std::vector<std::string> column_names;
+
static ReadOptions Defaults();
};
diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc
index d1790b2..f379621 100644
--- a/cpp/src/arrow/csv/parser-test.cc
+++ b/cpp/src/arrow/csv/parser-test.cc
@@ -31,6 +31,57 @@
namespace arrow {
namespace csv {
+void CheckSkipRows(const std::string& rows, int32_t num_rows,
+ int32_t expected_skipped_rows, int32_t
expected_skipped_bytes) {
+ const uint8_t* start = reinterpret_cast<const uint8_t*>(rows.data());
+ const uint8_t* data;
+ int32_t skipped_rows =
+ SkipRows(start, static_cast<int32_t>(rows.size()), num_rows, &data);
+ ASSERT_EQ(skipped_rows, expected_skipped_rows);
+ ASSERT_EQ(data - start, expected_skipped_bytes);
+}
+
+TEST(SkipRows, Basics) {
+ CheckSkipRows("", 0, 0, 0);
+ CheckSkipRows("", 15, 0, 0);
+
+ CheckSkipRows("a\nb\nc\nd", 1, 1, 2);
+ CheckSkipRows("a\nb\nc\nd", 2, 2, 4);
+ CheckSkipRows("a\nb\nc\nd", 3, 3, 6);
+ CheckSkipRows("a\nb\nc\nd", 4, 3, 6);
+
+ CheckSkipRows("a\nb\nc\nd\n", 3, 3, 6);
+ CheckSkipRows("a\nb\nc\nd\n", 4, 4, 8);
+ CheckSkipRows("a\nb\nc\nd\n", 5, 4, 8);
+
+ CheckSkipRows("\t\n\t\n\t\n\t", 1, 1, 2);
+ CheckSkipRows("\t\n\t\n\t\n\t", 3, 3, 6);
+ CheckSkipRows("\t\n\t\n\t\n\t", 4, 3, 6);
+
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 1, 1, 3);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 2, 2, 5);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 3, 3, 7);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 4, 4, 10);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 5, 4, 10);
+
+ CheckSkipRows("a\r\nb\nc\rd\r", 4, 4, 9);
+ CheckSkipRows("a\r\nb\nc\rd\r", 5, 4, 9);
+ CheckSkipRows("a\r\nb\nc\rd\re", 4, 4, 9);
+ CheckSkipRows("a\r\nb\nc\rd\re", 5, 4, 9);
+
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 1, 1, 1);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 2, 2, 3);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 3, 3, 4);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 4, 4, 6);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 5, 5, 7);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 6, 6, 9);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 7, 7, 10);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 8, 7, 10);
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockParser tests
+
// Read the column with the given index out of the BlockParser.
void GetColumn(const BlockParser& parser, int32_t col_index,
std::vector<std::string>* out, std::vector<bool>* out_quoted =
nullptr) {
@@ -50,6 +101,24 @@ void GetColumn(const BlockParser& parser, int32_t col_index,
}
}
+void GetLastRow(const BlockParser& parser, std::vector<std::string>* out,
+ std::vector<bool>* out_quoted = nullptr) {
+ std::vector<std::string> values;
+ std::vector<bool> quoted_values;
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ values.push_back(std::string(reinterpret_cast<const char*>(data), size));
+ if (out_quoted) {
+ quoted_values.push_back(quoted);
+ }
+ return Status::OK();
+ };
+ ASSERT_OK(parser.VisitLastRow(visit));
+ *out = std::move(values);
+ if (out_quoted) {
+ *out_quoted = std::move(quoted_values);
+ }
+}
+
Status Parse(BlockParser& parser, const std::string& str, uint32_t* out_size) {
const char* data = str.data();
uint32_t size = static_cast<uint32_t>(str.length());
@@ -81,6 +150,23 @@ void AssertParsePartial(BlockParser& parser, const
std::string& str,
ASSERT_EQ(parsed_size, expected_size);
}
+void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string>
expected) {
+ std::vector<std::string> values;
+ GetLastRow(parser, &values);
+ ASSERT_EQ(parser.num_rows(), expected.size());
+ ASSERT_EQ(values, expected);
+}
+
+void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string>
expected,
+ const std::vector<bool> expected_quoted) {
+ std::vector<std::string> values;
+ std::vector<bool> quoted;
+  GetLastRow(parser, &values, &quoted);
+ ASSERT_EQ(parser.num_cols(), expected.size());
+ ASSERT_EQ(values, expected);
+ ASSERT_EQ(quoted, expected_quoted);
+}
+
void AssertColumnEq(const BlockParser& parser, int32_t col_index,
const std::vector<std::string> expected) {
std::vector<std::string> values;
@@ -129,6 +215,7 @@ TEST(BlockParser, Basics) {
BlockParser parser(ParseOptions::Defaults());
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh",
"kl"}});
+ AssertLastRowEq(parser, {"", "ij", "kl"}, {false, false, false});
}
TEST(BlockParser, EmptyHeader) {
@@ -152,12 +239,14 @@ TEST(BlockParser, Empty) {
BlockParser parser(ParseOptions::Defaults());
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}, {""}});
+ AssertLastRowEq(parser, {"", ""}, {false, false});
}
{
auto csv = MakeCSVData({",\n,\n"});
BlockParser parser(ParseOptions::Defaults());
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{"", ""}, {"", ""}});
+ AssertLastRowEq(parser, {"", ""}, {false, false});
}
}
@@ -315,18 +404,21 @@ TEST(BlockParser, QuotingEmpty) {
auto csv = MakeCSVData({"\"\"\n"});
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}}, {{true}} /* quoted */);
+ AssertLastRowEq(parser, {""}, {true});
}
{
BlockParser parser(ParseOptions::Defaults());
auto csv = MakeCSVData({",\"\"\n"});
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}, {""}}, {{false}, {true}} /* quoted */);
+ AssertLastRowEq(parser, {"", ""}, {false, true});
}
{
BlockParser parser(ParseOptions::Defaults());
auto csv = MakeCSVData({"\"\",\n"});
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}, {""}}, {{true}, {false}} /* quoted */);
+ AssertLastRowEq(parser, {"", ""}, {true, false});
}
}
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index 89c3f4c..7ae7603 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -40,6 +40,32 @@ static Status MismatchingColumns(int32_t expected, int32_t
actual) {
static inline bool IsControlChar(uint8_t c) { return c < ' '; }
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data) {
+ const auto end = data + size;
+ int32_t skipped_rows = 0;
+ *out_data = data;
+
+ for (; skipped_rows < num_rows; ++skipped_rows) {
+ uint8_t c;
+ do {
+ while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
+ ++data;
+ }
+ if (ARROW_PREDICT_FALSE(data == end)) {
+ return skipped_rows;
+ }
+ c = *data++;
+ } while (c != '\r' && c != '\n');
+ if (c == '\r' && data < end && *data == '\n') {
+ ++data;
+ }
+ *out_data = data;
+ }
+
+ return skipped_rows;
+}
+
template <bool Quoting, bool Escaping>
class SpecializedOptions {
public:
diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h
index fdddc37..60ad4c2 100644
--- a/cpp/src/arrow/csv/parser.h
+++ b/cpp/src/arrow/csv/parser.h
@@ -37,6 +37,13 @@ namespace csv {
constexpr int32_t kMaxParserNumRows = 100000;
+/// Skip at most num_rows from the given input. The input pointer is updated
+/// and the number of actually skipped rows is returned (may be less than
+/// requested if the input is too short).
+ARROW_EXPORT
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data);
+
/// \class BlockParser
/// \brief A reusable block-based parser for CSV data
///
@@ -96,6 +103,21 @@ class ARROW_EXPORT BlockParser {
return Status::OK();
}
+ template <typename Visitor>
+ Status VisitLastRow(Visitor&& visit) const {
+ const auto& values_buffer = values_buffers_.back();
+ const auto values = reinterpret_cast<const
ValueDesc*>(values_buffer->data());
+ const auto start_pos =
+ static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) -
num_cols_ - 1;
+ for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+ auto start = values[start_pos + col_index].offset;
+ auto stop = values[start_pos + col_index + 1].offset;
+ auto quoted = values[start_pos + col_index + 1].quoted;
+ ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+ }
+ return Status::OK();
+ }
+
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index ec4d179..1ef2d39 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -145,34 +145,55 @@ class BaseTableReader : public csv::TableReader {
// Read header and column names from current block, create column builders
Status ProcessHeader() {
DCHECK_GT(cur_size_, 0);
- if (parse_options_.header_rows == 0) {
- // TODO allow passing names and/or generate column numbers?
- return Status::Invalid("header_rows == 0 needs explicit column names");
- }
-
- BlockParser parser(pool_, parse_options_, num_cols_,
parse_options_.header_rows);
- uint32_t parsed_size = 0;
- RETURN_NOT_OK(parser.Parse(reinterpret_cast<const char*>(cur_data_),
- static_cast<uint32_t>(cur_size_),
&parsed_size));
- if (parser.num_rows() != parse_options_.header_rows) {
- return Status::Invalid(
- "Could not read header rows from CSV file, either "
- "file is too short or header is larger than block size");
- }
- if (parser.num_cols() == 0) {
- return Status::Invalid("No columns in CSV file");
+ if (read_options_.skip_rows) {
+ // Skip initial rows (potentially invalid CSV data)
+ auto data = cur_data_;
+ auto num_skipped_rows = SkipRows(cur_data_,
static_cast<uint32_t>(cur_size_),
+ read_options_.skip_rows, &data);
+ cur_size_ -= data - cur_data_;
+ cur_data_ = data;
+ if (num_skipped_rows < read_options_.skip_rows) {
+ return Status::Invalid(
+ "Could not skip initial ", read_options_.skip_rows,
+ " rows from CSV file, "
+ "either file is too short or header is larger than block size");
+ }
}
- num_cols_ = parser.num_cols();
- DCHECK_GT(num_cols_, 0);
- for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+ if (read_options_.column_names.empty()) {
+ // Read one row with column names
+ BlockParser parser(pool_, parse_options_, num_cols_, 1);
+ uint32_t parsed_size = 0;
+ RETURN_NOT_OK(parser.Parse(reinterpret_cast<const char*>(cur_data_),
+ static_cast<uint32_t>(cur_size_),
&parsed_size));
+ if (parser.num_rows() != 1) {
+ return Status::Invalid(
+ "Could not read column names from CSV file, either "
+ "file is too short or header is larger than block size");
+ }
+ if (parser.num_cols() == 0) {
+ return Status::Invalid("No columns in CSV file");
+ }
+ // Read column names from last header row
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) ->
Status {
- DCHECK_EQ(column_names_.size(), static_cast<uint32_t>(col_index));
column_names_.emplace_back(reinterpret_cast<const char*>(data), size);
return Status::OK();
};
- RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+ RETURN_NOT_OK(parser.VisitLastRow(visit));
+ DCHECK_EQ(static_cast<size_t>(parser.num_cols()), column_names_.size());
+ // Skip parsed header row
+ cur_data_ += parsed_size;
+ cur_size_ -= parsed_size;
+ } else {
+ column_names_ = read_options_.column_names;
+ }
+
+ num_cols_ = static_cast<int32_t>(column_names_.size());
+ DCHECK_GT(num_cols_, 0);
+
+ // Construct column builders
+ for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
std::shared_ptr<ColumnBuilder> builder;
// Does the named column have a fixed type?
auto it = convert_options_.column_types.find(column_names_[col_index]);
@@ -186,9 +207,6 @@ class BaseTableReader : public csv::TableReader {
column_builders_.push_back(builder);
}
- // Skip parsed header rows
- cur_data_ += parsed_size;
- cur_size_ -= parsed_size;
return Status::OK();
}
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 067b830..93e9cb3 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -51,6 +51,12 @@ cdef class ReadOptions:
How many bytes to process at a time from the input stream.
This will determine multi-threading granularity as well as
the size of individual chunks in the Table.
+ skip_rows: int, optional (default 0)
+ The number of rows to skip at the start of the CSV data, not
+ including the row of column names (if any).
+ column_names: list, optional
+ The Table column names. If empty, column names will be
+ read from the first row after `skip_rows`.
"""
cdef:
CCSVReadOptions options
@@ -58,12 +64,17 @@ cdef class ReadOptions:
# Avoid mistakenly creating attributes
__slots__ = ()
- def __init__(self, use_threads=None, block_size=None):
+ def __init__(self, use_threads=None, block_size=None, skip_rows=None,
+ column_names=None):
self.options = CCSVReadOptions.Defaults()
if use_threads is not None:
self.use_threads = use_threads
if block_size is not None:
self.block_size = block_size
+ if skip_rows is not None:
+ self.skip_rows = skip_rows
+ if column_names is not None:
+ self.column_names = column_names
@property
def use_threads(self):
@@ -89,6 +100,32 @@ cdef class ReadOptions:
def block_size(self, value):
self.options.block_size = value
+ @property
+ def skip_rows(self):
+ """
+ The number of rows to skip at the start of the CSV data, not
+ including the row of column names (if any).
+ """
+ return self.options.skip_rows
+
+ @skip_rows.setter
+ def skip_rows(self, value):
+ self.options.skip_rows = value
+
+ @property
+ def column_names(self):
+ """
+ The Table column names. If empty, column names will be
+ read from the first row after `skip_rows`.
+ """
+ return [frombytes(s) for s in self.options.column_names]
+
+ @column_names.setter
+ def column_names(self, value):
+ self.options.column_names.clear()
+ for item in value:
+ self.options.column_names.push_back(tobytes(item))
+
cdef class ParseOptions:
"""
@@ -107,8 +144,6 @@ cdef class ParseOptions:
escape_char: 1-character string or False, optional (default False)
The character used optionally for escaping special characters
(False if escaping is not allowed).
- header_rows: int, optional (default 1)
- The number of rows to skip at the start of the CSV data.
newlines_in_values: bool, optional (default False)
Whether newline characters are allowed in CSV values.
Setting this to True reduces the performance of multi-threaded
@@ -124,7 +159,7 @@ cdef class ParseOptions:
__slots__ = ()
def __init__(self, delimiter=None, quote_char=None, double_quote=None,
- escape_char=None, header_rows=None, newlines_in_values=None,
+ escape_char=None, newlines_in_values=None,
ignore_empty_lines=None):
self.options = CCSVParseOptions.Defaults()
if delimiter is not None:
@@ -135,8 +170,6 @@ cdef class ParseOptions:
self.double_quote = double_quote
if escape_char is not None:
self.escape_char = escape_char
- if header_rows is not None:
- self.header_rows = header_rows
if newlines_in_values is not None:
self.newlines_in_values = newlines_in_values
if ignore_empty_lines is not None:
@@ -204,17 +237,6 @@ cdef class ParseOptions:
self.options.escaping = True
@property
- def header_rows(self):
- """
- The number of rows to skip at the start of the CSV data.
- """
- return self.options.header_rows
-
- @header_rows.setter
- def header_rows(self, value):
- self.options.header_rows = value
-
- @property
def newlines_in_values(self):
"""
Whether newline characters are allowed in CSV values.
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 282572e..3a5a26b 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1056,7 +1056,6 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv"
nogil:
c_bool double_quote
c_bool escaping
unsigned char escape_char
- int32_t header_rows
c_bool newlines_in_values
c_bool ignore_empty_lines
@@ -1077,6 +1076,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv"
nogil:
cdef cppclass CCSVReadOptions" arrow::csv::ReadOptions":
c_bool use_threads
int32_t block_size
+ int32_t skip_rows
+ vector[c_string] column_names
@staticmethod
CCSVReadOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 9f0c08b..a8d7998 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -72,9 +72,20 @@ def test_read_options():
opts.use_threads = False
assert opts.use_threads is False
- opts = cls(block_size=1234, use_threads=False)
+ assert opts.skip_rows == 0
+ opts.skip_rows = 3
+ assert opts.skip_rows == 3
+
+ assert opts.column_names == []
+ opts.column_names = ["ab", "cd"]
+ assert opts.column_names == ["ab", "cd"]
+
+ opts = cls(block_size=1234, use_threads=False, skip_rows=42,
+ column_names=["a", "b", "c"])
assert opts.block_size == 1234
assert opts.use_threads is False
+ assert opts.skip_rows == 42
+ assert opts.column_names == ["a", "b", "c"]
def test_parse_options():
@@ -84,7 +95,6 @@ def test_parse_options():
assert opts.quote_char == '"'
assert opts.double_quote is True
assert opts.escape_char is False
- assert opts.header_rows == 1
assert opts.newlines_in_values is False
assert opts.ignore_empty_lines is True
@@ -110,17 +120,13 @@ def test_parse_options():
opts.ignore_empty_lines = False
assert opts.ignore_empty_lines is False
- opts.header_rows = 2
- assert opts.header_rows == 2
-
opts = cls(delimiter=';', quote_char='%', double_quote=False,
- escape_char='\\', header_rows=2, newlines_in_values=True,
+ escape_char='\\', newlines_in_values=True,
ignore_empty_lines=False)
assert opts.delimiter == ';'
assert opts.quote_char == '%'
assert opts.double_quote is False
assert opts.escape_char == '\\'
- assert opts.header_rows == 2
assert opts.newlines_in_values is True
assert opts.ignore_empty_lines is False
@@ -214,6 +220,92 @@ class BaseTestCSVRead:
table = self.read_bytes(rows)
assert table.to_pydict() == expected_data
+ def test_header_skip_rows(self):
+ rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
+
+ opts = ReadOptions()
+ opts.skip_rows = 1
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["ef", "gh"])
+ assert table.to_pydict() == {
+ "ef": ["ij", "mn"],
+ "gh": ["kl", "op"],
+ }
+
+ opts.skip_rows = 3
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["mn", "op"])
+ assert table.to_pydict() == {
+ "mn": [],
+ "op": [],
+ }
+
+ opts.skip_rows = 4
+ with pytest.raises(pa.ArrowInvalid):
+ # Not enough rows
+ table = self.read_bytes(rows, read_options=opts)
+
+ # Can skip rows with a different number of columns
+ rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
+ opts.skip_rows = 2
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["ij", "kl"])
+ assert table.to_pydict() == {
+ "ij": ["mn"],
+ "kl": ["op"],
+ }
+
+ def test_header_column_names(self):
+ rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
+
+ opts = ReadOptions()
+ opts.column_names = ["x", "y"]
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": ["ab", "ef", "ij", "mn"],
+ "y": ["cd", "gh", "kl", "op"],
+ }
+
+ opts.skip_rows = 3
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": ["mn"],
+ "y": ["op"],
+ }
+
+ opts.skip_rows = 4
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": [],
+ "y": [],
+ }
+
+ opts.skip_rows = 5
+ with pytest.raises(pa.ArrowInvalid):
+ # Not enough rows
+ table = self.read_bytes(rows, read_options=opts)
+
+ # Unexpected number of columns
+ opts.skip_rows = 0
+ opts.column_names = ["x", "y", "z"]
+ with pytest.raises(pa.ArrowInvalid,
+ match="Expected 3 columns, got 2"):
+ table = self.read_bytes(rows, read_options=opts)
+
+ # Can skip rows with a different number of columns
+ rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
+ opts.skip_rows = 2
+ opts.column_names = ["x", "y"]
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": ["ij", "mn"],
+ "y": ["kl", "op"],
+ }
+
def test_simple_ints(self):
# Infer integer columns
rows = b"a,b,c\n1,2,3\n4,5,6\n"
diff --git a/r/R/csv.R b/r/R/csv.R
index 6ac0118..bf69830 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -42,16 +42,16 @@
#' characters? This is more general than `escape_double` as backslashes
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like `\\n`.
-# #' @param col_names If `TRUE`, the first row of the input will be used as the
-# #' column names and will not be included in the data frame. Note that `FALSE`
-# #' is not currently supported, nor is specifying a character vector of column
-# #' names.
+#' @param col_names If `TRUE`, the first row of the input will be used as the
+#' column names and will not be included in the data frame. (Note that `FALSE`
+#' is not currently supported.) Alternatively, you can specify a character
+#' vector of column names.
#' @param col_select A [tidy selection specification][tidyselect::vars_select]
#' of columns, as used in `dplyr::select()`.
#' @param skip_empty_rows Should blank rows be ignored altogether? If
#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#' filled with missings.
-# #' @param skip Number of lines to skip before reading data.
+#' @param skip Number of lines to skip before reading data.
#' @param parse_options see [csv_parse_options()]. If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
#' @param convert_options see [csv_convert_options()]
@@ -66,39 +66,41 @@ read_delim_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- # col_names = TRUE,
+ col_names = TRUE,
# col_types = TRUE,
col_select = NULL,
# na = c("", "NA"),
# quoted_na = TRUE,
skip_empty_rows = TRUE,
- # skip = 0L,
+ skip = 0L,
parse_options = NULL,
convert_options = NULL,
- read_options = csv_read_options(),
+ read_options = NULL,
as_tibble = TRUE) {
- # These are hardcoded pending
https://issues.apache.org/jira/browse/ARROW-5747
- col_names <- TRUE
- skip <- 0L
-
+ if (identical(col_names, FALSE)) {
+ stop("Not implemented", call.=FALSE)
+ }
if (is.null(parse_options)) {
- if (isTRUE(col_names)) {
- # Add one row to skip, to match arrow's header_rows
- skip <- skip + 1L
- # Note that with the hardcoding, header_rows is always 1, which
- # turns out to be the only value that works meaningfully
- }
parse_options <- readr_to_csv_parse_options(
delim,
quote,
escape_double,
escape_backslash,
- skip_empty_rows,
- skip
+ skip_empty_rows
)
}
+ if (is.null(read_options)) {
+ if (isTRUE(col_names)) {
+ # C++ default to parse is 0-length string array
+ col_names <- character(0)
+ }
+ read_options <- csv_read_options(
+ skip_rows = skip,
+ column_names = col_names
+ )
+ }
if (is.null(convert_options)) {
# TODO:
# * na strings (needs wiring in csv_convert_options)
@@ -117,10 +119,6 @@ read_delim_arrow <- function(file,
)
tab <- reader$Read()$select(!!enquo(col_select))
- if (is.character(col_names)) {
- # TODO: Rename `tab`'s columns
- # See https://github.com/apache/arrow/pull/4557
- }
if (isTRUE(as_tibble)) {
tab <- as.data.frame(tab)
@@ -135,16 +133,16 @@ read_csv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- # col_names = TRUE,
+ col_names = TRUE,
# col_types = TRUE,
col_select = NULL,
# na = c("", "NA"),
# quoted_na = TRUE,
skip_empty_rows = TRUE,
- # skip = 0L,
+ skip = 0L,
parse_options = NULL,
convert_options = NULL,
- read_options = csv_read_options(),
+ read_options = NULL,
as_tibble = TRUE) {
mc <- match.call()
@@ -159,16 +157,16 @@ read_tsv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- # col_names = TRUE,
+ col_names = TRUE,
# col_types = TRUE,
col_select = NULL,
# na = c("", "NA"),
# quoted_na = TRUE,
skip_empty_rows = TRUE,
- # skip = 0L,
+ skip = 0L,
parse_options = NULL,
convert_options = NULL,
- read_options = csv_read_options(),
+ read_options = NULL,
as_tibble = TRUE) {
mc <- match.call()
@@ -192,15 +190,25 @@ read_tsv_arrow <- function(file,
#' Read options for the Arrow file readers
#'
#' @param use_threads Whether to use the global CPU thread pool
-#' @param block_size Block size we request from the IO layer; also determines
the size of chunks when use_threads is `TRUE`. NB: if false, JSON input must
end with an empty line
+#' @param block_size Block size we request from the IO layer; also determines
+#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input
+#' must end with an empty line.
+#' @param skip_rows Number of lines to skip before reading data.
+#' @param column_names Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names.
#'
#' @export
csv_read_options <- function(use_threads = option_use_threads(),
- block_size = 1048576L) {
+ block_size = 1048576L,
+ skip_rows = 0L,
+ column_names = character(0)) {
shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize(
list(
use_threads = use_threads,
- block_size = block_size
+ block_size = block_size,
+ skip_rows = skip_rows,
+ column_names = column_names
)
))
}
@@ -209,8 +217,7 @@ readr_to_csv_parse_options <- function(delim = ",",
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- skip_empty_rows = TRUE,
- skip = 0L) {
+ skip_empty_rows = TRUE) {
# This function translates from the readr argument list to the arrow arg
names
# TODO: validate inputs
csv_parse_options(
@@ -221,8 +228,7 @@ readr_to_csv_parse_options <- function(delim = ",",
escaping = escape_backslash,
escape_char = '\\',
newlines_in_values = escape_backslash,
- ignore_empty_lines = skip_empty_rows,
- header_rows = skip
+ ignore_empty_lines = skip_empty_rows
)
}
@@ -236,7 +242,6 @@ readr_to_csv_parse_options <- function(delim = ",",
#' @param escape_char Escaping character (if `escaping` is `TRUE`)
#' @param newlines_in_values Whether values are allowed to contain CR (`0x0d`)
and LF (`0x0a`) characters
#' @param ignore_empty_lines Whether empty lines are ignored. If `FALSE`, an
empty line represents
-#' @param header_rows Number of header rows to skip (including the first row
containing column names)
#'
#' @export
csv_parse_options <- function(delimiter = ",",
@@ -246,8 +251,7 @@ csv_parse_options <- function(delimiter = ",",
escaping = FALSE,
escape_char = '\\',
newlines_in_values = FALSE,
- ignore_empty_lines = TRUE,
- header_rows = 1L) {
+ ignore_empty_lines = TRUE) {
shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize(
list(
@@ -258,8 +262,7 @@ csv_parse_options <- function(delimiter = ",",
escaping = escaping,
escape_char = escape_char,
newlines_in_values = newlines_in_values,
- ignore_empty_lines = ignore_empty_lines,
- header_rows = header_rows
+ ignore_empty_lines = ignore_empty_lines
)
))
}
diff --git a/r/README.md b/r/README.md
index 43280f3..47458cf 100644
--- a/r/README.md
+++ b/r/README.md
@@ -48,14 +48,6 @@ library.
``` r
library(arrow)
-#>
-#> Attaching package: 'arrow'
-#> The following object is masked from 'package:utils':
-#>
-#> timestamp
-#> The following objects are masked from 'package:base':
-#>
-#> array, table
set.seed(24)
tab <- arrow::table(x = 1:10, y = rnorm(10))
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
index 17c5ba2..a46cfb3 100644
--- a/r/man/csv_parse_options.Rd
+++ b/r/man/csv_parse_options.Rd
@@ -8,7 +8,7 @@
csv_parse_options(delimiter = ",", quoting = TRUE,
quote_char = "\\"", double_quote = TRUE, escaping = FALSE,
escape_char = "\\\\", newlines_in_values = FALSE,
- ignore_empty_lines = TRUE, header_rows = 1L)
+ ignore_empty_lines = TRUE)
json_parse_options(newlines_in_values = FALSE)
}
@@ -28,8 +28,6 @@ json_parse_options(newlines_in_values = FALSE)
\item{newlines_in_values}{Whether values are allowed to contain CR
(\code{0x0d}) and LF (\code{0x0a}) characters}
\item{ignore_empty_lines}{Whether empty lines are ignored. If \code{FALSE},
an empty line represents a row of missing values.}
-
-\item{header_rows}{Number of header rows to skip (including the first row
containing column names)}
}
\description{
Parsing options for Arrow file readers
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
index ddfc9d1..38b6e47 100644
--- a/r/man/csv_read_options.Rd
+++ b/r/man/csv_read_options.Rd
@@ -6,14 +6,22 @@
\title{Read options for the Arrow file readers}
\usage{
csv_read_options(use_threads = option_use_threads(),
- block_size = 1048576L)
+ block_size = 1048576L, skip_rows = 0L, column_names = character(0))
json_read_options(use_threads = TRUE, block_size = 1048576L)
}
\arguments{
\item{use_threads}{Whether to use the global CPU thread pool}
-\item{block_size}{Block size we request from the IO layer; also determines the
size of chunks when use_threads is \code{TRUE}. NB: if false, JSON input must
end with an empty line}
+\item{block_size}{Block size we request from the IO layer; also determines
+the size of chunks when \code{use_threads} is \code{TRUE}. NB: if
+\code{use_threads} is \code{FALSE}, JSON input must end with an empty line.}
+
+\item{skip_rows}{Number of lines to skip before reading data.}
+
+\item{column_names}{Character vector to supply column names. If length-0
+(the default), the first non-skipped row will be parsed to generate column
+names.}
}
\description{
Read options for the Arrow file readers
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index ff732ae..0726889 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -7,22 +7,20 @@
\title{Read a CSV or other delimited file with Arrow}
\usage{
read_delim_arrow(file, delim = ",", quote = "\\"",
- escape_double = TRUE, escape_backslash = FALSE, col_select = NULL,
- skip_empty_rows = TRUE, parse_options = NULL,
- convert_options = NULL, read_options = csv_read_options(),
+ escape_double = TRUE, escape_backslash = FALSE, col_names = TRUE,
+ col_select = NULL, skip_empty_rows = TRUE, skip = 0L,
+ parse_options = NULL, convert_options = NULL, read_options = NULL,
as_tibble = TRUE)
read_csv_arrow(file, quote = "\\"", escape_double = TRUE,
- escape_backslash = FALSE, col_select = NULL,
- skip_empty_rows = TRUE, parse_options = NULL,
- convert_options = NULL, read_options = csv_read_options(),
- as_tibble = TRUE)
+ escape_backslash = FALSE, col_names = TRUE, col_select = NULL,
+ skip_empty_rows = TRUE, skip = 0L, parse_options = NULL,
+ convert_options = NULL, read_options = NULL, as_tibble = TRUE)
read_tsv_arrow(file, quote = "\\"", escape_double = TRUE,
- escape_backslash = FALSE, col_select = NULL,
- skip_empty_rows = TRUE, parse_options = NULL,
- convert_options = NULL, read_options = csv_read_options(),
- as_tibble = TRUE)
+ escape_backslash = FALSE, col_names = TRUE, col_select = NULL,
+ skip_empty_rows = TRUE, skip = 0L, parse_options = NULL,
+ convert_options = NULL, read_options = NULL, as_tibble = TRUE)
}
\arguments{
\item{file}{A character path to a local file, or an Arrow input stream}
@@ -40,6 +38,11 @@ characters? This is more general than \code{escape_double}
as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like \code{\\n}.}
+\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
+column names and will not be included in the data frame. (Note that
\code{FALSE}
+is not currently supported.) Alternatively, you can specify a character
+vector of column names.}
+
\item{col_select}{A \link[tidyselect:vars_select]{tidy selection specification}
of columns, as used in \code{dplyr::select()}.}
@@ -47,6 +50,8 @@ of columns, as used in \code{dplyr::select()}.}
\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they
will be
filled with missings.}
+\item{skip}{Number of lines to skip before reading data.}
+
\item{parse_options}{see
\code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides
any
parsing options provided in other arguments (e.g. \code{delim}, \code{quote},
etc.).}
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index bfcbae7..6165636 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -28,6 +28,8 @@ std::shared_ptr<arrow::csv::ReadOptions>
csv___ReadOptions__initialize(List_ opt
std::make_shared<arrow::csv::ReadOptions>(arrow::csv::ReadOptions::Defaults());
res->use_threads = options["use_threads"];
res->block_size = options["block_size"];
+ res->skip_rows = options["skip_rows"];
+ res->column_names =
Rcpp::as<std::vector<std::string>>(options["column_names"]);
return res;
}
@@ -43,7 +45,6 @@ std::shared_ptr<arrow::csv::ParseOptions>
csv___ParseOptions__initialize(List_ o
res->double_quote = options["double_quote"];
res->escape_char = get_char(options["escape_char"]);
res->newlines_in_values = options["newlines_in_values"];
- res->header_rows = options["header_rows"];
res->ignore_empty_lines = options["ignore_empty_lines"];
return res;
}
diff --git a/r/tests/testthat/test-arrow-csv.R
b/r/tests/testthat/test-arrow-csv.R
index aed9638..81e35b3 100644
--- a/r/tests/testthat/test-arrow-csv.R
+++ b/r/tests/testthat/test-arrow-csv.R
@@ -81,29 +81,39 @@ test_that("read_delim_arrow parsing options: quote", {
})
test_that("read_csv_arrow parsing options: col_names", {
- skip("Invalid: Empty CSV file")
tf <- tempfile()
on.exit(unlink(tf))
+ # Writing the CSV without the header
write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE)
- tab1 <- read_csv_arrow(tf, col_names = FALSE)
+
+ expect_error(read_csv_arrow(tf, col_names = FALSE), "Not implemented")
+
+ tab1 <- read_csv_arrow(tf, col_names = names(iris))
expect_identical(names(tab1), names(iris))
iris$Species <- as.character(iris$Species)
expect_equivalent(iris, tab1)
+
+ # This errors (correctly) because I haven't given enough names
+ # but the error message is "Invalid: Empty CSV file", which is not accurate
+ expect_error(
+ read_csv_arrow(tf, col_names = names(iris)[1])
+ )
+ # Same here
+ expect_error(
+ read_csv_arrow(tf, col_names = c(names(iris), names(iris)))
+ )
})
test_that("read_csv_arrow parsing options: skip", {
- skip("Invalid: Empty CSV file")
tf <- tempfile()
on.exit(unlink(tf))
+ # Adding two garbage lines to start the csv
cat("asdf\nqwer\n", file = tf)
suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append
= TRUE))
- # This works:
- # print(head(readr::read_csv(tf, skip = 2)))
- # This errors:
tab1 <- read_csv_arrow(tf, skip = 2)
expect_identical(names(tab1), names(iris))