This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f35dd92 ARROW-5747: [C++] Improve CSV header and column names options
f35dd92 is described below
commit f35dd92ecb5178a798d747d8728ee6343385a565
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jul 24 14:26:36 2019 +0200
ARROW-5747: [C++] Improve CSV header and column names options
The `header_rows` option is removed (it seemed untested and probably didn't
work well)
and replaced with a `skip_rows` option that dictates how many rows are
skipped at
the start of the CSV file (default 0).
Furthermore, a `column_names` option is added to hardcode the column names.
If empty (default), they are read from the first non-skipped row in the CSV
file.
Closes #4898 from pitrou/ARROW-5747-csv-column-names and squashes the
following commits:
f42840f00 <Antoine Pitrou> Fix downcast
d1381b3bd <Antoine Pitrou> Implement skip_rows + column_names in R
36f2743c5 <Antoine Pitrou> Fix expected semantics for skip_rows
410805254 <Antoine Pitrou> ARROW-5747: Improve header and column names
options
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
c_glib/arrow-glib/reader.cpp | 27 ---------
cpp/src/arrow/csv/options.h | 9 +--
cpp/src/arrow/csv/parser-test.cc | 92 ++++++++++++++++++++++++++++++
cpp/src/arrow/csv/parser.cc | 26 +++++++++
cpp/src/arrow/csv/parser.h | 22 ++++++++
cpp/src/arrow/csv/reader.cc | 66 ++++++++++++++--------
python/pyarrow/_csv.pyx | 56 ++++++++++++------
python/pyarrow/includes/libarrow.pxd | 3 +-
python/pyarrow/tests/test_csv.py | 106 ++++++++++++++++++++++++++++++++---
r/R/csv.R | 87 ++++++++++++++--------------
r/README.md | 8 ---
r/man/csv_parse_options.Rd | 4 +-
r/man/csv_read_options.Rd | 12 +++-
r/man/read_delim_arrow.Rd | 27 +++++----
r/src/csv.cpp | 3 +-
r/tests/testthat/test-arrow-csv.R | 22 ++++++--
16 files changed, 417 insertions(+), 153 deletions(-)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 7783362..fdc8ffe 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -902,7 +902,6 @@ enum {
PROP_ESCAPE_CHARACTER,
PROP_ALLOW_NEWLINES_IN_VALUES,
PROP_IGNORE_EMPTY_LINES,
- PROP_N_HEADER_ROWS,
PROP_CHECK_UTF8,
PROP_ALLOW_NULL_STRINGS
};
@@ -955,9 +954,6 @@ garrow_csv_read_options_set_property(GObject *object,
case PROP_IGNORE_EMPTY_LINES:
priv->parse_options.ignore_empty_lines = g_value_get_boolean(value);
break;
- case PROP_N_HEADER_ROWS:
- priv->parse_options.header_rows = g_value_get_uint(value);
- break;
case PROP_CHECK_UTF8:
priv->convert_options.check_utf8 = g_value_get_boolean(value);
break;
@@ -1009,9 +1005,6 @@ garrow_csv_read_options_get_property(GObject *object,
case PROP_IGNORE_EMPTY_LINES:
g_value_set_boolean(value, priv->parse_options.ignore_empty_lines);
break;
- case PROP_N_HEADER_ROWS:
- g_value_set_uint(value, priv->parse_options.header_rows);
- break;
case PROP_CHECK_UTF8:
g_value_set_boolean(value, priv->convert_options.check_utf8);
break;
@@ -1210,26 +1203,6 @@
garrow_csv_read_options_class_init(GArrowCSVReadOptionsClass *klass)
PROP_IGNORE_EMPTY_LINES,
spec);
- /**
- * GArrowCSVReadOptions:n-header-rows:
- *
- * The number of header rows to skip (including the first row
- * containing column names)
- *
- * Since: 0.12.0
- */
- spec = g_param_spec_uint("n-header-rows",
- "N header rows",
- "The number of header rows to skip "
- "(including the first row containing column names",
- 0,
- G_MAXUINT,
- parse_options.header_rows,
- static_cast<GParamFlags>(G_PARAM_READWRITE));
- g_object_class_install_property(gobject_class,
- PROP_N_HEADER_ROWS,
- spec);
-
auto convert_options = arrow::csv::ConvertOptions::Defaults();
/**
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 9cd312a..21d0ab2 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -53,10 +53,6 @@ struct ARROW_EXPORT ParseOptions {
// a single empty value (assuming a one-column CSV file).
bool ignore_empty_lines = true;
- // XXX Should this be in ReadOptions?
- // Number of header rows to skip (including the first row containing column
names)
- int32_t header_rows = 1;
-
static ParseOptions Defaults();
};
@@ -89,6 +85,11 @@ struct ARROW_EXPORT ReadOptions {
// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB
+ // Number of header rows to skip (not including the row of column names, if
any)
+ int32_t skip_rows = 0;
+ // Column names (if empty, will be read from first row after `skip_rows`)
+ std::vector<std::string> column_names;
+
static ReadOptions Defaults();
};
diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc
index d1790b2..f379621 100644
--- a/cpp/src/arrow/csv/parser-test.cc
+++ b/cpp/src/arrow/csv/parser-test.cc
@@ -31,6 +31,57 @@
namespace arrow {
namespace csv {
+void CheckSkipRows(const std::string& rows, int32_t num_rows,
+ int32_t expected_skipped_rows, int32_t
expected_skipped_bytes) {
+ const uint8_t* start = reinterpret_cast<const uint8_t*>(rows.data());
+ const uint8_t* data;
+ int32_t skipped_rows =
+ SkipRows(start, static_cast<int32_t>(rows.size()), num_rows, &data);
+ ASSERT_EQ(skipped_rows, expected_skipped_rows);
+ ASSERT_EQ(data - start, expected_skipped_bytes);
+}
+
+TEST(SkipRows, Basics) {
+ CheckSkipRows("", 0, 0, 0);
+ CheckSkipRows("", 15, 0, 0);
+
+ CheckSkipRows("a\nb\nc\nd", 1, 1, 2);
+ CheckSkipRows("a\nb\nc\nd", 2, 2, 4);
+ CheckSkipRows("a\nb\nc\nd", 3, 3, 6);
+ CheckSkipRows("a\nb\nc\nd", 4, 3, 6);
+
+ CheckSkipRows("a\nb\nc\nd\n", 3, 3, 6);
+ CheckSkipRows("a\nb\nc\nd\n", 4, 4, 8);
+ CheckSkipRows("a\nb\nc\nd\n", 5, 4, 8);
+
+ CheckSkipRows("\t\n\t\n\t\n\t", 1, 1, 2);
+ CheckSkipRows("\t\n\t\n\t\n\t", 3, 3, 6);
+ CheckSkipRows("\t\n\t\n\t\n\t", 4, 3, 6);
+
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 1, 1, 3);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 2, 2, 5);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 3, 3, 7);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 4, 4, 10);
+ CheckSkipRows("a\r\nb\nc\rd\r\n", 5, 4, 10);
+
+ CheckSkipRows("a\r\nb\nc\rd\r", 4, 4, 9);
+ CheckSkipRows("a\r\nb\nc\rd\r", 5, 4, 9);
+ CheckSkipRows("a\r\nb\nc\rd\re", 4, 4, 9);
+ CheckSkipRows("a\r\nb\nc\rd\re", 5, 4, 9);
+
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 1, 1, 1);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 2, 2, 3);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 3, 3, 4);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 4, 4, 6);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 5, 5, 7);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 6, 6, 9);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 7, 7, 10);
+ CheckSkipRows("\n\r\n\r\r\n\n\r\n\r", 8, 7, 10);
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockParser tests
+
// Read the column with the given index out of the BlockParser.
void GetColumn(const BlockParser& parser, int32_t col_index,
std::vector<std::string>* out, std::vector<bool>* out_quoted =
nullptr) {
@@ -50,6 +101,24 @@ void GetColumn(const BlockParser& parser, int32_t col_index,
}
}
+void GetLastRow(const BlockParser& parser, std::vector<std::string>* out,
+ std::vector<bool>* out_quoted = nullptr) {
+ std::vector<std::string> values;
+ std::vector<bool> quoted_values;
+ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
+ values.push_back(std::string(reinterpret_cast<const char*>(data), size));
+ if (out_quoted) {
+ quoted_values.push_back(quoted);
+ }
+ return Status::OK();
+ };
+ ASSERT_OK(parser.VisitLastRow(visit));
+ *out = std::move(values);
+ if (out_quoted) {
+ *out_quoted = std::move(quoted_values);
+ }
+}
+
Status Parse(BlockParser& parser, const std::string& str, uint32_t* out_size) {
const char* data = str.data();
uint32_t size = static_cast<uint32_t>(str.length());
@@ -81,6 +150,23 @@ void AssertParsePartial(BlockParser& parser, const
std::string& str,
ASSERT_EQ(parsed_size, expected_size);
}
+void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string>
expected) {
+ std::vector<std::string> values;
+ GetLastRow(parser, &values);
+ ASSERT_EQ(parser.num_rows(), expected.size());
+ ASSERT_EQ(values, expected);
+}
+
+void AssertLastRowEq(const BlockParser& parser, const std::vector<std::string>
expected,
+ const std::vector<bool> expected_quoted) {
+ std::vector<std::string> values;
+ std::vector<bool> quoted;
+  GetLastRow(parser, &values, &quoted);
+ ASSERT_EQ(parser.num_cols(), expected.size());
+ ASSERT_EQ(values, expected);
+ ASSERT_EQ(quoted, expected_quoted);
+}
+
void AssertColumnEq(const BlockParser& parser, int32_t col_index,
const std::vector<std::string> expected) {
std::vector<std::string> values;
@@ -129,6 +215,7 @@ TEST(BlockParser, Basics) {
BlockParser parser(ParseOptions::Defaults());
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh",
"kl"}});
+ AssertLastRowEq(parser, {"", "ij", "kl"}, {false, false, false});
}
TEST(BlockParser, EmptyHeader) {
@@ -152,12 +239,14 @@ TEST(BlockParser, Empty) {
BlockParser parser(ParseOptions::Defaults());
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}, {""}});
+ AssertLastRowEq(parser, {"", ""}, {false, false});
}
{
auto csv = MakeCSVData({",\n,\n"});
BlockParser parser(ParseOptions::Defaults());
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{"", ""}, {"", ""}});
+ AssertLastRowEq(parser, {"", ""}, {false, false});
}
}
@@ -315,18 +404,21 @@ TEST(BlockParser, QuotingEmpty) {
auto csv = MakeCSVData({"\"\"\n"});
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}}, {{true}} /* quoted */);
+ AssertLastRowEq(parser, {""}, {true});
}
{
BlockParser parser(ParseOptions::Defaults());
auto csv = MakeCSVData({",\"\"\n"});
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}, {""}}, {{false}, {true}} /* quoted */);
+ AssertLastRowEq(parser, {"", ""}, {false, true});
}
{
BlockParser parser(ParseOptions::Defaults());
auto csv = MakeCSVData({"\"\",\n"});
AssertParseOk(parser, csv);
AssertColumnsEq(parser, {{""}, {""}}, {{true}, {false}} /* quoted */);
+ AssertLastRowEq(parser, {"", ""}, {true, false});
}
}
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index 89c3f4c..7ae7603 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -40,6 +40,32 @@ static Status MismatchingColumns(int32_t expected, int32_t
actual) {
static inline bool IsControlChar(uint8_t c) { return c < ' '; }
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data) {
+ const auto end = data + size;
+ int32_t skipped_rows = 0;
+ *out_data = data;
+
+ for (; skipped_rows < num_rows; ++skipped_rows) {
+ uint8_t c;
+ do {
+ while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
+ ++data;
+ }
+ if (ARROW_PREDICT_FALSE(data == end)) {
+ return skipped_rows;
+ }
+ c = *data++;
+ } while (c != '\r' && c != '\n');
+ if (c == '\r' && data < end && *data == '\n') {
+ ++data;
+ }
+ *out_data = data;
+ }
+
+ return skipped_rows;
+}
+
template <bool Quoting, bool Escaping>
class SpecializedOptions {
public:
diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h
index fdddc37..60ad4c2 100644
--- a/cpp/src/arrow/csv/parser.h
+++ b/cpp/src/arrow/csv/parser.h
@@ -37,6 +37,13 @@ namespace csv {
constexpr int32_t kMaxParserNumRows = 100000;
+/// Skip at most num_rows from the given input. The input pointer is updated
+/// and the number of actually skipped rows is returned (may be less than
+/// requested if the input is too short).
+ARROW_EXPORT
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data);
+
/// \class BlockParser
/// \brief A reusable block-based parser for CSV data
///
@@ -96,6 +103,21 @@ class ARROW_EXPORT BlockParser {
return Status::OK();
}
+ template <typename Visitor>
+ Status VisitLastRow(Visitor&& visit) const {
+ const auto& values_buffer = values_buffers_.back();
+ const auto values = reinterpret_cast<const
ValueDesc*>(values_buffer->data());
+ const auto start_pos =
+ static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) -
num_cols_ - 1;
+ for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+ auto start = values[start_pos + col_index].offset;
+ auto stop = values[start_pos + col_index + 1].offset;
+ auto quoted = values[start_pos + col_index + 1].quoted;
+ ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+ }
+ return Status::OK();
+ }
+
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index ec4d179..1ef2d39 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -145,34 +145,55 @@ class BaseTableReader : public csv::TableReader {
// Read header and column names from current block, create column builders
Status ProcessHeader() {
DCHECK_GT(cur_size_, 0);
- if (parse_options_.header_rows == 0) {
- // TODO allow passing names and/or generate column numbers?
- return Status::Invalid("header_rows == 0 needs explicit column names");
- }
-
- BlockParser parser(pool_, parse_options_, num_cols_,
parse_options_.header_rows);
- uint32_t parsed_size = 0;
- RETURN_NOT_OK(parser.Parse(reinterpret_cast<const char*>(cur_data_),
- static_cast<uint32_t>(cur_size_),
&parsed_size));
- if (parser.num_rows() != parse_options_.header_rows) {
- return Status::Invalid(
- "Could not read header rows from CSV file, either "
- "file is too short or header is larger than block size");
- }
- if (parser.num_cols() == 0) {
- return Status::Invalid("No columns in CSV file");
+ if (read_options_.skip_rows) {
+ // Skip initial rows (potentially invalid CSV data)
+ auto data = cur_data_;
+ auto num_skipped_rows = SkipRows(cur_data_,
static_cast<uint32_t>(cur_size_),
+ read_options_.skip_rows, &data);
+ cur_size_ -= data - cur_data_;
+ cur_data_ = data;
+ if (num_skipped_rows < read_options_.skip_rows) {
+ return Status::Invalid(
+ "Could not skip initial ", read_options_.skip_rows,
+ " rows from CSV file, "
+ "either file is too short or header is larger than block size");
+ }
}
- num_cols_ = parser.num_cols();
- DCHECK_GT(num_cols_, 0);
- for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+ if (read_options_.column_names.empty()) {
+ // Read one row with column names
+ BlockParser parser(pool_, parse_options_, num_cols_, 1);
+ uint32_t parsed_size = 0;
+ RETURN_NOT_OK(parser.Parse(reinterpret_cast<const char*>(cur_data_),
+ static_cast<uint32_t>(cur_size_),
&parsed_size));
+ if (parser.num_rows() != 1) {
+ return Status::Invalid(
+ "Could not read column names from CSV file, either "
+ "file is too short or header is larger than block size");
+ }
+ if (parser.num_cols() == 0) {
+ return Status::Invalid("No columns in CSV file");
+ }
+ // Read column names from last header row
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) ->
Status {
- DCHECK_EQ(column_names_.size(), static_cast<uint32_t>(col_index));
column_names_.emplace_back(reinterpret_cast<const char*>(data), size);
return Status::OK();
};
- RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
+ RETURN_NOT_OK(parser.VisitLastRow(visit));
+ DCHECK_EQ(static_cast<size_t>(parser.num_cols()), column_names_.size());
+ // Skip parsed header row
+ cur_data_ += parsed_size;
+ cur_size_ -= parsed_size;
+ } else {
+ column_names_ = read_options_.column_names;
+ }
+
+ num_cols_ = static_cast<int32_t>(column_names_.size());
+ DCHECK_GT(num_cols_, 0);
+
+ // Construct column builders
+ for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
std::shared_ptr<ColumnBuilder> builder;
// Does the named column have a fixed type?
auto it = convert_options_.column_types.find(column_names_[col_index]);
@@ -186,9 +207,6 @@ class BaseTableReader : public csv::TableReader {
column_builders_.push_back(builder);
}
- // Skip parsed header rows
- cur_data_ += parsed_size;
- cur_size_ -= parsed_size;
return Status::OK();
}
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 067b830..93e9cb3 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -51,6 +51,12 @@ cdef class ReadOptions:
How many bytes to process at a time from the input stream.
This will determine multi-threading granularity as well as
the size of individual chunks in the Table.
+ skip_rows: int, optional (default 0)
+ The number of rows to skip at the start of the CSV data, not
+ including the row of column names (if any).
+ column_names: list, optional
+ The Table column names. If empty, column names will be
+ read from the first row after `skip_rows`.
"""
cdef:
CCSVReadOptions options
@@ -58,12 +64,17 @@ cdef class ReadOptions:
# Avoid mistakenly creating attributes
__slots__ = ()
- def __init__(self, use_threads=None, block_size=None):
+ def __init__(self, use_threads=None, block_size=None, skip_rows=None,
+ column_names=None):
self.options = CCSVReadOptions.Defaults()
if use_threads is not None:
self.use_threads = use_threads
if block_size is not None:
self.block_size = block_size
+ if skip_rows is not None:
+ self.skip_rows = skip_rows
+ if column_names is not None:
+ self.column_names = column_names
@property
def use_threads(self):
@@ -89,6 +100,32 @@ cdef class ReadOptions:
def block_size(self, value):
self.options.block_size = value
+ @property
+ def skip_rows(self):
+ """
+ The number of rows to skip at the start of the CSV data, not
+ including the row of column names (if any).
+ """
+ return self.options.skip_rows
+
+ @skip_rows.setter
+ def skip_rows(self, value):
+ self.options.skip_rows = value
+
+ @property
+ def column_names(self):
+ """
+ The Table column names. If empty, column names will be
+ read from the first row after `skip_rows`.
+ """
+ return [frombytes(s) for s in self.options.column_names]
+
+ @column_names.setter
+ def column_names(self, value):
+ self.options.column_names.clear()
+ for item in value:
+ self.options.column_names.push_back(tobytes(item))
+
cdef class ParseOptions:
"""
@@ -107,8 +144,6 @@ cdef class ParseOptions:
escape_char: 1-character string or False, optional (default False)
The character used optionally for escaping special characters
(False if escaping is not allowed).
- header_rows: int, optional (default 1)
- The number of rows to skip at the start of the CSV data.
newlines_in_values: bool, optional (default False)
Whether newline characters are allowed in CSV values.
Setting this to True reduces the performance of multi-threaded
@@ -124,7 +159,7 @@ cdef class ParseOptions:
__slots__ = ()
def __init__(self, delimiter=None, quote_char=None, double_quote=None,
- escape_char=None, header_rows=None, newlines_in_values=None,
+ escape_char=None, newlines_in_values=None,
ignore_empty_lines=None):
self.options = CCSVParseOptions.Defaults()
if delimiter is not None:
@@ -135,8 +170,6 @@ cdef class ParseOptions:
self.double_quote = double_quote
if escape_char is not None:
self.escape_char = escape_char
- if header_rows is not None:
- self.header_rows = header_rows
if newlines_in_values is not None:
self.newlines_in_values = newlines_in_values
if ignore_empty_lines is not None:
@@ -204,17 +237,6 @@ cdef class ParseOptions:
self.options.escaping = True
@property
- def header_rows(self):
- """
- The number of rows to skip at the start of the CSV data.
- """
- return self.options.header_rows
-
- @header_rows.setter
- def header_rows(self, value):
- self.options.header_rows = value
-
- @property
def newlines_in_values(self):
"""
Whether newline characters are allowed in CSV values.
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 282572e..3a5a26b 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1056,7 +1056,6 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv"
nogil:
c_bool double_quote
c_bool escaping
unsigned char escape_char
- int32_t header_rows
c_bool newlines_in_values
c_bool ignore_empty_lines
@@ -1077,6 +1076,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv"
nogil:
cdef cppclass CCSVReadOptions" arrow::csv::ReadOptions":
c_bool use_threads
int32_t block_size
+ int32_t skip_rows
+ vector[c_string] column_names
@staticmethod
CCSVReadOptions Defaults()
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 9f0c08b..a8d7998 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -72,9 +72,20 @@ def test_read_options():
opts.use_threads = False
assert opts.use_threads is False
- opts = cls(block_size=1234, use_threads=False)
+ assert opts.skip_rows == 0
+ opts.skip_rows = 3
+ assert opts.skip_rows == 3
+
+ assert opts.column_names == []
+ opts.column_names = ["ab", "cd"]
+ assert opts.column_names == ["ab", "cd"]
+
+ opts = cls(block_size=1234, use_threads=False, skip_rows=42,
+ column_names=["a", "b", "c"])
assert opts.block_size == 1234
assert opts.use_threads is False
+ assert opts.skip_rows == 42
+ assert opts.column_names == ["a", "b", "c"]
def test_parse_options():
@@ -84,7 +95,6 @@ def test_parse_options():
assert opts.quote_char == '"'
assert opts.double_quote is True
assert opts.escape_char is False
- assert opts.header_rows == 1
assert opts.newlines_in_values is False
assert opts.ignore_empty_lines is True
@@ -110,17 +120,13 @@ def test_parse_options():
opts.ignore_empty_lines = False
assert opts.ignore_empty_lines is False
- opts.header_rows = 2
- assert opts.header_rows == 2
-
opts = cls(delimiter=';', quote_char='%', double_quote=False,
- escape_char='\\', header_rows=2, newlines_in_values=True,
+ escape_char='\\', newlines_in_values=True,
ignore_empty_lines=False)
assert opts.delimiter == ';'
assert opts.quote_char == '%'
assert opts.double_quote is False
assert opts.escape_char == '\\'
- assert opts.header_rows == 2
assert opts.newlines_in_values is True
assert opts.ignore_empty_lines is False
@@ -214,6 +220,92 @@ class BaseTestCSVRead:
table = self.read_bytes(rows)
assert table.to_pydict() == expected_data
+ def test_header_skip_rows(self):
+ rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
+
+ opts = ReadOptions()
+ opts.skip_rows = 1
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["ef", "gh"])
+ assert table.to_pydict() == {
+ "ef": ["ij", "mn"],
+ "gh": ["kl", "op"],
+ }
+
+ opts.skip_rows = 3
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["mn", "op"])
+ assert table.to_pydict() == {
+ "mn": [],
+ "op": [],
+ }
+
+ opts.skip_rows = 4
+ with pytest.raises(pa.ArrowInvalid):
+ # Not enough rows
+ table = self.read_bytes(rows, read_options=opts)
+
+ # Can skip rows with a different number of columns
+ rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
+ opts.skip_rows = 2
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["ij", "kl"])
+ assert table.to_pydict() == {
+ "ij": ["mn"],
+ "kl": ["op"],
+ }
+
+ def test_header_column_names(self):
+ rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
+
+ opts = ReadOptions()
+ opts.column_names = ["x", "y"]
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": ["ab", "ef", "ij", "mn"],
+ "y": ["cd", "gh", "kl", "op"],
+ }
+
+ opts.skip_rows = 3
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": ["mn"],
+ "y": ["op"],
+ }
+
+ opts.skip_rows = 4
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": [],
+ "y": [],
+ }
+
+ opts.skip_rows = 5
+ with pytest.raises(pa.ArrowInvalid):
+ # Not enough rows
+ table = self.read_bytes(rows, read_options=opts)
+
+ # Unexpected number of columns
+ opts.skip_rows = 0
+ opts.column_names = ["x", "y", "z"]
+ with pytest.raises(pa.ArrowInvalid,
+ match="Expected 3 columns, got 2"):
+ table = self.read_bytes(rows, read_options=opts)
+
+ # Can skip rows with a different number of columns
+ rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
+ opts.skip_rows = 2
+ opts.column_names = ["x", "y"]
+ table = self.read_bytes(rows, read_options=opts)
+ self.check_names(table, ["x", "y"])
+ assert table.to_pydict() == {
+ "x": ["ij", "mn"],
+ "y": ["kl", "op"],
+ }
+
def test_simple_ints(self):
# Infer integer columns
rows = b"a,b,c\n1,2,3\n4,5,6\n"
diff --git a/r/R/csv.R b/r/R/csv.R
index 6ac0118..bf69830 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -42,16 +42,16 @@
#' characters? This is more general than `escape_double` as backslashes
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like `\\n`.
-# #' @param col_names If `TRUE`, the first row of the input will be used as the
-# #' column names and will not be included in the data frame. Note that `FALSE`
-# #' is not currently supported, nor is specifying a character vector of column
-# #' names.
+#' @param col_names If `TRUE`, the first row of the input will be used as the
+#' column names and will not be included in the data frame. (Note that `FALSE`
+#' is not currently supported.) Alternatively, you can specify a character
+#' vector of column names.
#' @param col_select A [tidy selection specification][tidyselect::vars_select]
#' of columns, as used in `dplyr::select()`.
#' @param skip_empty_rows Should blank rows be ignored altogether? If
#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#' filled with missings.
-# #' @param skip Number of lines to skip before reading data.
+#' @param skip Number of lines to skip before reading data.
#' @param parse_options see [csv_parse_options()]. If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
#' @param convert_options see [csv_convert_options()]
@@ -66,39 +66,41 @@ read_delim_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- # col_names = TRUE,
+ col_names = TRUE,
# col_types = TRUE,
col_select = NULL,
# na = c("", "NA"),
# quoted_na = TRUE,
skip_empty_rows = TRUE,
- # skip = 0L,
+ skip = 0L,
parse_options = NULL,
convert_options = NULL,
- read_options = csv_read_options(),
+ read_options = NULL,
as_tibble = TRUE) {
- # These are hardcoded pending
https://issues.apache.org/jira/browse/ARROW-5747
- col_names <- TRUE
- skip <- 0L
-
+ if (identical(col_names, FALSE)) {
+ stop("Not implemented", call.=FALSE)
+ }
if (is.null(parse_options)) {
- if (isTRUE(col_names)) {
- # Add one row to skip, to match arrow's header_rows
- skip <- skip + 1L
- # Note that with the hardcoding, header_rows is always 1, which
- # turns out to be the only value that works meaningfully
- }
parse_options <- readr_to_csv_parse_options(
delim,
quote,
escape_double,
escape_backslash,
- skip_empty_rows,
- skip
+ skip_empty_rows
)
}
+ if (is.null(read_options)) {
+ if (isTRUE(col_names)) {
+ # C++ default to parse is 0-length string array
+ col_names <- character(0)
+ }
+ read_options <- csv_read_options(
+ skip_rows = skip,
+ column_names = col_names
+ )
+ }
if (is.null(convert_options)) {
# TODO:
# * na strings (needs wiring in csv_convert_options)
@@ -117,10 +119,6 @@ read_delim_arrow <- function(file,
)
tab <- reader$Read()$select(!!enquo(col_select))
- if (is.character(col_names)) {
- # TODO: Rename `tab`'s columns
- # See https://github.com/apache/arrow/pull/4557
- }
if (isTRUE(as_tibble)) {
tab <- as.data.frame(tab)
@@ -135,16 +133,16 @@ read_csv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- # col_names = TRUE,
+ col_names = TRUE,
# col_types = TRUE,
col_select = NULL,
# na = c("", "NA"),
# quoted_na = TRUE,
skip_empty_rows = TRUE,
- # skip = 0L,
+ skip = 0L,
parse_options = NULL,
convert_options = NULL,
- read_options = csv_read_options(),
+ read_options = NULL,
as_tibble = TRUE) {
mc <- match.call()
@@ -159,16 +157,16 @@ read_tsv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- # col_names = TRUE,
+ col_names = TRUE,
# col_types = TRUE,
col_select = NULL,
# na = c("", "NA"),
# quoted_na = TRUE,
skip_empty_rows = TRUE,
- # skip = 0L,
+ skip = 0L,
parse_options = NULL,
convert_options = NULL,
- read_options = csv_read_options(),
+ read_options = NULL,
as_tibble = TRUE) {
mc <- match.call()
@@ -192,15 +190,25 @@ read_tsv_arrow <- function(file,
#' Read options for the Arrow file readers
#'
#' @param use_threads Whether to use the global CPU thread pool
-#' @param block_size Block size we request from the IO layer; also determines
the size of chunks when use_threads is `TRUE`. NB: if false, JSON input must
end with an empty line
+#' @param block_size Block size we request from the IO layer; also determines
+#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input
+#' must end with an empty line.
+#' @param skip_rows Number of lines to skip before reading data.
+#' @param column_names Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names.
#'
#' @export
csv_read_options <- function(use_threads = option_use_threads(),
- block_size = 1048576L) {
+ block_size = 1048576L,
+ skip_rows = 0L,
+ column_names = character(0)) {
shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize(
list(
use_threads = use_threads,
- block_size = block_size
+ block_size = block_size,
+ skip_rows = skip_rows,
+ column_names = column_names
)
))
}
@@ -209,8 +217,7 @@ readr_to_csv_parse_options <- function(delim = ",",
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
- skip_empty_rows = TRUE,
- skip = 0L) {
+ skip_empty_rows = TRUE) {
# This function translates from the readr argument list to the arrow arg
names
# TODO: validate inputs
csv_parse_options(
@@ -221,8 +228,7 @@ readr_to_csv_parse_options <- function(delim = ",",
escaping = escape_backslash,
escape_char = '\\',
newlines_in_values = escape_backslash,
- ignore_empty_lines = skip_empty_rows,
- header_rows = skip
+ ignore_empty_lines = skip_empty_rows
)
}
@@ -236,7 +242,6 @@ readr_to_csv_parse_options <- function(delim = ",",
#' @param escape_char Escaping character (if `escaping` is `TRUE`)
#' @param newlines_in_values Whether values are allowed to contain CR (`0x0d`)
and LF (`0x0a`) characters
#' @param ignore_empty_lines Whether empty lines are ignored. If `FALSE`, an
empty line represents
-#' @param header_rows Number of header rows to skip (including the first row
containing column names)
#'
#' @export
csv_parse_options <- function(delimiter = ",",
@@ -246,8 +251,7 @@ csv_parse_options <- function(delimiter = ",",
escaping = FALSE,
escape_char = '\\',
newlines_in_values = FALSE,
- ignore_empty_lines = TRUE,
- header_rows = 1L) {
+ ignore_empty_lines = TRUE) {
shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize(
list(
@@ -258,8 +262,7 @@ csv_parse_options <- function(delimiter = ",",
escaping = escaping,
escape_char = escape_char,
newlines_in_values = newlines_in_values,
- ignore_empty_lines = ignore_empty_lines,
- header_rows = header_rows
+ ignore_empty_lines = ignore_empty_lines
)
))
}
diff --git a/r/README.md b/r/README.md
index 43280f3..47458cf 100644
--- a/r/README.md
+++ b/r/README.md
@@ -48,14 +48,6 @@ library.
``` r
library(arrow)
-#>
-#> Attaching package: 'arrow'
-#> The following object is masked from 'package:utils':
-#>
-#> timestamp
-#> The following objects are masked from 'package:base':
-#>
-#> array, table
set.seed(24)
tab <- arrow::table(x = 1:10, y = rnorm(10))
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
index 17c5ba2..a46cfb3 100644
--- a/r/man/csv_parse_options.Rd
+++ b/r/man/csv_parse_options.Rd
@@ -8,7 +8,7 @@
csv_parse_options(delimiter = ",", quoting = TRUE,
quote_char = "\\"", double_quote = TRUE, escaping = FALSE,
escape_char = "\\\\", newlines_in_values = FALSE,
- ignore_empty_lines = TRUE, header_rows = 1L)
+ ignore_empty_lines = TRUE)
json_parse_options(newlines_in_values = FALSE)
}
@@ -28,8 +28,6 @@ json_parse_options(newlines_in_values = FALSE)
\item{newlines_in_values}{Whether values are allowed to contain CR
(\code{0x0d}) and LF (\code{0x0a}) characters}
\item{ignore_empty_lines}{Whether empty lines are ignored. If \code{FALSE},
an empty line represents a row of missing values.}
-
-\item{header_rows}{Number of header rows to skip (including the first row
containing column names)}
}
\description{
Parsing options for Arrow file readers
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
index ddfc9d1..38b6e47 100644
--- a/r/man/csv_read_options.Rd
+++ b/r/man/csv_read_options.Rd
@@ -6,14 +6,22 @@
\title{Read options for the Arrow file readers}
\usage{
csv_read_options(use_threads = option_use_threads(),
- block_size = 1048576L)
+ block_size = 1048576L, skip_rows = 0L, column_names = character(0))
json_read_options(use_threads = TRUE, block_size = 1048576L)
}
\arguments{
\item{use_threads}{Whether to use the global CPU thread pool}
-\item{block_size}{Block size we request from the IO layer; also determines the
size of chunks when use_threads is \code{TRUE}. NB: if false, JSON input must
end with an empty line}
+\item{block_size}{Block size we request from the IO layer; also determines
+the size of chunks when \code{use_threads} is \code{TRUE}. NB: if
+\code{use_threads} is \code{FALSE}, JSON input must end with an empty line.}
+
+\item{skip_rows}{Number of lines to skip before reading data.}
+
+\item{column_names}{Character vector to supply column names. If length-0
+(the default), the first non-skipped row will be parsed to generate column
+names.}
}
\description{
Read options for the Arrow file readers
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index ff732ae..0726889 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -7,22 +7,20 @@
\title{Read a CSV or other delimited file with Arrow}
\usage{
read_delim_arrow(file, delim = ",", quote = "\\"",
- escape_double = TRUE, escape_backslash = FALSE, col_select = NULL,
- skip_empty_rows = TRUE, parse_options = NULL,
- convert_options = NULL, read_options = csv_read_options(),
+ escape_double = TRUE, escape_backslash = FALSE, col_names = TRUE,
+ col_select = NULL, skip_empty_rows = TRUE, skip = 0L,
+ parse_options = NULL, convert_options = NULL, read_options = NULL,
as_tibble = TRUE)
read_csv_arrow(file, quote = "\\"", escape_double = TRUE,
- escape_backslash = FALSE, col_select = NULL,
- skip_empty_rows = TRUE, parse_options = NULL,
- convert_options = NULL, read_options = csv_read_options(),
- as_tibble = TRUE)
+ escape_backslash = FALSE, col_names = TRUE, col_select = NULL,
+ skip_empty_rows = TRUE, skip = 0L, parse_options = NULL,
+ convert_options = NULL, read_options = NULL, as_tibble = TRUE)
read_tsv_arrow(file, quote = "\\"", escape_double = TRUE,
- escape_backslash = FALSE, col_select = NULL,
- skip_empty_rows = TRUE, parse_options = NULL,
- convert_options = NULL, read_options = csv_read_options(),
- as_tibble = TRUE)
+ escape_backslash = FALSE, col_names = TRUE, col_select = NULL,
+ skip_empty_rows = TRUE, skip = 0L, parse_options = NULL,
+ convert_options = NULL, read_options = NULL, as_tibble = TRUE)
}
\arguments{
\item{file}{A character path to a local file, or an Arrow input stream}
@@ -40,6 +38,11 @@ characters? This is more general than \code{escape_double}
as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like \code{\\n}.}
+\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
+column names and will not be included in the data frame. (Note that
\code{FALSE}
+is not currently supported.) Alternatively, you can specify a character
+vector of column names.}
+
\item{col_select}{A \link[tidyselect:vars_select]{tidy selection specification}
of columns, as used in \code{dplyr::select()}.}
@@ -47,6 +50,8 @@ of columns, as used in \code{dplyr::select()}.}
\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they
will be
filled with missings.}
+\item{skip}{Number of lines to skip before reading data.}
+
\item{parse_options}{see
\code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides
any
parsing options provided in other arguments (e.g. \code{delim}, \code{quote},
etc.).}
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index bfcbae7..6165636 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -28,6 +28,8 @@ std::shared_ptr<arrow::csv::ReadOptions>
csv___ReadOptions__initialize(List_ opt
std::make_shared<arrow::csv::ReadOptions>(arrow::csv::ReadOptions::Defaults());
res->use_threads = options["use_threads"];
res->block_size = options["block_size"];
+ res->skip_rows = options["skip_rows"];
+ res->column_names =
Rcpp::as<std::vector<std::string>>(options["column_names"]);
return res;
}
@@ -43,7 +45,6 @@ std::shared_ptr<arrow::csv::ParseOptions>
csv___ParseOptions__initialize(List_ o
res->double_quote = options["double_quote"];
res->escape_char = get_char(options["escape_char"]);
res->newlines_in_values = options["newlines_in_values"];
- res->header_rows = options["header_rows"];
res->ignore_empty_lines = options["ignore_empty_lines"];
return res;
}
diff --git a/r/tests/testthat/test-arrow-csv.R
b/r/tests/testthat/test-arrow-csv.R
index aed9638..81e35b3 100644
--- a/r/tests/testthat/test-arrow-csv.R
+++ b/r/tests/testthat/test-arrow-csv.R
@@ -81,29 +81,39 @@ test_that("read_delim_arrow parsing options: quote", {
})
test_that("read_csv_arrow parsing options: col_names", {
- skip("Invalid: Empty CSV file")
tf <- tempfile()
on.exit(unlink(tf))
+ # Writing the CSV without the header
write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE)
- tab1 <- read_csv_arrow(tf, col_names = FALSE)
+
+ expect_error(read_csv_arrow(tf, col_names = FALSE), "Not implemented")
+
+ tab1 <- read_csv_arrow(tf, col_names = names(iris))
expect_identical(names(tab1), names(iris))
iris$Species <- as.character(iris$Species)
expect_equivalent(iris, tab1)
+
+ # This errors (correctly) because I haven't given enough names
+ # but the error message is "Invalid: Empty CSV file", which is not accurate
+ expect_error(
+ read_csv_arrow(tf, col_names = names(iris)[1])
+ )
+ # Same here
+ expect_error(
+ read_csv_arrow(tf, col_names = c(names(iris), names(iris)))
+ )
})
test_that("read_csv_arrow parsing options: skip", {
- skip("Invalid: Empty CSV file")
tf <- tempfile()
on.exit(unlink(tf))
+ # Adding two garbage lines to start the csv
cat("asdf\nqwer\n", file = tf)
suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append
= TRUE))
- # This works:
- # print(head(readr::read_csv(tf, skip = 2)))
- # This errors:
tab1 <- read_csv_arrow(tf, skip = 2)
expect_identical(names(tab1), names(iris))