n3world commented on a change in pull request #10202:
URL: https://github.com/apache/arrow/pull/10202#discussion_r627044874



##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -35,14 +35,13 @@ using detail::ParsedValueDesc;
 
 namespace {
 
-Status ParseError(const char* message) {
-  return Status::Invalid("CSV parse error: ", message);
+template <typename... Args>
+Status ParseError(Args&&... args) {
+  return Status::Invalid("CSV parse error: ", std::forward<Args>(args)...);
 }
 
-Status MismatchingColumns(int32_t expected, int32_t actual) {
-  char s[50];
-  snprintf(s, sizeof(s), "Expected %d columns, got %d", expected, actual);
-  return ParseError(s);
+Status MismatchingColumns(int32_t expected, int32_t actual, const std::string& 
row) {

Review comment:
       Yes row index or line number which are not always the same because in 
theory new lines can be in a value would be nice but there didn't seem to be 
anything which tracked overall parser location at this level to be able to do 
that. That would also be good for the convert layer so that if a conversion 
error does occur not just column number but line/row number could be given too.

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -76,9 +76,45 @@ class PresizedDataWriter {
     parsed_[parsed_size_++] = static_cast<uint8_t>(c);
   }
 
+  // Push the value of a fully complete field. This should only be used to 
fill in missing
+  // values. This method can reallocate the buffer if there isn't enough extra 
space for
+  // the field.
+  Status PushField(const std::string& field) {
+    if (field.length() > extra_allocated_) {
+      // just in case this happens more allocate enough for 10x this amount
+      auto to_allocate = static_cast<uint32_t>(
+          std::max(field.length() * 10, 
static_cast<std::string::size_type>(128)));
+      int64_t new_capacity = parsed_capacity_ + to_allocate;
+      RETURN_NOT_OK(parsed_buffer_->Resize(new_capacity));
+      parsed_ = parsed_buffer_->mutable_data();
+      parsed_capacity_ = new_capacity;
+      extra_allocated_ += to_allocate;
+    }

Review comment:
       doh of course I do. Thanks

##########
File path: cpp/src/arrow/csv/parser_test.cc
##########
@@ -555,6 +555,123 @@ TEST(BlockParser, MismatchingNumColumns) {
   }
 }
 
+TEST(BlockParser, MismatchingNumColumnsSkip) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::Skip();
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\nd,e\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, 
false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\nb,c\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"b", "c"}, {false, 
false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b,c\nd,e\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, 
false}));
+  }
+}
+
+TEST(BlockParser, MismatchingNumColumnsAddNulls) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::AddNulls("NULL");
+  {
+    BlockParser parser(opts);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\n"));
+    ASSERT_EQ(2, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"c", "NULL"}, {false, 
false}));
+  }
+  {
+    BlockParser parser(opts, 2 /* num_cols */);
+    ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\n"));
+    ASSERT_EQ(1, parser.num_rows());
+    ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"a", "NULL"}, {false, 
false}));
+  }
+  {
+    uint32_t out_size;
+    BlockParser parser(opts, 2 /* num_cols */);
+    Status st = Parse(parser, "a,b,c\nd,e\n", &out_size);
+    ASSERT_RAISES(Invalid, st);
+  }
+}
+
+TEST(BlockParser, MismatchingNumColumnsForce) {
+  ParseOptions opts = ParseOptions::Defaults();
+  opts.invalid_row_handler = InvalidRowHandlers::Force();

Review comment:
       I didn't do that because the csv reader only allowed you to visit either 
a column or the last row which made it very awkward to validate an arbitrary 
row but maybe I should just validate it by column since they are small enough

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -76,9 +76,45 @@ class PresizedDataWriter {
     parsed_[parsed_size_++] = static_cast<uint8_t>(c);
   }
 
+  // Push the value of a fully complete field. This should only be used to 
fill in missing
+  // values. This method can reallocate the buffer if there isn't enough extra 
space for
+  // the field.
+  Status PushField(const std::string& field) {
+    if (field.length() > extra_allocated_) {
+      // just in case this happens more allocate enough for 10x this amount
+      auto to_allocate = static_cast<uint32_t>(
+          std::max(field.length() * 10, 
static_cast<std::string::size_type>(128)));
+      int64_t new_capacity = parsed_capacity_ + to_allocate;
+      RETURN_NOT_OK(parsed_buffer_->Resize(new_capacity));
+      parsed_ = parsed_buffer_->mutable_data();
+      parsed_capacity_ = new_capacity;
+      extra_allocated_ += to_allocate;

Review comment:
       yup that was dumb




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to