n3world commented on a change in pull request #10202: URL: https://github.com/apache/arrow/pull/10202#discussion_r627044874
########## File path: cpp/src/arrow/csv/parser.cc ########## @@ -35,14 +35,13 @@ using detail::ParsedValueDesc; namespace { -Status ParseError(const char* message) { - return Status::Invalid("CSV parse error: ", message); +template <typename... Args> +Status ParseError(Args&&... args) { + return Status::Invalid("CSV parse error: ", std::forward<Args>(args)...); } -Status MismatchingColumns(int32_t expected, int32_t actual) { - char s[50]; - snprintf(s, sizeof(s), "Expected %d columns, got %d", expected, actual); - return ParseError(s); +Status MismatchingColumns(int32_t expected, int32_t actual, const std::string& row) { Review comment: Yes row index or line number which are not always the same because in theory new lines can be in a value would be nice but there didn't seem to be anything which tracked overall parser location at this level to be able to do that. That would also be good for the convert layer so that if a conversion error does occur not just column number but line/row number could be given too. ########## File path: cpp/src/arrow/csv/parser.cc ########## @@ -76,9 +76,45 @@ class PresizedDataWriter { parsed_[parsed_size_++] = static_cast<uint8_t>(c); } + // Push the value of a fully complete field. This should only be used to fill in missing + // values. This method can reallocate the buffer if there isn't enough extra space for + // the field. + Status PushField(const std::string& field) { + if (field.length() > extra_allocated_) { + // just in case this happens more allocate enough for 10x this amount + auto to_allocate = static_cast<uint32_t>( + std::max(field.length() * 10, static_cast<std::string::size_type>(128))); + int64_t new_capacity = parsed_capacity_ + to_allocate; + RETURN_NOT_OK(parsed_buffer_->Resize(new_capacity)); + parsed_ = parsed_buffer_->mutable_data(); + parsed_capacity_ = new_capacity; + extra_allocated_ += to_allocate; + } Review comment: doh of course I do. Thanks ########## File path: cpp/src/arrow/csv/parser_test.cc ########## @@ -555,6 +555,123 @@ TEST(BlockParser, MismatchingNumColumns) { } } +TEST(BlockParser, MismatchingNumColumnsSkip) { + ParseOptions opts = ParseOptions::Defaults(); + opts.invalid_row_handler = InvalidRowHandlers::Skip(); + { + BlockParser parser(opts); + ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\nd,e\n")); + ASSERT_EQ(2, parser.num_rows()); + ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false})); + } + { + BlockParser parser(opts, 2 /* num_cols */); + ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\nb,c\n")); + ASSERT_EQ(1, parser.num_rows()); + ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"b", "c"}, {false, false})); + } + { + BlockParser parser(opts, 2 /* num_cols */); + ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b,c\nd,e\n")); + ASSERT_EQ(1, parser.num_rows()); + ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"d", "e"}, {false, false})); + } +} + +TEST(BlockParser, MismatchingNumColumnsAddNulls) { + ParseOptions opts = ParseOptions::Defaults(); + opts.invalid_row_handler = InvalidRowHandlers::AddNulls("NULL"); + { + BlockParser parser(opts); + ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a,b\nc\n")); + ASSERT_EQ(2, parser.num_rows()); + ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"c", "NULL"}, {false, false})); + } + { + BlockParser parser(opts, 2 /* num_cols */); + ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, "a\n")); + ASSERT_EQ(1, parser.num_rows()); + ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"a", "NULL"}, {false, false})); + } + { + uint32_t out_size; + BlockParser parser(opts, 2 /* num_cols */); + Status st = Parse(parser, "a,b,c\nd,e\n", &out_size); + ASSERT_RAISES(Invalid, st); + } +} + +TEST(BlockParser, MismatchingNumColumnsForce) { + ParseOptions opts = ParseOptions::Defaults(); + opts.invalid_row_handler = InvalidRowHandlers::Force(); Review comment: I didn't do that because the csv reader only allowed you to visit either a column or the last row which made it very awkward to validate an arbitrary row but maybe I should just validate it by column since they are small enough ########## File path: cpp/src/arrow/csv/parser.cc ########## @@ -76,9 +76,45 @@ class PresizedDataWriter { parsed_[parsed_size_++] = static_cast<uint8_t>(c); } + // Push the value of a fully complete field. This should only be used to fill in missing + // values. This method can reallocate the buffer if there isn't enough extra space for + // the field. + Status PushField(const std::string& field) { + if (field.length() > extra_allocated_) { + // just in case this happens more allocate enough for 10x this amount + auto to_allocate = static_cast<uint32_t>( + std::max(field.length() * 10, static_cast<std::string::size_type>(128))); + int64_t new_capacity = parsed_capacity_ + to_allocate; + RETURN_NOT_OK(parsed_buffer_->Resize(new_capacity)); + parsed_ = parsed_buffer_->mutable_data(); + parsed_capacity_ = new_capacity; + extra_allocated_ += to_allocate; Review comment: yup that was dumb -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org