This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

The following commit(s) were added to refs/heads/master by this push:
     new 8192902  ARROW-5791: [C++] Fix infinite loop with more than 32768 columns.
8192902 is described below

commit 8192902f00a7ab74ca93e6f2d9316fd32d18923f
Author: Micah Kornfield <emkornfi...@gmail.com>
AuthorDate: Mon Jul 1 13:42:14 2019 -0500

    ARROW-5791: [C++] Fix infinite loop with more than 32768 columns.

    But really 32768 columns should be enough for anyone :)

    Author: Micah Kornfield <emkornfi...@gmail.com>

    Closes #4762 from emkornfield/csv and squashes the following commits:

    ab0504c16 <Micah Kornfield> lower number of columns in test to satisfy ming
    8f53a8a58 <Micah Kornfield> remove test
    acfe2d894 <Micah Kornfield> remove cap, make min rows_in_chunk 512
    08ddc2238 <Micah Kornfield> remove floor duplication
    211472a12 <Micah Kornfield> powers of 2 are better
    b91a9e177 <Micah Kornfield> ARROW-5791: Fix infinite loop with more than 32768 columns. Cap max columns
---
 cpp/src/arrow/csv/parser-test.cc | 25 +++++++++++++++++++++++++
 cpp/src/arrow/csv/parser.cc      |  7 +++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/csv/parser-test.cc b/cpp/src/arrow/csv/parser-test.cc
index 3655230..d1790b2 100644
--- a/cpp/src/arrow/csv/parser-test.cc
+++ b/cpp/src/arrow/csv/parser-test.cc
@@ -439,6 +439,31 @@ TEST(BlockParser, Escaping) {
   }
 }
 
+// Generate test data with the given number of columns.
+std::string MakeLotsOfCsvColumns(int32_t num_columns) {
+  std::string values, header;
+  header.reserve(num_columns * 10);
+  values.reserve(num_columns * 10);
+  for (int x = 0; x < num_columns; x++) {
+    if (x != 0) {
+      header += ",";
+      values += ",";
+    }
+    header += "c" + std::to_string(x);
+    values += std::to_string(x);
+  }
+
+  header += "\n";
+  values += "\n";
+  return MakeCSVData({header, values});
+}
+
+TEST(BlockParser, LotsOfColumns) {
+  auto options = ParseOptions::Defaults();
+  BlockParser parser(options);
+  AssertParseOk(parser, MakeLotsOfCsvColumns(1024 * 100));
+}
+
 TEST(BlockParser, QuotedEscape) {
   auto options = ParseOptions::Defaults();
   options.escaping = true;
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index a7ca71c..89c3f4c 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -397,16 +397,19 @@ Status BlockParser::DoParseSpecialized(const char* start, uint32_t size, bool is
       return ParseError("Empty CSV file or block: cannot infer number of columns");
     }
   }
+
   while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) {
     // We know the number of columns, so can presize a values array for
     // a given number of rows
     DCHECK_GE(num_cols_, 0);
 
     int32_t rows_in_chunk;
+    constexpr int32_t kTargetChunkSize = 32768;
     if (num_cols_ > 0) {
-      rows_in_chunk = std::min(32768 / num_cols_, max_num_rows_ - num_rows_);
+      rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, 512),
+                               max_num_rows_ - num_rows_);
     } else {
-      rows_in_chunk = std::min(32768, max_num_rows_ - num_rows_);
+      rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_);
     }
 
     PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_);
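
Note on the bug the diff above addresses: with more than 32768 columns, the integer
division 32768 / num_cols_ evaluates to 0, so rows_in_chunk becomes 0, the parser
allocates a zero-row chunk, consumes nothing, and the outer while loop never advances.
The snippet below is not part of the commit; it is a minimal standalone sketch of the
before/after arithmetic, using the same constants as the diff but made-up helper names
(RowsInChunkBefore / RowsInChunkAfter) for illustration only.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Chunk sizing as it was before the fix: integer division truncates to 0
// once num_cols exceeds 32768, which starves the parse loop of progress.
int32_t RowsInChunkBefore(int32_t num_cols, int32_t max_num_rows, int32_t num_rows) {
  return std::min(32768 / num_cols, max_num_rows - num_rows);
}

// Chunk sizing as in the fix: clamp to at least 512 rows per chunk so the
// loop always makes forward progress, however wide the rows are.
int32_t RowsInChunkAfter(int32_t num_cols, int32_t max_num_rows, int32_t num_rows) {
  constexpr int32_t kTargetChunkSize = 32768;
  return std::min(std::max(kTargetChunkSize / num_cols, 512),
                  max_num_rows - num_rows);
}

int main() {
  const int32_t num_cols = 1024 * 100;  // same width as the new LotsOfColumns test
  std::cout << RowsInChunkBefore(num_cols, 1 << 30, 0) << "\n";  // prints 0
  std::cout << RowsInChunkAfter(num_cols, 1 << 30, 0) << "\n";   // prints 512
  return 0;
}

For the 1024 * 100 column case exercised by the new test, the old expression yields 0
while the fixed one yields 512, which is what turns the former hang into a normal parse.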