n3world commented on a change in pull request #10321: URL: https://github.com/apache/arrow/pull/10321#discussion_r639783428
########## File path: python/pyarrow/tests/test_csv.py ########## @@ -52,18 +52,20 @@ def generate_col_names(): yield first + second -def make_random_csv(num_cols=2, num_rows=10, linesep='\r\n'): +def make_random_csv(num_cols=2, num_rows=10, linesep='\r\n', write_names=True): arr = np.random.RandomState(42).randint(0, 1000, size=(num_cols, num_rows)) - col_names = list(itertools.islice(generate_col_names(), num_cols)) csv = io.StringIO() - csv.write(",".join(col_names)) + if write_names: + col_names = list(itertools.islice(generate_col_names(), num_cols)) + csv.write(",".join(col_names)) csv.write(linesep) for row in arr.T: csv.write(",".join(map(str, row))) csv.write(linesep) csv = csv.getvalue().encode() columns = [pa.array(a, type=pa.int64()) for a in arr] - expected = pa.Table.from_arrays(columns, col_names) + expected = pa.Table.from_arrays( + columns, col_names) if write_names else None Review comment: If write_names is false then col_names is not set so the Table cannot be created. I'll change this to be a condition on col_names because that is a bit more obvious ########## File path: cpp/src/arrow/csv/parser.cc ########## @@ -35,14 +35,21 @@ using detail::ParsedValueDesc; namespace { -Status ParseError(const char* message) { - return Status::Invalid("CSV parse error: ", message); +template <typename... Args> +Status ParseError(Args&&... args) { + return Status::Invalid("CSV parse error: ", std::forward<Args>(args)...); } -Status MismatchingColumns(int32_t expected, int32_t actual) { - char s[50]; - snprintf(s, sizeof(s), "Expected %d columns, got %d", expected, actual); - return ParseError(s); +Status MismatchingColumns(int32_t expected, int32_t actual, int64_t row_num, + util::string_view row) { + if (row.length() > 100) { + row = row.substr(0, 96); Review comment: Sorry about that the ellipse was accidentally removed when I added the row_num to the output ########## File path: cpp/src/arrow/csv/parser.h ########## @@ -63,19 +63,24 @@ class ARROW_EXPORT DataBatch { uint32_t num_bytes() const { return parsed_size_; } template <typename Visitor> - Status VisitColumn(int32_t col_index, Visitor&& visit) const { + Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) const { using detail::ParsedValueDesc; + int64_t row = first_row; for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) { const auto& values_buffer = values_buffers_[buf_index]; const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data()); const auto max_pos = static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) - 1; - for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) { + for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++row) { auto start = values[pos].offset; auto stop = values[pos + 1].offset; auto quoted = values[pos + 1].quoted; - ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted)); + Status status = visit(parsed_ + start, stop - start, quoted); + if (ARROW_PREDICT_FALSE(first_row >= 0 && !status.ok())) { Review comment: ARROW_RETURN_NOT_OK adds the extra context when that is enabled so I think it would be better to keep that around or add a new macro which doesn't check status just adds the context and returns when it is enabled ########## File path: cpp/src/arrow/csv/reader.cc ########## @@ -319,11 +319,12 @@ class ReaderMixin { public: ReaderMixin(io::IOContext io_context, std::shared_ptr<io::InputStream> input, const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options) + const ConvertOptions& convert_options, bool count_rows) : io_context_(std::move(io_context)), read_options_(read_options), parse_options_(parse_options), convert_options_(convert_options), + num_rows_seen_(count_rows ? 1 : -1), Review comment: Just to reduce the number of member variables. Using a separate bool might be a bit clearer but it doesn't simplify the code any and you end up having one variable indicating if another variable is being used and using -1/<0 to indicate disabled is common enough I didn't think it obfuscated the intent too much. If you feel strongly there should be two member variables to track row count I can make that change. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org