n3world commented on a change in pull request #10794: URL: https://github.com/apache/arrow/pull/10794#discussion_r695731681
########## File path: python/pyarrow/tests/test_csv.py ########## @@ -609,28 +489,184 @@ def test_skip_rows_after_names(self): assert (values[opts.skip_rows + opts.skip_rows_after_names:] == table_dict[name]) - def test_header_column_names(self): + def test_row_number_offset_in_errors(self, use_threads): + # Row numbers are only correctly counted in serial reads + def format_msg(msg_format, row, *args): + if use_threads: + row_info = "" + else: + row_info = "Row #{}: ".format(row) + return msg_format.format(row_info, *args) + + csv, _ = make_random_csv(4, 100, write_names=True) + + read_options = ReadOptions() + read_options.block_size = len(csv) / 3 + convert_options = ConvertOptions() + convert_options.column_types = {"a": pa.int32()} + + # Test without skip_rows and column names in the csv + csv_bad_columns = csv + b"1,2\r\n" + message_columns = format_msg("{}Expected 4 columns, got 2", 102) + with pytest.raises(pa.ArrowInvalid, match=message_columns): + self.read_bytes(csv_bad_columns, use_threads, + read_options=read_options, + convert_options=convert_options) + + csv_bad_type = csv + b"a,b,c,d\r\n" + message_value = format_msg( + "In CSV column #0: {}" + "CSV conversion error to int32: invalid value 'a'", + 102, csv) + with pytest.raises(pa.ArrowInvalid, match=message_value): + self.read_bytes(csv_bad_type, use_threads, + read_options=read_options, + convert_options=convert_options) + + long_row = (b"this is a long row" * 15) + b",3\r\n" + csv_bad_columns_long = csv + long_row + message_long = format_msg("{}Expected 4 columns, got 2: {} ...", 102, + long_row[0:96].decode("utf-8")) + with pytest.raises(pa.ArrowInvalid, match=message_long): + self.read_bytes(csv_bad_columns_long, use_threads, + read_options=read_options, + convert_options=convert_options) + + # Test skipping rows after the names + read_options.skip_rows_after_names = 47 + + with pytest.raises(pa.ArrowInvalid, match=message_columns): + self.read_bytes(csv_bad_columns, use_threads, + read_options=read_options, + convert_options=convert_options) + + with pytest.raises(pa.ArrowInvalid, match=message_value): + self.read_bytes(csv_bad_type, use_threads, + read_options=read_options, + convert_options=convert_options) + + with pytest.raises(pa.ArrowInvalid, match=message_long): + self.read_bytes(csv_bad_columns_long, use_threads, + read_options=read_options, + convert_options=convert_options) + + read_options.skip_rows_after_names = 0 + + # Test without skip_rows and column names not in the csv + csv, _ = make_random_csv(4, 100, write_names=False) + read_options.column_names = ["a", "b", "c", "d"] + csv_bad_columns = csv + b"1,2\r\n" + message_columns = format_msg("{}Expected 4 columns, got 2", 101) + with pytest.raises(pa.ArrowInvalid, match=message_columns): + self.read_bytes(csv_bad_columns, use_threads, + read_options=read_options, + convert_options=convert_options) + + csv_bad_columns_long = csv + long_row + message_long = format_msg("{}Expected 4 columns, got 2: {} ...", 101, + long_row[0:96].decode("utf-8")) + with pytest.raises(pa.ArrowInvalid, match=message_long): + self.read_bytes(csv_bad_columns_long, use_threads, + read_options=read_options, + convert_options=convert_options) + + csv_bad_type = csv + b"a,b,c,d\r\n" + message_value = format_msg( + "In CSV column #0: {}" + "CSV conversion error to int32: invalid value 'a'", + 101) + message_value = message_value.format(len(csv)) + with pytest.raises(pa.ArrowInvalid, match=message_value): + self.read_bytes(csv_bad_type, use_threads, + read_options=read_options, + convert_options=convert_options) + + # Test with skip_rows and column names not in the csv + read_options.skip_rows = 23 + with pytest.raises(pa.ArrowInvalid, match=message_columns): + self.read_bytes(csv_bad_columns, use_threads, + read_options=read_options, + convert_options=convert_options) + + with pytest.raises(pa.ArrowInvalid, match=message_value): + self.read_bytes(csv_bad_type, use_threads, + read_options=read_options, + convert_options=convert_options) + + +@pytest.mark.parametrize('use_threads', [False, True]) Review comment: My goal was to unify the test structure of the table and stream tests so that tests can be written once and executed for both readers. I opted to go to the parameter approach because that was recently done for the streaming tests and assumed since that was more recent it was the more desired style. I can revert it back to the inheritance approach instead but that would mean I would want to undo the parameterization that @westonpace recently did on the streaming tests. It doesn't matter to me which approach is used but it makes it a lot easier if it is the same style for testing the different readers. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org