[GitHub] [arrow] n3world commented on a change in pull request #10794: ARROW-13441: [C++][CSV] Skip empty batches in column decoder

GitBox Wed, 25 Aug 2021 06:05:21 -0700


n3world commented on a change in pull request #10794:
URL: https://github.com/apache/arrow/pull/10794#discussion_r695731681




##########
File path: python/pyarrow/tests/test_csv.py
##########
@@ -609,28 +489,184 @@ def test_skip_rows_after_names(self):
             assert (values[opts.skip_rows + opts.skip_rows_after_names:] ==
                     table_dict[name])
 
-    def test_header_column_names(self):
+    def test_row_number_offset_in_errors(self, use_threads):
+        # Row numbers are only correctly counted in serial reads
+        def format_msg(msg_format, row, *args):
+            if use_threads:
+                row_info = ""
+            else:
+                row_info = "Row #{}: ".format(row)
+            return msg_format.format(row_info, *args)
+
+        csv, _ = make_random_csv(4, 100, write_names=True)
+
+        read_options = ReadOptions()
+        read_options.block_size = len(csv) / 3
+        convert_options = ConvertOptions()
+        convert_options.column_types = {"a": pa.int32()}
+
+        # Test without skip_rows and column names in the csv
+        csv_bad_columns = csv + b"1,2\r\n"
+        message_columns = format_msg("{}Expected 4 columns, got 2", 102)
+        with pytest.raises(pa.ArrowInvalid, match=message_columns):
+            self.read_bytes(csv_bad_columns, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        csv_bad_type = csv + b"a,b,c,d\r\n"
+        message_value = format_msg(
+            "In CSV column #0: {}"
+            "CSV conversion error to int32: invalid value 'a'",
+            102, csv)
+        with pytest.raises(pa.ArrowInvalid, match=message_value):
+            self.read_bytes(csv_bad_type, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        long_row = (b"this is a long row" * 15) + b",3\r\n"
+        csv_bad_columns_long = csv + long_row
+        message_long = format_msg("{}Expected 4 columns, got 2: {} ...", 102,
+                                  long_row[0:96].decode("utf-8"))
+        with pytest.raises(pa.ArrowInvalid, match=message_long):
+            self.read_bytes(csv_bad_columns_long, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        # Test skipping rows after the names
+        read_options.skip_rows_after_names = 47
+
+        with pytest.raises(pa.ArrowInvalid, match=message_columns):
+            self.read_bytes(csv_bad_columns, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        with pytest.raises(pa.ArrowInvalid, match=message_value):
+            self.read_bytes(csv_bad_type, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        with pytest.raises(pa.ArrowInvalid, match=message_long):
+            self.read_bytes(csv_bad_columns_long, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        read_options.skip_rows_after_names = 0
+
+        # Test without skip_rows and column names not in the csv
+        csv, _ = make_random_csv(4, 100, write_names=False)
+        read_options.column_names = ["a", "b", "c", "d"]
+        csv_bad_columns = csv + b"1,2\r\n"
+        message_columns = format_msg("{}Expected 4 columns, got 2", 101)
+        with pytest.raises(pa.ArrowInvalid, match=message_columns):
+            self.read_bytes(csv_bad_columns, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        csv_bad_columns_long = csv + long_row
+        message_long = format_msg("{}Expected 4 columns, got 2: {} ...", 101,
+                                  long_row[0:96].decode("utf-8"))
+        with pytest.raises(pa.ArrowInvalid, match=message_long):
+            self.read_bytes(csv_bad_columns_long, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        csv_bad_type = csv + b"a,b,c,d\r\n"
+        message_value = format_msg(
+            "In CSV column #0: {}"
+            "CSV conversion error to int32: invalid value 'a'",
+            101)
+        message_value = message_value.format(len(csv))
+        with pytest.raises(pa.ArrowInvalid, match=message_value):
+            self.read_bytes(csv_bad_type, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        # Test with skip_rows and column names not in the csv
+        read_options.skip_rows = 23
+        with pytest.raises(pa.ArrowInvalid, match=message_columns):
+            self.read_bytes(csv_bad_columns, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+        with pytest.raises(pa.ArrowInvalid, match=message_value):
+            self.read_bytes(csv_bad_type, use_threads,
+                            read_options=read_options,
+                            convert_options=convert_options)
+
+
+@pytest.mark.parametrize('use_threads', [False, True])

Review comment:
       My goal was to unify the test structure of the table and stream tests so 
that tests can be written once and executed for both readers. I opted to go to 
the parameter approach because that was recently done for the streaming tests 
and assumed since that was more recent it was the more desired style. I can 
revert it back to the inheritance approach instead but that would mean I would 
want to undo the parameterization that @westonpace recently did on the 
streaming tests.
   
   It doesn't matter to me which approach is used but it makes it a lot easier 
if it is the same style for testing the different readers.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] n3world commented on a change in pull request #10794: ARROW-13441: [C++][CSV] Skip empty batches in column decoder

Reply via email to