This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 7deb35839 Include line and field number in CSV UTF-8 error (#3656) (#3657)
7deb35839 is described below

commit 7deb35839d55afb77370a41e3395529ddf78bf59
Author: Raphael Taylor-Davies <1781103+tustv...@users.noreply.github.com>
AuthorDate: Sat Feb 4 14:12:17 2023 +0000

    Include line and field number in CSV UTF-8 error (#3656) (#3657)
    
    * Include line and field number in CSV UTF-8 error (#3656)
    
    * Additional test case
---
 arrow-csv/src/reader/mod.rs     | 38 ++++++++++++++++++++++++++++++++++++++
 arrow-csv/src/reader/records.rs | 16 +++++++++++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index cff1337dd..925f50449 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -2231,4 +2231,42 @@ mod tests {
             }
         }
     }
+
+    fn err_test(csv: &[u8], expected: &str) {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("text1", DataType::Utf8, false),
+            Field::new("text2", DataType::Utf8, false),
+        ]));
+        let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
+        let b = ReaderBuilder::new()
+            .with_schema(schema)
+            .with_batch_size(2)
+            .build_buffered(buffer)
+            .unwrap();
+        let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
+        assert_eq!(err, expected)
+    }
+
+    #[test]
+    fn test_invalid_utf8() {
+        err_test(
+            b"sdf,dsfg\ndfd,hgh\xFFue\n,sds\nFalhghse,",
+            "Csv error: Encountered invalid UTF-8 data for line 2 and field 2",
+        );
+
+        err_test(
+            b"sdf,dsfg\ndksdk,jf\nd\xFFfd,hghue\n,sds\nFalhghse,",
+            "Csv error: Encountered invalid UTF-8 data for line 3 and field 1",
+        );
+
+        err_test(
+            b"sdf,dsfg\ndksdk,jf\ndsdsfd,hghue\n,sds\nFalhghse,\xFF",
+            "Csv error: Encountered invalid UTF-8 data for line 5 and field 2",
+        );
+
+        err_test(
+            b"\xFFsdf,dsfg\ndksdk,jf\ndsdsfd,hghue\n,sds\nFalhghse,\xFF",
+            "Csv error: Encountered invalid UTF-8 data for line 1 and field 1",
+        );
+    }
 }
diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs
index c4da36ca4..a59d02e0e 100644
--- a/arrow-csv/src/reader/records.rs
+++ b/arrow-csv/src/reader/records.rs
@@ -193,7 +193,21 @@ impl RecordDecoder {
 
         // Need to truncate data to the actual amount of data read
         let data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| {
-            ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {e}"))
+            let valid_up_to = e.valid_up_to();
+
+            // We can't use binary search because of empty fields
+            let idx = self.offsets[..self.offsets_len]
+                .iter()
+                .rposition(|x| *x <= valid_up_to)
+                .unwrap();
+
+            let field = idx % self.num_columns + 1;
+            let line_offset = self.line_number - self.num_rows;
+            let line = line_offset + idx / self.num_columns;
+
+            ArrowError::CsvError(format!(
+                "Encountered invalid UTF-8 data for line {line} and field {field}"
+            ))
         })?;
 
         let offsets = &self.offsets[..self.offsets_len];

Reply via email to