This is an automated email from the ASF dual-hosted git repository. tustvold pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push: new 7deb35839 Include line and field number in CSV UTF-8 error (#3656) (#3657) 7deb35839 is described below commit 7deb35839d55afb77370a41e3395529ddf78bf59 Author: Raphael Taylor-Davies <1781103+tustv...@users.noreply.github.com> AuthorDate: Sat Feb 4 14:12:17 2023 +0000 Include line and field number in CSV UTF-8 error (#3656) (#3657) * Include line and field number in CSV UTF-8 error (#3656) * Additional test case --- arrow-csv/src/reader/mod.rs | 38 ++++++++++++++++++++++++++++++++++++++ arrow-csv/src/reader/records.rs | 16 +++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index cff1337dd..925f50449 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -2231,4 +2231,42 @@ mod tests { } } } + + fn err_test(csv: &[u8], expected: &str) { + let schema = Arc::new(Schema::new(vec![ + Field::new("text1", DataType::Utf8, false), + Field::new("text2", DataType::Utf8, false), + ])); + let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); + let b = ReaderBuilder::new() + .with_schema(schema) + .with_batch_size(2) + .build_buffered(buffer) + .unwrap(); + let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string(); + assert_eq!(err, expected) + } + + #[test] + fn test_invalid_utf8() { + err_test( + b"sdf,dsfg\ndfd,hgh\xFFue\n,sds\nFalhghse,", + "Csv error: Encountered invalid UTF-8 data for line 2 and field 2", + ); + + err_test( + b"sdf,dsfg\ndksdk,jf\nd\xFFfd,hghue\n,sds\nFalhghse,", + "Csv error: Encountered invalid UTF-8 data for line 3 and field 1", + ); + + err_test( + b"sdf,dsfg\ndksdk,jf\ndsdsfd,hghue\n,sds\nFalhghse,\xFF", + "Csv error: Encountered invalid UTF-8 data for line 5 and field 2", + ); + + err_test( + b"\xFFsdf,dsfg\ndksdk,jf\ndsdsfd,hghue\n,sds\nFalhghse,\xFF", + "Csv error: Encountered invalid UTF-8 data for line 1 and field 1", + ); + } } diff --git 
a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index c4da36ca4..a59d02e0e 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -193,7 +193,21 @@ impl RecordDecoder { // Need to truncate data to the actual amount of data read let data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| { - ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {e}")) + let valid_up_to = e.valid_up_to(); + + // We can't use binary search because of empty fields + let idx = self.offsets[..self.offsets_len] + .iter() + .rposition(|x| *x <= valid_up_to) + .unwrap(); + + let field = idx % self.num_columns + 1; + let line_offset = self.line_number - self.num_rows; + let line = line_offset + idx / self.num_columns; + + ArrowError::CsvError(format!( + "Encountered invalid UTF-8 data for line {line} and field {field}" + )) })?; let offsets = &self.offsets[..self.offsets_len];