This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 6c654988fe4576783c00ab3c6329e175231eda88 Author: Patrick Mezard <patr...@mezard.eu> AuthorDate: Tue Jun 9 17:00:16 2020 +0200 NUTCH-2790 indexer-csv: escape field leading quote character Before the change, the leading quote of a field value like '"value' would be left unescaped. --- .../java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java | 3 +-- .../org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java | 9 +++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java index 160d03d..99c0702 100644 --- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java +++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java @@ -405,13 +405,12 @@ public class CSVIndexWriter implements IndexWriter { if (max > maxFieldLength) { max = maxFieldLength; } - while (nextQuoteChar > 0 && nextQuoteChar < max) { + while (nextQuoteChar >= 0 && nextQuoteChar < max) { csvout.write(value.substring(start, nextQuoteChar).getBytes(encoding)); csvout.write(escapeCharacter.bytes); csvout.write(quoteCharacter.bytes); start = nextQuoteChar + 1; nextQuoteChar = quoteCharacter.find(value, start); - if (nextQuoteChar > max) break; } csvout.write(value.substring(start, max).getBytes(encoding)); } diff --git a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java index 761d042..5714cc2 100644 --- a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java +++ b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java @@ -159,6 +159,15 @@ public class TestCSVIndexWriter { } @Test + public void testCSVescapeLeadingQuotes() throws IOException { + String[] params = { CSVConstants.CSV_FIELDS, "test" }; + String[] fields = { "test", "\"quote\"" }; + String csv = getCSV(params, fields); + assertEquals("Leading quotes inside a quoted field must be escaped", + "\"\"\"quote\"\"\"", csv.trim()); + } + + @Test public void testCSVclipMaxLength() throws IOException { String[] params = { CSVConstants.CSV_FIELDS, "test", CSVConstants.CSV_MAXFIELDLENGTH, "8" };