This is an automated email from the ASF dual-hosted git repository.
ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-csv.git
The following commit(s) were added to refs/heads/master by this push:
new ac280e7 [CSV-93] Allow the handling of NULL values.
ac280e7 is described below
commit ac280e705d99cc1168f93f7d9a5f92618bebbb97
Author: Gary Gregory <[email protected]>
AuthorDate: Thu Jul 8 12:36:23 2021 -0400
[CSV-93] Allow the handling of NULL values.
- [CSV-253] Handle absent values in input (null).
- Cleaned up version of PR 77 from dota17 where:
- Don't duplicate two state items from the format.
- Use try-with-resources.
- Remove useless parens.
- Update Javaodc.
- Sort members in the new tests.
- Use builder.
---
.../java/org/apache/commons/csv/CSVParser.java | 37 ++++-
src/main/java/org/apache/commons/csv/Lexer.java | 1 +
.../java/org/apache/commons/csv/QuoteMode.java | 2 +-
src/main/java/org/apache/commons/csv/Token.java | 3 +
.../org/apache/commons/csv/CSVPrinterTest.java | 2 +-
.../apache/commons/csv/issues/JiraCsv253Test.java | 54 ++++++++
.../apache/commons/csv/issues/JiraCsv93Test.java | 150 +++++++++++++++++++++
7 files changed, 243 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java
b/src/main/java/org/apache/commons/csv/CSVParser.java
index 3224013..94bb361 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -446,8 +446,7 @@ public final class CSVParser implements
Iterable<CSVRecord>, Closeable {
if (lastRecord && inputClean.isEmpty() &&
this.format.getTrailingDelimiter()) {
return;
}
- final String nullString = this.format.getNullString();
- this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
+ this.recordList.add(handleNull(inputClean));
}
/**
@@ -636,7 +635,26 @@ public final class CSVParser implements
Iterable<CSVRecord>, Closeable {
}
/**
- * Gets whether this parser is closed.
+ * Handle whether input is parsed as null
+ *
+ * @param input
+ * the cell data to further processed
+ * @return null if input is parsed as null, or input itself if input isn't
parsed as null
+ */
+ private String handleNull(final String input) {
+ final boolean isQuoted = this.reusableToken.isQuoted;
+ final String nullString = format.getNullString();
+ final boolean strictQuoteMode = isStrictQuoteMode();
+ if (input.equals(nullString)) {
+ // nullString = NULL(String), distinguish between "NULL" and NULL
in ALL_NON_NULL or NON_NUMERIC quote mode
+ return strictQuoteMode && isQuoted ? input : null;
+ }
+ // don't set nullString, distinguish between "" and ,, (absent values)
in All_NON_NULL or NON_NUMERIC quote mode
+ return strictQuoteMode && nullString == null && input.isEmpty() &&
!isQuoted ? null : input;
+ }
+
+ /**
+ * Tests whether this parser is closed.
*
* @return whether this parser is closed.
*/
@@ -645,7 +663,18 @@ public final class CSVParser implements
Iterable<CSVRecord>, Closeable {
}
/**
- * Returns an iterator on the records.
+ * Tests whether the format's {@link QuoteMode} is {@link
QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
+ *
+ * @return true if the format's {@link QuoteMode} is {@link
QuoteMode#ALL_NON_NULL} or
+ * {@link QuoteMode#NON_NUMERIC}.
+ */
+ private boolean isStrictQuoteMode() {
+ return this.format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
+ this.format.getQuoteMode() == QuoteMode.NON_NUMERIC;
+ }
+
+ /**
+ * Returns the record iterator.
*
* <p>
* An {@link IOException} caught during the iteration are re-thrown as an
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java
b/src/main/java/org/apache/commons/csv/Lexer.java
index a0c75aa..1f14543 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -324,6 +324,7 @@ final class Lexer implements Closeable {
* on invalid state: EOF before closing encapsulator or
invalid character before delimiter or EOL
*/
private Token parseEncapsulatedToken(final Token token) throws IOException
{
+ token.isQuoted = true;
// save current line number in case needed for IOE
final long startLineNumber = getCurrentLineNumber();
int c;
diff --git a/src/main/java/org/apache/commons/csv/QuoteMode.java
b/src/main/java/org/apache/commons/csv/QuoteMode.java
index 272deb7..a9b33a1 100644
--- a/src/main/java/org/apache/commons/csv/QuoteMode.java
+++ b/src/main/java/org/apache/commons/csv/QuoteMode.java
@@ -17,7 +17,7 @@
package org.apache.commons.csv;
/**
- * Defines quoting behavior when printing.
+ * Defines quoting behavior.
*/
public enum QuoteMode {
diff --git a/src/main/java/org/apache/commons/csv/Token.java
b/src/main/java/org/apache/commons/csv/Token.java
index dff7d01..2dedc58 100644
--- a/src/main/java/org/apache/commons/csv/Token.java
+++ b/src/main/java/org/apache/commons/csv/Token.java
@@ -55,10 +55,13 @@ final class Token {
/** Token ready flag: indicates a valid token with content (ready for the
parser). */
boolean isReady;
+ boolean isQuoted;
+
void reset() {
content.setLength(0);
type = INVALID;
isReady = false;
+ isQuoted = false;
}
/**
diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
index 780a502..6890bc2 100644
--- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java
@@ -873,7 +873,7 @@ public class CSVPrinterTest {
String expected = "\"NULL\"\tNULL\n";
assertEquals(expected, writer.toString());
String[] record0 = toFirstRecordValues(expected, format);
- assertArrayEquals(new Object[2], record0);
+ assertArrayEquals(s, record0);
s = new String[] { "\\N", null };
format = CSVFormat.MYSQL.withNullString("\\N");
diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv253Test.java
b/src/test/java/org/apache/commons/csv/issues/JiraCsv253Test.java
new file mode 100644
index 0000000..9050731
--- /dev/null
+++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv253Test.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.csv.issues;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Iterator;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.commons.csv.QuoteMode;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Setting QuoteMode:ALL_NON_NULL or NON_NUMERIC can distinguish between empty
string columns and absent value columns.
+ */
+public class JiraCsv253Test {
+
+ private void assertArrayEqual(final String[] expected, final CSVRecord
actual) {
+ for (int i = 0; i < expected.length; i++) {
+ assertEquals(expected[i], actual.get(i));
+ }
+ }
+
+ @Test
+ public void testHandleAbsentValues() throws IOException {
+ final String source = "\"John\",,\"Doe\"\n" + ",\"AA\",123\n" +
"\"John\",90,\n" + "\"\",,90";
+ final CSVFormat csvFormat =
CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.NON_NUMERIC).build();
+ try (final CSVParser parser = csvFormat.parse(new
StringReader(source))) {
+ final Iterator<CSVRecord> csvRecords = parser.iterator();
+ assertArrayEqual(new String[] {"John", null, "Doe"},
csvRecords.next());
+ assertArrayEqual(new String[] {null, "AA", "123"},
csvRecords.next());
+ assertArrayEqual(new String[] {"John", "90", null},
csvRecords.next());
+ assertArrayEqual(new String[] {"", null, "90"}, csvRecords.next());
+ }
+ }
+}
diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv93Test.java
b/src/test/java/org/apache/commons/csv/issues/JiraCsv93Test.java
new file mode 100644
index 0000000..e122348
--- /dev/null
+++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv93Test.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.csv.issues;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.commons.csv.QuoteMode;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Add more tests about null value.
+ * <p>
+ * QuoteMode:ALL_NON_NULL (Quotes all non-null fields, null will not be quoted
but not null will be quoted). when
+ * withNullString("NULL"), NULL String value ("NULL") and null value (null)
will be formatted as '"NULL",NULL'. So it
+ * also should be parsed as NULL String value and null value (["NULL", null]),
It should be distinguish in parsing. And
+ * when don't set nullString in CSVFormat, String '"",' should be parsed as ""
and null (["", null]) according to null
+ * will not be quoted but not null will be quoted. QuoteMode:NON_NUMERIC, same
as ALL_NON_NULL.
+ * </p>
+ * <p>
+ * This can solve the problem of distinguishing between empty string columns
and absent value columns which just like
+ * Jira CSV-253 to a certain extent.
+ * </p>
+ */
+public class JiraCsv93Test {
+ private static Object[] objects1 = {"abc", "", null, "a,b,c", 123};
+
+ private static Object[] objects2 = {"abc", "NULL", null, "a,b,c", 123};
+
+ private void every(final CSVFormat csvFormat, final Object[] objects,
final String format, final String[] data)
+ throws IOException {
+ final String source = csvFormat.format(objects);
+ assertEquals(format, csvFormat.format(objects));
+ try (final CSVParser csvParser = csvFormat.parse(new
StringReader(source))) {
+ final CSVRecord csvRecord = csvParser.iterator().next();
+ for (int i = 0; i < data.length; i++) {
+ assertEquals(csvRecord.get(i), data[i]);
+ }
+ }
+ }
+
+ @Test
+ public void testWithNotSetNullString() throws IOException {
+ // @formatter:off
+ every(CSVFormat.DEFAULT,
+ objects1,
+ "abc,,,\"a,b,c\",123",
+ new String[]{"abc", "", "", "a,b,c", "123"});
+ every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.ALL).build(),
+ objects1,
+ "\"abc\",\"\",,\"a,b,c\",\"123\"",
+ new String[]{"abc", "", "", "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.ALL_NON_NULL).build(),
+ objects1,
+ "\"abc\",\"\",,\"a,b,c\",\"123\"",
+ new String[]{"abc", "", null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.MINIMAL).build(),
+ objects1,
+ "abc,,,\"a,b,c\",123",
+ new String[]{"abc", "", "", "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setEscape('?').setQuoteMode(QuoteMode.NONE).build(),
+ objects1,
+ "abc,,,a?,b?,c,123",
+ new String[]{"abc", "", "", "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.NON_NUMERIC).build(),
+ objects1,
+ "\"abc\",\"\",,\"a,b,c\",123",
+ new String[]{"abc", "", null, "a,b,c", "123"});
+ // @formatter:on
+ }
+
+ @Test
+ public void testWithSetNullStringEmptyString() throws IOException {
+ // @formatter:off
+ every(CSVFormat.DEFAULT.builder().setNullString("").build(),
+ objects1,
+ "abc,,,\"a,b,c\",123",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.ALL).build(),
+ objects1,
+ "\"abc\",\"\",\"\",\"a,b,c\",\"123\"",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.ALL_NON_NULL).build(),
+ objects1,
+ "\"abc\",\"\",,\"a,b,c\",\"123\"",
+ new String[]{"abc", "", null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.MINIMAL).build(),
+ objects1,
+ "abc,,,\"a,b,c\",123",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("").setEscape('?').setQuoteMode(QuoteMode.NONE).build(),
+ objects1,
+ "abc,,,a?,b?,c,123",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.NON_NUMERIC).build(),
+ objects1,
+ "\"abc\",\"\",,\"a,b,c\",123",
+ new String[]{"abc", "", null, "a,b,c", "123"});
+ // @formatter:on
+ }
+
+ @Test
+ public void testWithSetNullStringNULL() throws IOException {
+ // @formatter:off
+ every(CSVFormat.DEFAULT.builder().setNullString("NULL").build(),
+ objects2,
+ "abc,NULL,NULL,\"a,b,c\",123",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.ALL).build(),
+ objects2,
+ "\"abc\",\"NULL\",\"NULL\",\"a,b,c\",\"123\"",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.ALL_NON_NULL).build(),
+ objects2,
+ "\"abc\",\"NULL\",NULL,\"a,b,c\",\"123\"",
+ new String[]{"abc", "NULL", null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.MINIMAL).build(),
+ objects2,
+ "abc,NULL,NULL,\"a,b,c\",123",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setEscape('?').setQuoteMode(QuoteMode.NONE).build(),
+ objects2,
+ "abc,NULL,NULL,a?,b?,c,123",
+ new String[]{"abc", null, null, "a,b,c", "123"});
+
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.NON_NUMERIC).build(),
+ objects2,
+ "\"abc\",\"NULL\",NULL,\"a,b,c\",123",
+ new String[]{"abc", "NULL", null, "a,b,c", "123"});
+ // @formatter:on
+ }
+}
\ No newline at end of file