2013/5/3 <s...@apache.org> > Author: sebb > Date: Fri May 3 01:10:34 2013 > New Revision: 1478621 > > URL: http://svn.apache.org/r1478621 > Log: > CSV-58 Unescape handling needs rethinking > Fixed up most issues. > TODO should TAB, FF and BACKSPACE be un/escaped? >
Thanks for taking care of this sebb. > > Modified: > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java > > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > > Modified: > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java?rev=1478621&r1=1478620&r2=1478621&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > (original) > +++ > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java > Fri May 3 01:10:34 2013 > @@ -160,7 +160,12 @@ final class CSVLexer extends Lexer { > tkn.type = TOKEN; > break; > } else if (isEscape(c)) { > - tkn.content.append((char) readEscape()); > + final int unescaped = readEscape(); > + if (unescaped == Constants.END_OF_STREAM) { // unexpected > char after escape > + tkn.content.append((char) c).append((char) > in.getLastChar()); > + } else { > + tkn.content.append((char) unescaped); > + } > c = in.read(); // continue > } else { > tkn.content.append((char) c); > @@ -203,7 +208,12 @@ final class CSVLexer extends Lexer { > c = in.read(); > > if (isEscape(c)) { > - tkn.content.append((char) readEscape()); > + final int unescaped = readEscape(); > + if (unescaped == Constants.END_OF_STREAM) { // unexpected > char after escape > + tkn.content.append((char) c).append((char) > in.getLastChar()); > + } else { > + tkn.content.append((char) unescaped); > + } > } else if (isQuoteChar(c)) { > if (isQuoteChar(in.lookAhead())) { > // double or escaped encapsulator -> add single > encapsulator to token > > Modified: > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java?rev=1478621&r1=1478620&r2=1478621&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java > (original) > +++ > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java > Fri May 3 01:10:34 2013 > @@ -74,8 +74,18 @@ abstract class Lexer { > } > > // TODO escape handling needs more work > + /** > + * Handle an escape sequence. > + * The current character must be the escape character. > + * On return, the next character is available by calling {@link > ExtendedBufferedReader#getLastChar()} > + * on the input stream. > + * > + * @return the unescaped character (as an int) or {@link > END_OF_STREAM} if char following the escape is invalid. > + * @throws IOException if there is a problem reading the stream or > the end of stream is detected: > + * the escape character is not allowed at end of strem > + */ > int readEscape() throws IOException { > - // assume c is the escape char (normally a backslash) > + // the escape char has just been read (normally a backslash) > final int c = in.read(); > switch (c) { > case 'r': > @@ -88,10 +98,21 @@ abstract class Lexer { > return BACKSPACE; > case 'f': > return FF; > + case CR: > + case LF: > + case FF: // TODO is this correct? > + case TAB: // TODO is this correct? Do tabs need to be escaped? > + case BACKSPACE: // TODO is this correct? > + return c; > case END_OF_STREAM: > throw new IOException("EOF whilst processing escape > sequence"); > default: > - return c; > + // Now check for meta-characters > + if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || > isCommentStart(c)) { > + return c; > + } > + // indicate unexpected char - available from in.getLastChar() > + return END_OF_STREAM; > } > } > > > Modified: > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java?rev=1478621&r1=1478620&r2=1478621&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > (original) > +++ > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java > Fri May 3 01:10:34 2013 > @@ -36,7 +36,6 @@ import java.io.IOException; > import java.io.StringReader; > > import org.junit.Before; > -import org.junit.Ignore; > import org.junit.Test; > > /** > @@ -311,48 +310,42 @@ public class CSVLexerTest { > assertThat(lexer.nextToken(new Token()), hasContent("character" + > LF + "Escaped")); > } > > - @Test > + @Test // TODO is this correct? Do we expect TAB to be un/escaped? > public void testEscapedTab() throws Exception { > final Lexer lexer = getLexer("character\\" + TAB + "Escaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > TAB + "Escaped")); > } > > - @Test > + @Test // TODO is this correct? Do we expect BACKSPACE to be > un/escaped? > public void testEscapeBackspace() throws Exception { > final Lexer lexer = getLexer("character\\" + BACKSPACE + > "Escaped", formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > BACKSPACE + "Escaped")); > } > > - @Test > + @Test // TODO is this correct? Do we expect FF to be un/escaped? > public void testEscapeFF() throws Exception { > final Lexer lexer = getLexer("character\\" + FF + "Escaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), hasContent("character" + > FF + "Escaped")); > } > > - // FIXME this should work after CSV-58 is resolved. Currently the > result will be "charactera\NEscaped" > @Test > - @Ignore > public void testEscapedMySqlNullValue() throws Exception { > // MySQL uses \N to symbolize null values. We have to restore this > final Lexer lexer = getLexer("character\\NEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), > hasContent("character\\NEscaped")); > } > > - // FIXME this should work after CSV-58 is resolved. Currently the > result will be "characteraEscaped" > @Test > - @Ignore > public void testEscapedCharacter() throws Exception { > final Lexer lexer = getLexer("character\\aEscaped", > formatWithEscaping); > assertThat(lexer.nextToken(new Token()), > hasContent("character\\aEscaped")); > } > > - // FIXME this should work after CSV-58 is resolved. Currently the > result will be "characterCREscaped" > @Test > - @Ignore > public void testEscapedControlCharacter() throws Exception { > - // we are explicitly using an escape different from \ here, > because \r is the character sequence for CR > + // we are explicitly using an escape different from \ here > final Lexer lexer = getLexer("character!rEscaped", > CSVFormat.newBuilder().withEscape('!').build()); > - assertThat(lexer.nextToken(new Token()), > hasContent("character!rEscaped")); > + assertThat(lexer.nextToken(new Token()), hasContent("character" + > CR + "Escaped")); > } > > @Test > > > -- http://people.apache.org/~britter/ http://www.systemoutprintln.de/ http://twitter.com/BenediktRitter http://github.com/britter