Repository: phoenix Updated Branches: refs/heads/3.0 b5cbb79b6 -> 8e19e68bb
http://git-wip-us.apache.org/repos/asf/phoenix/blob/8e19e68b/phoenix-core/src/main/java/org/apache/commons/csv/Lexer.java ---------------------------------------------------------------------- diff --git a/phoenix-core/src/main/java/org/apache/commons/csv/Lexer.java b/phoenix-core/src/main/java/org/apache/commons/csv/Lexer.java deleted file mode 100644 index 95cf13d..0000000 --- a/phoenix-core/src/main/java/org/apache/commons/csv/Lexer.java +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.commons.csv; - -import static org.apache.commons.csv.Constants.BACKSPACE; -import static org.apache.commons.csv.Constants.CR; -import static org.apache.commons.csv.Constants.END_OF_STREAM; -import static org.apache.commons.csv.Constants.FF; -import static org.apache.commons.csv.Constants.LF; -import static org.apache.commons.csv.Constants.TAB; -import static org.apache.commons.csv.Constants.UNDEFINED; -import static org.apache.commons.csv.Token.Type.COMMENT; -import static org.apache.commons.csv.Token.Type.EOF; -import static org.apache.commons.csv.Token.Type.EORECORD; -import static org.apache.commons.csv.Token.Type.INVALID; -import static org.apache.commons.csv.Token.Type.TOKEN; - -import java.io.IOException; - -/** - * - * - * @version $Id: Lexer.java 1512650 2013-08-10 11:46:28Z britter $ - */ -final class Lexer { - - /** - * Constant char to use for disabling comments, escapes and encapsulation. The value -2 (0xFFFE as a char) is used - * because it won't be confused with an EOF signal (-1), and because the Unicode value {@code FFFE} is a reserved - * noncharacter (a single char, not a surrogate pair), so there should never be a collision with a real text char. - */ - private static final char DISABLED = '\ufffe'; - - private final char delimiter; - private final char escape; - private final char quoteChar; - private final char commentStart; - - private final boolean ignoreSurroundingSpaces; - private final boolean ignoreEmptyLines; - - /** The input stream */ - private final ExtendedBufferedReader in; - - /** INTERNAL API.
but ctor needs to be called dynamically by PerformanceTest class */ - Lexer(final CSVFormat format, final ExtendedBufferedReader in) { - this.in = in; - this.delimiter = format.getDelimiter(); - this.escape = mapNullToDisabled(format.getEscape()); - this.quoteChar = mapNullToDisabled(format.getQuoteChar()); - this.commentStart = mapNullToDisabled(format.getCommentStart()); - this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); - this.ignoreEmptyLines = format.getIgnoreEmptyLines(); - } - - /** - * Returns the next token. - * <p/> - * A token corresponds to a term, a record change or an end-of-file indicator. - * - * @param token - * an existing Token object to reuse. The caller is responsible to initialize the Token. - * @return the next token found - * @throws java.io.IOException - * on stream access error - */ - Token nextToken(final Token token) throws IOException { - - // get the last read char (required for empty line detection) - int lastChar = in.getLastChar(); - - // read the next char and set eol - int c = in.read(); - /* - * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF - * - they are equivalent here. - */ - boolean eol = readEndOfLine(c); - - // empty line detection: eol AND (last char was EOL or beginning) - if (ignoreEmptyLines) { - while (eol && isStartOfLine(lastChar)) { - // go on char ahead ... - lastChar = c; - c = in.read(); - eol = readEndOfLine(c); - // reached end of file without any content (empty line at the end) - if (isEndOfFile(c)) { - token.type = EOF; - // don't set token.isReady here because no content - return token; - } - } - } - - // did we reach eof during the last iteration already ? 
EOF - if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) { - token.type = EOF; - // don't set token.isReady here because no content - return token; - } - - if (isStartOfLine(lastChar) && isCommentStart(c)) { - final String line = in.readLine(); - if (line == null) { - token.type = EOF; - // don't set token.isReady here because no content - return token; - } - final String comment = line.trim(); - token.content.append(comment); - token.type = COMMENT; - return token; - } - - // important: make sure a new char gets consumed in each iteration - while (token.type == INVALID) { - // ignore whitespaces at beginning of a token - if (ignoreSurroundingSpaces) { - while (isWhitespace(c) && !eol) { - c = in.read(); - eol = readEndOfLine(c); - } - } - - // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { - // empty token return TOKEN("") - token.type = TOKEN; - } else if (eol) { - // empty token return EORECORD("") - // noop: token.content.append(""); - token.type = EORECORD; - } else if (isQuoteChar(c)) { - // consume encapsulated token - parseEncapsulatedToken(token); - } else if (isEndOfFile(c)) { - // end of file return EOF() - // noop: token.content.append(""); - token.type = EOF; - token.isReady = true; // there is data at EOF - } else { - // next token must be a simple token - // add removed blanks when not ignoring whitespace chars... - parseSimpleToken(token, c); - } - } - return token; - } - - /** - * Parses a simple token. - * <p/> - * Simple token are tokens which are not surrounded by encapsulators. A simple token might contain escaped - * delimiters (as \, or \;). 
The token is finished when one of the following conditions become true: - * <ul> - * <li>end of line has been reached (EORECORD)</li> - * <li>end of stream has been reached (EOF)</li> - * <li>an unescaped delimiter has been reached (TOKEN)</li> - * </ul> - * - * @param token - * the current token - * @param ch - * the current character - * @return the filled token - * @throws IOException - * on stream access error - */ - private Token parseSimpleToken(final Token token, int ch) throws IOException { - // Faster to use while(true)+break than while(token.type == INVALID) - while (true) { - if (readEndOfLine(ch)) { - token.type = EORECORD; - break; - } else if (isEndOfFile(ch)) { - token.type = EOF; - token.isReady = true; // There is data at EOF - break; - } else if (isDelimiter(ch)) { - token.type = TOKEN; - break; - } else if (isEscape(ch)) { - final int unescaped = readEscape(); - if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape - token.content.append((char) ch).append((char) in.getLastChar()); - } else { - token.content.append((char) unescaped); - } - ch = in.read(); // continue - } else { - token.content.append((char) ch); - ch = in.read(); // continue - } - } - - if (ignoreSurroundingSpaces) { - trimTrailingSpaces(token.content); - } - - return token; - } - - /** - * Parses an encapsulated token. - * <p/> - * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included - * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after - * an encapsulated token are ignored. 
The token is finished when one of the following conditions become true: - * <ul> - * <li>an unescaped encapsulator has been reached, and is followed by optional whitespace then:</li> - * <ul> - * <li>delimiter (TOKEN)</li> - * <li>end of line (EORECORD)</li> - * </ul> - * <li>end of stream has been reached (EOF)</li> </ul> - * - * @param token - * the current token - * @return a valid token object - * @throws IOException - * on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL - */ - private Token parseEncapsulatedToken(final Token token) throws IOException { - // save current line number in case needed for IOE - final long startLineNumber = getCurrentLineNumber(); - int c; - while (true) { - c = in.read(); - - if (isEscape(c)) { - final int unescaped = readEscape(); - if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape - token.content.append((char) c).append((char) in.getLastChar()); - } else { - token.content.append((char) unescaped); - } - } else if (isQuoteChar(c)) { - if (isQuoteChar(in.lookAhead())) { - // double or escaped encapsulator -> add single encapsulator to token - c = in.read(); - token.content.append((char) c); - } else { - // token finish mark (encapsulator) reached: ignore whitespace till delimiter - while (true) { - c = in.read(); - if (isDelimiter(c)) { - token.type = TOKEN; - return token; - } else if (isEndOfFile(c)) { - token.type = EOF; - token.isReady = true; // There is data at EOF - return token; - } else if (readEndOfLine(c)) { - token.type = EORECORD; - return token; - } else if (!isWhitespace(c)) { - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + - ") invalid char between encapsulated token and delimiter."); - } - } - } - } else if (isEndOfFile(c)) { - // error condition (end of file before end of token) - throw new IOException("(startline " + startLineNumber + - ") EOF reached before encapsulated token 
finished"); - } else { - // consume character - token.content.append((char) c); - } - } - } - - private char mapNullToDisabled(final Character c) { - return c == null ? DISABLED : c.charValue(); - } - - /** - * Returns the current line number - * - * @return the current line number - */ - long getCurrentLineNumber() { - return in.getCurrentLineNumber(); - } - - // TODO escape handling needs more work - /** - * Handle an escape sequence. - * The current character must be the escape character. - * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()} - * on the input stream. - * - * @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is - * invalid. - * @throws IOException if there is a problem reading the stream or the end of stream is detected: - * the escape character is not allowed at end of stream - */ - int readEscape() throws IOException { - // the escape char has just been read (normally a backslash) - final int ch = in.read(); - switch (ch) { - case 'r': - return CR; - case 'n': - return LF; - case 't': - return TAB; - case 'b': - return BACKSPACE; - case 'f': - return FF; - case CR: - case LF: - case FF: // TODO is this correct? - case TAB: // TODO is this correct? Do tabs need to be escaped? - case BACKSPACE: // TODO is this correct?
- return ch; - case END_OF_STREAM: - throw new IOException("EOF whilst processing escape sequence"); - default: - // Now check for meta-characters - if (isMetaChar(ch)) { - return ch; - } - // indicate unexpected char - available from in.getLastChar() - return END_OF_STREAM; - } - } - - void trimTrailingSpaces(final StringBuilder buffer) { - int length = buffer.length(); - while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) { - length = length - 1; - } - if (length != buffer.length()) { - buffer.setLength(length); - } - } - - /** - * Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character... - * - * @return true if the given or next character is a line-terminator - */ - boolean readEndOfLine(int ch) throws IOException { - // check if we have \r\n... - if (ch == CR && in.lookAhead() == LF) { - // note: does not change ch outside of this method! - ch = in.read(); - } - return ch == LF || ch == CR; - } - - boolean isClosed() { - return in.isClosed(); - } - - /** - * @return true if the given char is a whitespace character - */ - boolean isWhitespace(final int ch) { - return !isDelimiter(ch) && Character.isWhitespace((char) ch); - } - - /** - * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file. - * - * @param ch the character to check - * @return true if the character is at the start of a line. 
- */ - boolean isStartOfLine(final int ch) { - return ch == LF || ch == CR || ch == UNDEFINED; - } - - /** - * @return true if the given character indicates end of file - */ - boolean isEndOfFile(final int ch) { - return ch == END_OF_STREAM; - } - - boolean isDelimiter(final int ch) { - return ch == delimiter; - } - - boolean isEscape(final int ch) { - return ch == escape; - } - - boolean isQuoteChar(final int ch) { - return ch == quoteChar; - } - - boolean isCommentStart(final int ch) { - return ch == commentStart; - } - - private boolean isMetaChar(final int ch) { - return ch == delimiter || - ch == escape || - ch == quoteChar || - ch == commentStart; - } - - /** - * Closes resources. - * - * @throws IOException - * If an I/O error occurs - */ - void close() throws IOException { - in.close(); - } -} http://git-wip-us.apache.org/repos/asf/phoenix/blob/8e19e68b/phoenix-core/src/main/java/org/apache/commons/csv/Quote.java ---------------------------------------------------------------------- diff --git a/phoenix-core/src/main/java/org/apache/commons/csv/Quote.java b/phoenix-core/src/main/java/org/apache/commons/csv/Quote.java deleted file mode 100644 index 774da96..0000000 --- a/phoenix-core/src/main/java/org/apache/commons/csv/Quote.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.csv; - -/** - * Defines quote behavior when printing. - * - * @version $Id: Quote.java 1465441 2013-04-07 18:38:56Z britter $ - */ -public enum Quote { - - /** - * Quotes all fields. - */ - ALL, - - /** - * Quotes fields which contain special characters such as a delimiter, quote character or any of the characters in - * line separator. - */ - MINIMAL, - - /** - * Quotes all non-numeric fields. - */ - NON_NUMERIC, - - /** - * Never quotes fields. When the delimiter occurs in data, it is preceded by the current escape character. If the - * escape character is not set, printing will throw an exception if any characters that require escaping are - * encountered. - */ - NONE -} http://git-wip-us.apache.org/repos/asf/phoenix/blob/8e19e68b/phoenix-core/src/main/java/org/apache/commons/csv/Token.java ---------------------------------------------------------------------- diff --git a/phoenix-core/src/main/java/org/apache/commons/csv/Token.java b/phoenix-core/src/main/java/org/apache/commons/csv/Token.java deleted file mode 100644 index 7049e67..0000000 --- a/phoenix-core/src/main/java/org/apache/commons/csv/Token.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.csv; - -import static org.apache.commons.csv.Token.Type.INVALID; - -/** - * Internal token representation. - * <p/> - * It is used as contract between the lexer and the parser. - * - * @version $Id: Token.java 1509069 2013-08-01 02:04:27Z ggregory $ - */ -final class Token { - - /** length of the initial token (content-)buffer */ - private static final int INITIAL_TOKEN_LENGTH = 50; - - enum Type { - /** Token has no valid content, i.e. is in its initialized state. */ - INVALID, - - /** Token with content, at beginning or in the middle of a line. */ - TOKEN, - - /** Token (which can have content) when the end of file is reached. */ - EOF, - - /** Token with content when the end of a line is reached. */ - EORECORD, - - /** Token is a comment line. */ - COMMENT - } - - /** Token type */ - Token.Type type = INVALID; - - /** The content buffer. */ - final StringBuilder content = new StringBuilder(INITIAL_TOKEN_LENGTH); - - /** Token ready flag: indicates a valid token with content (ready for the parser). */ - boolean isReady; - - void reset() { - content.setLength(0); - type = INVALID; - isReady = false; - } - - /** - * Eases IDE debugging. - * - * @return a string helpful for debugging. 
- */ - @Override - public String toString() { - return type.name() + " [" + content.toString() + "]"; - } -} http://git-wip-us.apache.org/repos/asf/phoenix/blob/8e19e68b/phoenix-core/src/main/java/org/apache/commons/csv/package-info.java ---------------------------------------------------------------------- diff --git a/phoenix-core/src/main/java/org/apache/commons/csv/package-info.java b/phoenix-core/src/main/java/org/apache/commons/csv/package-info.java deleted file mode 100644 index 29e7fef..0000000 --- a/phoenix-core/src/main/java/org/apache/commons/csv/package-info.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Apache Commons CSV Format Support. - * - * <p>CSV files are widely used as interfaces to legacy systems or manual data-imports. - * CSV stands for "Comma Separated Values" (or sometimes "Character Separated - * Values"). The CSV data format is defined in - * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> - * but many dialects exist.</p> - * - * <p>Common to all file dialects is their basic structure: The CSV data-format - * is record-oriented: each record starts on a new textual line. A - * record is built from a list of values.
Keep in mind that not all records - * must have an equal number of values:</p> - * <pre> - * csv := records* - * record := values* - * </pre> - * - * <p>The following list contains the CSV aspects the Commons CSV parser supports:</p> - * <dl> - * <dt>Separators (for lines)</dt> - * <dd>The record separators are hardcoded and cannot be changed. They must be '\r', '\n' or '\r\n'.</dd> - * - * <dt>Delimiter (for values)</dt> - * <dd>The delimiter for values is freely configurable (default ',').</dd> - * - * <dt>Comments</dt> - * <dd>Some CSV-dialects support a simple comment syntax. A comment is a record - * which must start with a designated character (the commentStarter). A record - * of this kind is treated as a comment and gets removed from the input (default none)</dd> - * - * <dt>Encapsulator</dt> - * <dd>Two encapsulator characters (default '"') are used to enclose -> complex values.</dd> - * - * <dt>Simple values</dt> - * <dd>A simple value consists of all characters (except the delimiter) until - * (but not including) the next delimiter or a record-terminator. Optionally - * all surrounding whitespace of a simple value can be ignored (default: true).</dd> - * - * <dt>Complex values</dt> - * <dd>Complex values are encapsulated within a pair of the defined encapsulator characters. - * The encapsulator itself must be escaped or doubled when used inside complex values. - * Complex values preserve all kinds of formatting (including newlines -> multiline-values)</dd> - * - * <dt>Empty line skipping</dt> - * <dd>Optionally empty lines in CSV files can be skipped.
- * Otherwise, empty lines will return a record with a single empty value.</dd> - * </dl> - * - * <p>In addition to individually defined dialects, two predefined dialects (strict-csv, and excel-csv) - * can be set directly.</p> <!-- TODO fix --> - * - * <p>Example usage:</p> - * <blockquote><pre> - * Reader in = new StringReader("a,b,c"); - * for (CSVRecord record : CSVFormat.DEFAULT.parse(in)) { - * for (String field : record) { - * System.out.print("\"" + field + "\", "); - * } - * System.out.println(); - * } - * </pre></blockquote> - */ - -package org.apache.commons.csv; http://git-wip-us.apache.org/repos/asf/phoenix/blob/8e19e68b/phoenix-core/src/main/java/org/apache/phoenix/util/CSVCommonsLoader.java ---------------------------------------------------------------------- diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/CSVCommonsLoader.java b/phoenix-core/src/main/java/org/apache/phoenix/util/CSVCommonsLoader.java index ef4375c..0bf366e 100644 --- a/phoenix-core/src/main/java/org/apache/phoenix/util/CSVCommonsLoader.java +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/CSVCommonsLoader.java @@ -17,6 +17,7 @@ */ package org.apache.phoenix.util; +import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; @@ -133,7 +134,7 @@ public class CSVCommonsLoader { CSVFormat format = CSVFormat.DEFAULT .withIgnoreEmptyLines(true) .withDelimiter(asControlCharacter(fieldDelimiter)) - .withQuoteChar(asControlCharacter(quoteCharacter)); + .withQuote(asControlCharacter(quoteCharacter)); if (escapeCharacter != null) { format = format.withEscape(asControlCharacter(escapeCharacter)); @@ -187,8 +188,7 @@ public class CSVCommonsLoader { * @throws Exception */ public void upsert(String fileName) throws Exception { - CSVParser parser = CSVParser.parse(new File(fileName), - format); + CSVParser parser = CSVParser.parse(new File(fileName), Charsets.UTF_8, 
format); upsert(parser); } http://git-wip-us.apache.org/repos/asf/phoenix/blob/8e19e68b/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 4f412c4..5bf9ee7 100644 --- a/pom.xml +++ b/pom.xml @@ -89,6 +89,7 @@ <commons-io.version>2.1</commons-io.version> <commons-lang.version>2.5</commons-lang.version> <commons-logging.version>1.1.1</commons-logging.version> + <commons-csv.version>1.0</commons-csv.version> <sqlline.version>1.1.2</sqlline.version> <guava.version>12.0.1</guava.version> <jackson.version>1.8.8</jackson.version> @@ -484,6 +485,11 @@ <artifactId>joda-time</artifactId> <version>${jodatime.version}</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + <version>${commons-csv.version}</version> + </dependency> <!-- Test Dependencies --> <dependency>