scolebourne 2004/02/13 16:31:55 Modified: lang/src/test/org/apache/commons/lang TokenizerTest.java lang/src/java/org/apache/commons/lang Tokenizer.java Log: Improve Tokenizer with CSV and TSV plus change default to StringTokenizer like
includes code from Matthew Inger Revision Changes Path 1.2 +136 -3 jakarta-commons/lang/src/test/org/apache/commons/lang/TokenizerTest.java Index: TokenizerTest.java =================================================================== RCS file: /home/cvs/jakarta-commons/lang/src/test/org/apache/commons/lang/TokenizerTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- TokenizerTest.java 17 Nov 2003 23:02:18 -0000 1.1 +++ TokenizerTest.java 14 Feb 2004 00:31:55 -0000 1.2 @@ -1,7 +1,7 @@ /* ==================================================================== * The Apache Software License, Version 1.1 * - * Copyright (c) 2002-2003 The Apache Software Foundation. All rights + * Copyright (c) 2003-2004 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without @@ -90,6 +90,9 @@ String input = "a;b;c;\"d;\"\"e\";f; ; ;"; Tokenizer tok = new Tokenizer(input); tok.setDelimiterChar(';'); + tok.setQuoteChar('"'); + tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER); + tok.setIgnoreEmptyTokens(false); String tokens [] = tok.getAllTokens(); String expected[] = new String[] @@ -120,7 +123,9 @@ String input = "a;b;c ;\"d;\"\"e\";f; ; ;"; Tokenizer tok = new Tokenizer(input); tok.setDelimiterChar(';'); + tok.setQuoteChar('"'); tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER); + tok.setIgnoreEmptyTokens(false); String tokens [] = tok.getAllTokens(); String expected[] = new String[] @@ -151,7 +156,9 @@ String input = "a;b; c;\"d;\"\"e\";f; ; ;"; Tokenizer tok = new Tokenizer(input); tok.setDelimiterChar(';'); + tok.setQuoteChar('"'); tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER); + tok.setIgnoreEmptyTokens(false); String tokens [] = tok.getAllTokens(); String expected[] = new String[] @@ -182,6 +189,8 @@ String input = "a;b; c;\"d;\"\"e\";f; ; ;"; Tokenizer tok = new Tokenizer(input); tok.setDelimiterChar(';'); + tok.setQuoteChar('"'); + tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER); 
tok.setIgnoreEmptyTokens(true); String tokens [] = tok.getAllTokens(); @@ -210,6 +219,9 @@ String input = "a;b; c;\"d;\"\"e\";f; ; ;"; Tokenizer tok = new Tokenizer(input); tok.setDelimiterChar(';'); + tok.setQuoteChar('"'); + tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER); + tok.setIgnoreEmptyTokens(false); tok.setEmptyTokenAsNull(true); String tokens [] = tok.getAllTokens(); @@ -241,6 +253,9 @@ String input = "a;b; c;\"d;\"\"e\";f; ; ;"; Tokenizer tok = new Tokenizer(input); tok.setDelimiterChar(';'); + tok.setQuoteChar('"'); + tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER); + tok.setIgnoreEmptyTokens(false); // tok.setTreatingEmptyAsNull(true); String tokens [] = tok.getAllTokens(); @@ -285,7 +300,8 @@ String input = "a b c \"d e\" f "; Tokenizer tok = new Tokenizer(input); - tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER); + tok.setDelimiterMatcher(Tokenizer.SPACE_MATCHER); + tok.setQuoteMatcher(Tokenizer.DOUBLE_QUOTE_MATCHER); tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER); tok.setIgnoreEmptyTokens(false); String tokens [] = tok.getAllTokens(); @@ -317,7 +333,8 @@ String input = "a b c \"d e\" f "; Tokenizer tok = new Tokenizer(input); - tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER); + tok.setDelimiterMatcher(Tokenizer.SPACE_MATCHER); + tok.setQuoteMatcher(Tokenizer.DOUBLE_QUOTE_MATCHER); tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER); tok.setIgnoreEmptyTokens(true); String tokens [] = tok.getAllTokens(); @@ -341,4 +358,120 @@ } + public void testBasic1() { + String input = "a b c"; + Tokenizer tok = new Tokenizer(input); + assertEquals("a", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasic2() { + String input = "a \nb\fc"; + Tokenizer tok = new Tokenizer(input); + assertEquals("a", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasic3() { + String input = "a \nb\u0001\fc"; + Tokenizer tok = new Tokenizer(input); + assertEquals("a", tok.next()); 
+ assertEquals("b\u0001", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasic4() { + String input = "a \"b\" c"; + Tokenizer tok = new Tokenizer(input); + assertEquals("a", tok.next()); + assertEquals("\"b\"", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasicQuoted1() { + String input = "a \"b\" c"; + Tokenizer tok = new Tokenizer(input, ' ', '"'); + assertEquals("a", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasicDelim1() { + String input = "a:b:c"; + Tokenizer tok = new Tokenizer(input, ':'); + assertEquals("a", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasicDelim2() { + String input = "a:b:c"; + Tokenizer tok = new Tokenizer(input, ','); + assertEquals("a:b:c", tok.next()); + } + + public void testBasicEmpty1() { + String input = "a b c"; + Tokenizer tok = new Tokenizer(input); + tok.setIgnoreEmptyTokens(false); + assertEquals("a", tok.next()); + assertEquals("", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + } + + public void testBasicEmpty2() { + String input = "a b c"; + Tokenizer tok = new Tokenizer(input); + tok.setIgnoreEmptyTokens(false); + tok.setEmptyTokenAsNull(true); + assertEquals("a", tok.next()); + assertEquals(null, tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + } + + public void testGetContent() { + String input = "a b c \"d e\" f "; + Tokenizer tok = new Tokenizer(input); + assertSame(input, tok.getContent()); + + tok = new Tokenizer(input.toCharArray()); + assertEquals(input, tok.getContent()); + } + + public void testReset() { + String input = "a b c"; + Tokenizer tok = new Tokenizer(input); + assertEquals("a", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + tok.reset(); + assertEquals("a", tok.next()); + assertEquals("b", tok.next()); + assertEquals("c", tok.next()); + 
tok.reset("d e"); + assertEquals("d", tok.next()); + assertEquals("e", tok.next()); + tok.reset("f g".toCharArray()); + assertEquals("f", tok.next()); + assertEquals("g", tok.next()); + } + + public void testMatcher() { + assertEquals(true, Tokenizer.SPACE_MATCHER.isMatch(' ')); + assertEquals(false, Tokenizer.SPACE_MATCHER.isMatch('\n')); + assertEquals(false, Tokenizer.SPACE_MATCHER.isMatch('\u0001')); + + assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch(' ')); + assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch('\n')); + assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch('\u0001')); + + assertEquals(true, Tokenizer.SPLIT_MATCHER.isMatch(' ')); + assertEquals(true, Tokenizer.SPLIT_MATCHER.isMatch('\n')); + assertEquals(false, Tokenizer.SPLIT_MATCHER.isMatch('\u0001')); + } + } 1.4 +247 -55 jakarta-commons/lang/src/java/org/apache/commons/lang/Tokenizer.java Index: Tokenizer.java =================================================================== RCS file: /home/cvs/jakarta-commons/lang/src/java/org/apache/commons/lang/Tokenizer.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- Tokenizer.java 13 Feb 2004 01:58:50 -0000 1.3 +++ Tokenizer.java 14 Feb 2004 00:31:55 -0000 1.4 @@ -1,7 +1,7 @@ /* ==================================================================== * The Apache Software License, Version 1.1 * - * Copyright (c) 2002-2003 The Apache Software Foundation. All rights + * Copyright (c) 2003-2004 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without @@ -62,9 +62,9 @@ * Tokenizes a string based based on delimiters (separators) * and supporting quoting and ignored character concepts. * <p> - * This class can split a String into many smaller strings. It aims to do a - * similar job to java util StringTokenizer, however it offers much more - * control and flexibility. + * This class can split a String into many smaller strings. 
+ * It aims to do a similar job to java util StringTokenizer, however it offers + * much more control and flexibility. By default, it is set up like StringTokenizer. * <p> * The input String is split into a number of <i>tokens</i>. * Each token is separated from the next String by a <i>delimiter</i>. @@ -73,22 +73,36 @@ * The processing then strips all the <i>ignored</i> characters from each side of the token. * The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized. * Empty tokens may be removed or returned as null. + * This example is based on the CSV tokenizer. * <pre> - * "a,b,c" - Three tokens "a","b","c" (comma delimiter) - * "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped) + * "a,b,c" - Three tokens "a","b","c" (comma delimiter) + * "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped) * "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched) * </pre> * <p> - * By default, this tokenizer has the following properties: - * <pre> - * Property Default - * --------- ------- - * delimiter , (comma) - * quote " (double quote) - * ignored char <= 32 (as per trim) - * emptyTokenAsNull false - * ignoreEmptyTokens false - * </pre> + * + * This tokenizer has the following properties and options: + * + * <table> + * <tr> + * <th>Property</th><th>Type</th><th>Default</th> + * </tr> + * <tr> + * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> + * </tr> + * <tr> + * <td>quote</td><td>NoneMatcher</td><td>{}</td> + * </tr> + * <tr> + * <td>ignore</td><td>NoneMatcher</td><td>{}</td> + * </tr> + * <tr> + * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> + * </tr> + * <tr> + * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> + * </tr> + * </table> * * @author Matthew Inger * @author Stephen Colebourne @@ -96,17 +110,30 @@ * @since 2.1 * @version $Id$ */ -public class Tokenizer implements ListIterator { - // TODO: Constructors - // TODO: Tests - // TODO: Static factories 
CSV/StringTokenizer - +public class Tokenizer implements ListIterator, Cloneable { + /** * A Matcher which matches the comma character. * Best used for <code>delimiter</code>. */ public static final Matcher COMMA_MATCHER = new CharMatcher(','); /** + * A Matcher which matches the tab character. + * Best used for <code>delimiter</code>. + */ + public static final Matcher TAB_MATCHER = new CharMatcher('\t'); + /** + * A Matcher which matches the space character. + * Best used for <code>delimiter</code>. + */ + public static final Matcher SPACE_MATCHER = new CharMatcher(' '); + /** + * A Matcher which matches the same characters as StringTokenizer, + * namely space, tab, newline, formfeed. + * Best used for <code>delimiter</code>. + */ + public static final Matcher SPLIT_MATCHER = new CharSetMatcher(" \t\n\r\f"); + /** * A Matcher which matches the double quote character. * Best used for <code>quote</code>. */ @@ -115,98 +142,199 @@ * A Matcher which matches the String trim() whitespace characters. * Best used for <code>ignored</code>. */ - public static final Matcher SPACES_MATCHER = new TrimMatcher(); + public static final Matcher TRIM_MATCHER = new TrimMatcher(); /** * A Matcher that matches no characters. Don't use this for delimiters! * Best used for <code>ignored</code>. 
*/ public static final Matcher NONE_MATCHER = new NoMatcher(); + + private static final Tokenizer CSV_TOKENIZER_PROTOTYPE; + private static final Tokenizer TSV_TOKENIZER_PROTOTYPE; + + static { + CSV_TOKENIZER_PROTOTYPE = new Tokenizer(StringUtils.EMPTY); + CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER); + CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER); + CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER); + CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); + CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); + + TSV_TOKENIZER_PROTOTYPE = new Tokenizer(StringUtils.EMPTY); + TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER); + TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER); + TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER); + TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); + TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); + } /** The text to work on */ private char chars[]; + /** The input text, null if char[] input */ + private String text; /** The parsed tokens */ private String tokens[]; /** The current iteration position */ private int tokenPos; /** The delimiter matcher */ - private Matcher delim = COMMA_MATCHER; + private Matcher delim = SPLIT_MATCHER; /** The quote matcher */ - private Matcher quote = DOUBLE_QUOTE_MATCHER; + private Matcher quote = NONE_MATCHER; /** The ignored matcher */ - private Matcher ignored = SPACES_MATCHER; + private Matcher ignored = NONE_MATCHER; /** Whether to return empty tokens as null */ private boolean emptyAsNull = false; /** Whether to ignore empty tokens */ - private boolean ignoreEmptyTokens = false; + private boolean ignoreEmptyTokens = true; + + //----------------------------------------------------------------------- + /** + * Get a tokenizer instance which parses Comma Separated Value + * strings. You must call a "reset" method to set the string which + * you want to parse. 
+ */ + public static final Tokenizer getCSVInstance() { + return (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone()); + } + + /** + * Get a tokenizer instance which parses Comma Separated Value + * strings, initializing it with the given input. + * + * @param input the string to parse + */ + public static final Tokenizer getCSVInstance(String input) { + Tokenizer tok = (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone()); + tok.reset(input); + return tok; + } + + /** + * Get a tokenizer instance which parses Comma Separated Value + * strings, initializing it with the given input. + * + * @param input the text to parse + */ + public static final Tokenizer getCSVInstance(char[] input) { + Tokenizer tok = (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone()); + tok.reset(input); + return tok; + } + + /** + * Get a tokenizer instance which parses Tab Separated Value + * strings. You must call a "reset" method to set the string which + * you want to parse. + */ + public static final Tokenizer getTSVInstance() { + return (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone()); + } + + /** + * Get a tokenizer instance which parses Tab Separated Value + * strings, initializing it with the given input. + * + * @param input the string to parse + */ + public static final Tokenizer getTSVInstance(String input) { + Tokenizer tok = (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone()); + tok.reset(input); + return tok; + } + + /** + * Get a tokenizer instance which parses Tab Separated Value + * strings, initializing it with the given input. + * + * @param input the text to parse + */ + public static final Tokenizer getTSVInstance(char[] input) { + Tokenizer tok = (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone()); + tok.reset(input); + return tok; + } //----------------------------------------------------------------------- /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. 
* * @param input the string which is to be parsed */ public Tokenizer(String input) { - this(input.toCharArray()); + super(); + this.text = input; + this.chars = input.toCharArray(); // no clone as toCharArray() clones } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * * @param input the string which is to be parsed * @param delim the field delimiter character */ public Tokenizer(String input, char delim) { - this(input.toCharArray(), delim); + this(input); + setDelimiterChar(delim); } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * * @param input the string which is to be parsed * @param delim the field delimiter character */ public Tokenizer(String input, CharSetMatcher delim) { - this(input.toCharArray(), delim); + this(input); + setDelimiterMatcher(delim); } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * * @param input the string which is to be parsed * @param delim the field delimiter character * @param quote the field quoted string character */ public Tokenizer(String input, char delim, char quote) { - this(input.toCharArray(), delim, quote); + this(input, delim); + setQuoteChar(quote); } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * * @param input the string which is to be parsed * @param delim the field delimiter character * @param quote the field quoted string character */ public Tokenizer(String input, CharSetMatcher delim, CharSetMatcher quote) { - this(input.toCharArray(), delim, quote); + this(input, delim); + setQuoteMatcher(quote); } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. 
* - * @param input the string which is to be parsed + * @param input the string which is to be parsed, cloned */ public Tokenizer(char[] input) { super(); + this.text = null; this.chars = (char[]) input.clone(); - this.tokenPos = 0; } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * - * @param input the string which is to be parsed + * @param input the string which is to be parsed, cloned * @param delim the field delimiter character */ public Tokenizer(char[] input, char delim) { @@ -215,9 +343,10 @@ } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * - * @param input the string which is to be parsed + * @param input the string which is to be parsed, cloned * @param delim the field delimiter character */ public Tokenizer(char[] input, CharSetMatcher delim) { @@ -226,9 +355,10 @@ } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * - * @param input the string which is to be parsed + * @param input the string which is to be parsed, cloned * @param delim the field delimiter character * @param quote the field quoted string character */ @@ -238,9 +368,10 @@ } /** - * Constructor. + * Constructs a tokenizer splitting on space, tab, newline and formfeed + * as per StringTokenizer. * - * @param input the string which is to be parsed + * @param input the string which is to be parsed, cloned * @param delim the field delimiter character * @param quote the field quoted string character */ @@ -307,6 +438,32 @@ tokens = null; } + /** + * Reset this tokenizer, giving it a new input string to parse. + * In this manner you can re-use a tokenizer with the same settings + * on multiple input lines. 
+ * + * @param input the new string to tokenize + */ + public void reset(String input) { + reset(); + this.text = input; + chars = input.toCharArray(); // no clone as toCharArray() clones + } + + /** + * Reset this tokenizer, giving it a new input string to parse. + * In this manner you can re-use a tokenizer with the same settings + * on multiple input lines. + * + * @param input the new character array to tokenize, cloned + */ + public void reset(char [] input) { + reset(); + this.text = null; + chars = (char[]) input.clone(); + } + // ListIterator //----------------------------------------------------------------------- /** @@ -473,15 +630,18 @@ token.setLength(0); int len = chars.length; - // skip all leading whitespace, unless it is the + // Skip all leading whitespace, unless it is the // field delimiter or the quote character - while (start < len && - ignored.isMatch(chars[start]) && - !delim.isMatch(chars[start]) && - !quote.isMatch(chars[start])) { - start++; + int current = start; + while (current < len && + ignored.isMatch(chars[current]) && + !delim.isMatch(chars[current]) && + !quote.isMatch(chars[current])) { + current++; } + start = current; + // Read the token depending on what the first // character is like if (delim.isMatch(chars[start])) { @@ -763,6 +923,36 @@ this.ignoreEmptyTokens = ignoreEmptyTokens; } + //----------------------------------------------------------------------- + /** + * Gets the String content that the tokenizer is parsing. + * + * @return the string content being parsed + */ + public String getContent() { + if (text == null) { + text = new String(chars); + } + return text; + } + + //----------------------------------------------------------------------- + /** + * Create a new instance of this Tokenizer. + * The new instance is reset so that it will be at the start of the token list. 
+ */ + public Object clone() { + try { + Tokenizer cloned = (Tokenizer) super.clone(); + // chars[] does not need additional clone as it is treated as immutable + cloned.reset(); + return cloned; + + } catch (CloneNotSupportedException ex) { + return null; + } + } + //----------------------------------------------------------------------- /** * Class used to define a set of characters for matching purposes. @@ -801,7 +991,9 @@ * @param chars the characters to match, must not be null */ public CharSetMatcher(String chars) { - this(chars.toCharArray()); + super(); + this.chars = chars.toCharArray(); + Arrays.sort(this.chars); } /** --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]