lang Tokenizer.java

scolebourne Fri, 13 Feb 2004 16:31:45 -0800

scolebourne    2004/02/13 16:31:55

  Modified:    lang/src/test/org/apache/commons/lang TokenizerTest.java
               lang/src/java/org/apache/commons/lang Tokenizer.java
  Log:
  Improve Tokenizer with CSV and TSV support, plus change the default setup to be StringTokenizer-like


  includes code from Matthew Inger
  
  Revision  Changes    Path
  1.2       +136 -3    
jakarta-commons/lang/src/test/org/apache/commons/lang/TokenizerTest.java
  
  Index: TokenizerTest.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons/lang/src/test/org/apache/commons/lang/TokenizerTest.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TokenizerTest.java        17 Nov 2003 23:02:18 -0000      1.1
  +++ TokenizerTest.java        14 Feb 2004 00:31:55 -0000      1.2
  @@ -1,7 +1,7 @@
   /* ====================================================================
    * The Apache Software License, Version 1.1
    *
  - * Copyright (c) 2002-2003 The Apache Software Foundation.  All rights
  + * Copyright (c) 2003-2004 The Apache Software Foundation.  All rights
    * reserved.
    *
    * Redistribution and use in source and binary forms, with or without
  @@ -90,6 +90,9 @@
           String input = "a;b;c;\"d;\"\"e\";f; ; ;";
           Tokenizer tok = new Tokenizer(input);
           tok.setDelimiterChar(';');
  +        tok.setQuoteChar('"');
  +        tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
  +        tok.setIgnoreEmptyTokens(false);
           String tokens [] = tok.getAllTokens();
   
           String expected[] = new String[]
  @@ -120,7 +123,9 @@
           String input = "a;b;c ;\"d;\"\"e\";f; ; ;";
           Tokenizer tok = new Tokenizer(input);
           tok.setDelimiterChar(';');
  +        tok.setQuoteChar('"');
           tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
  +        tok.setIgnoreEmptyTokens(false);
           String tokens [] = tok.getAllTokens();
   
           String expected[] = new String[]
  @@ -151,7 +156,9 @@
           String input = "a;b; c;\"d;\"\"e\";f; ; ;";
           Tokenizer tok = new Tokenizer(input);
           tok.setDelimiterChar(';');
  +        tok.setQuoteChar('"');
           tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
  +        tok.setIgnoreEmptyTokens(false);
           String tokens [] = tok.getAllTokens();
   
           String expected[] = new String[]
  @@ -182,6 +189,8 @@
           String input = "a;b; c;\"d;\"\"e\";f; ; ;";
           Tokenizer tok = new Tokenizer(input);
           tok.setDelimiterChar(';');
  +        tok.setQuoteChar('"');
  +        tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
           tok.setIgnoreEmptyTokens(true);
           String tokens [] = tok.getAllTokens();
   
  @@ -210,6 +219,9 @@
           String input = "a;b; c;\"d;\"\"e\";f; ; ;";
           Tokenizer tok = new Tokenizer(input);
           tok.setDelimiterChar(';');
  +        tok.setQuoteChar('"');
  +        tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
  +        tok.setIgnoreEmptyTokens(false);
           tok.setEmptyTokenAsNull(true);
           String tokens [] = tok.getAllTokens();
   
  @@ -241,6 +253,9 @@
           String input = "a;b; c;\"d;\"\"e\";f; ; ;";
           Tokenizer tok = new Tokenizer(input);
           tok.setDelimiterChar(';');
  +        tok.setQuoteChar('"');
  +        tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
  +        tok.setIgnoreEmptyTokens(false);
   //        tok.setTreatingEmptyAsNull(true);
           String tokens [] = tok.getAllTokens();
   
  @@ -285,7 +300,8 @@
   
           String input = "a   b c \"d e\" f ";
           Tokenizer tok = new Tokenizer(input);
  -        tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
  +        tok.setDelimiterMatcher(Tokenizer.SPACE_MATCHER);
  +        tok.setQuoteMatcher(Tokenizer.DOUBLE_QUOTE_MATCHER);
           tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
           tok.setIgnoreEmptyTokens(false);
           String tokens [] = tok.getAllTokens();
  @@ -317,7 +333,8 @@
   
           String input = "a   b c \"d e\" f ";
           Tokenizer tok = new Tokenizer(input);
  -        tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
  +        tok.setDelimiterMatcher(Tokenizer.SPACE_MATCHER);
  +        tok.setQuoteMatcher(Tokenizer.DOUBLE_QUOTE_MATCHER);
           tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
           tok.setIgnoreEmptyTokens(true);
           String tokens [] = tok.getAllTokens();
  @@ -341,4 +358,120 @@
   
       }
   
  +    public void testBasic1() {
  +        String input = "a  b c";
  +        Tokenizer tok = new Tokenizer(input);
  +        assertEquals("a", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasic2() {
  +        String input = "a \nb\fc";
  +        Tokenizer tok = new Tokenizer(input);
  +        assertEquals("a", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasic3() {
  +        String input = "a \nb\u0001\fc";
  +        Tokenizer tok = new Tokenizer(input);
  +        assertEquals("a", tok.next());
  +        assertEquals("b\u0001", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasic4() {
  +        String input = "a \"b\" c";
  +        Tokenizer tok = new Tokenizer(input);
  +        assertEquals("a", tok.next());
  +        assertEquals("\"b\"", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasicQuoted1() {
  +        String input = "a \"b\" c";
  +        Tokenizer tok = new Tokenizer(input, ' ', '"');
  +        assertEquals("a", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasicDelim1() {
  +        String input = "a:b:c";
  +        Tokenizer tok = new Tokenizer(input, ':');
  +        assertEquals("a", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasicDelim2() {
  +        String input = "a:b:c";
  +        Tokenizer tok = new Tokenizer(input, ',');
  +        assertEquals("a:b:c", tok.next());
  +    }
  +    
  +    public void testBasicEmpty1() {
  +        String input = "a  b c";
  +        Tokenizer tok = new Tokenizer(input);
  +        tok.setIgnoreEmptyTokens(false);
  +        assertEquals("a", tok.next());
  +        assertEquals("", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testBasicEmpty2() {
  +        String input = "a  b c";
  +        Tokenizer tok = new Tokenizer(input);
  +        tok.setIgnoreEmptyTokens(false);
  +        tok.setEmptyTokenAsNull(true);
  +        assertEquals("a", tok.next());
  +        assertEquals(null, tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +    }
  +    
  +    public void testGetContent() {
  +        String input = "a   b c \"d e\" f ";
  +        Tokenizer tok = new Tokenizer(input);
  +        assertSame(input, tok.getContent());
  +        
  +        tok = new Tokenizer(input.toCharArray());
  +        assertEquals(input, tok.getContent());
  +    }
  +
  +    public void testReset() {
  +        String input = "a b c";
  +        Tokenizer tok = new Tokenizer(input);
  +        assertEquals("a", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +        tok.reset();
  +        assertEquals("a", tok.next());
  +        assertEquals("b", tok.next());
  +        assertEquals("c", tok.next());
  +        tok.reset("d e");
  +        assertEquals("d", tok.next());
  +        assertEquals("e", tok.next());
  +        tok.reset("f g".toCharArray());
  +        assertEquals("f", tok.next());
  +        assertEquals("g", tok.next());
  +    }
  +    
  +    public void testMatcher() {
  +        assertEquals(true, Tokenizer.SPACE_MATCHER.isMatch(' '));
  +        assertEquals(false, Tokenizer.SPACE_MATCHER.isMatch('\n'));
  +        assertEquals(false, Tokenizer.SPACE_MATCHER.isMatch('\u0001'));
  +        
  +        assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch(' '));
  +        assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch('\n'));
  +        assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch('\u0001'));
  +        
  +        assertEquals(true, Tokenizer.SPLIT_MATCHER.isMatch(' '));
  +        assertEquals(true, Tokenizer.SPLIT_MATCHER.isMatch('\n'));
  +        assertEquals(false, Tokenizer.SPLIT_MATCHER.isMatch('\u0001'));
  +    }
  +    
   }
  
  
  
  1.4       +247 -55   
jakarta-commons/lang/src/java/org/apache/commons/lang/Tokenizer.java
  
  Index: Tokenizer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons/lang/src/java/org/apache/commons/lang/Tokenizer.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- Tokenizer.java    13 Feb 2004 01:58:50 -0000      1.3
  +++ Tokenizer.java    14 Feb 2004 00:31:55 -0000      1.4
  @@ -1,7 +1,7 @@
   /* ====================================================================
    * The Apache Software License, Version 1.1
    *
  - * Copyright (c) 2002-2003 The Apache Software Foundation.  All rights
  + * Copyright (c) 2003-2004 The Apache Software Foundation.  All rights
    * reserved.
    *
    * Redistribution and use in source and binary forms, with or without
  @@ -62,9 +62,9 @@
    * Tokenizes a string based based on delimiters (separators)
    * and supporting quoting and ignored character concepts.
    * <p>
  - * This class can split a String into many smaller strings. It aims to do a
  - * similar job to java util StringTokenizer, however it offers much more
  - * control and flexibility.
  + * This class can split a String into many smaller strings.
  + * It aims to do a similar job to java util StringTokenizer, however it offers
  + * much more control and flexibility. By default, it is set up like StringTokenizer.
    * <p>
    * The input String is split into a number of <i>tokens</i>.
    * Each token is separated from the next String by a <i>delimiter</i>.
  @@ -73,22 +73,36 @@
    * The processing then strips all the <i>ignored</i> characters from each side of 
the token.
    * The token may also have <i>quotes</i> to mark an area not to be stripped or 
tokenized.
    * Empty tokens may be removed or returned as null.
  + * This example is based on the CSV tokenizer.
    * <pre>
  - * "a,b,c"       - Three tokens "a","b","c" (comma delimiter)
  - * "a, b , c"    - Three tokens "a","b","c" (ignored space characters stripped)
  + * "a,b,c"       - Three tokens "a","b","c"   (comma delimiter)
  + * "a, b , c"    - Three tokens "a","b","c"   (ignored space characters stripped)
    * "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
    * </pre>
    * <p>
  - * By default, this tokenizer has the following properties:
  - * <pre>
  - * Property                     Default
  - * ---------                    -------
  - * delimiter                    ,  (comma)
  - * quote                        "  (double quote)
  - * ignored                      char &lt;= 32 (as per trim)
  - * emptyTokenAsNull             false
  - * ignoreEmptyTokens            false
  - * </pre>
  + *
  + * This tokenizer has the following properties and options:
  + *
  + * <table>
  + *  <tr>
  + *   <th>Property</th><th>Type</th><th>Default</th>
  + *  </tr>
  + *  <tr>
  + *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
  + *  </tr>
  + *  <tr>
  + *   <td>quote</td><td>NoMatcher</td><td>{}</td>
  + *  </tr>
  + *  <tr>
  + *   <td>ignore</td><td>NoMatcher</td><td>{}</td>
  + *  </tr>
  + *  <tr>
  + *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
  + *  </tr>
  + *  <tr>
  + *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
  + *  </tr>
  + * </table>
    *
    * @author Matthew Inger
    * @author Stephen Colebourne
  @@ -96,17 +110,30 @@
    * @since 2.1
    * @version $Id$
    */
  -public class Tokenizer implements ListIterator {
  -    // TODO: Constructors
  -    // TODO: Tests
  -    // TODO: Static factories CSV/StringTokenizer
  -    
  +public class Tokenizer implements ListIterator, Cloneable {
  +
       /**
        * A Matcher which matches the comma character.
        * Best used for <code>delimiter</code>.
        */
       public static final Matcher COMMA_MATCHER = new CharMatcher(',');
       /**
  +     * A Matcher which matches the tab character.
  +     * Best used for <code>delimiter</code>.
  +     */
  +    public static final Matcher TAB_MATCHER = new CharMatcher('\t');
  +    /**
  +     * A Matcher which matches the space character.
  +     * Best used for <code>delimiter</code>.
  +     */
  +    public static final Matcher SPACE_MATCHER = new CharMatcher(' ');
  +    /**
  +     * A Matcher which matches the same characters as StringTokenizer,
  +     * namely space, tab, newline, carriage return, formfeed.
  +     * Best used for <code>delimiter</code>.
  +     */
  +    public static final Matcher SPLIT_MATCHER = new CharSetMatcher(" \t\n\r\f");
  +    /**
        * A Matcher which matches the double quote character.
        * Best used for <code>quote</code>.
        */
  @@ -115,98 +142,199 @@
        * A Matcher which matches the String trim() whitespace characters.
        * Best used for <code>ignored</code>.
        */
  -    public static final Matcher SPACES_MATCHER = new TrimMatcher();
  +    public static final Matcher TRIM_MATCHER = new TrimMatcher();
       /**
        * A Matcher that matches no characters. Don't use this for delimiters!
        * Best used for <code>ignored</code>.
        */
       public static final Matcher NONE_MATCHER = new NoMatcher();
  +    
  +    private static final Tokenizer CSV_TOKENIZER_PROTOTYPE;
  +    private static final Tokenizer TSV_TOKENIZER_PROTOTYPE;
  +
  +    static {
  +        CSV_TOKENIZER_PROTOTYPE = new Tokenizer(StringUtils.EMPTY);
  +        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER);
  +        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
  +        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
  +        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  +        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
  +
  +        TSV_TOKENIZER_PROTOTYPE = new Tokenizer(StringUtils.EMPTY);
  +        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER);
  +        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
  +        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
  +        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
  +        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
  +    }
   
       /** The text to work on */
       private char chars[];
  +    /** The input text, null if char[] input */
  +    private String text;
       /** The parsed tokens */
       private String tokens[];
       /** The current iteration position */
       private int tokenPos;
   
       /** The delimiter matcher */
  -    private Matcher delim = COMMA_MATCHER;
  +    private Matcher delim = SPLIT_MATCHER;
       /** The quote matcher */
  -    private Matcher quote = DOUBLE_QUOTE_MATCHER;
  +    private Matcher quote = NONE_MATCHER;
       /** The ignored matcher */
  -    private Matcher ignored = SPACES_MATCHER;
  +    private Matcher ignored = NONE_MATCHER;
       /** Whether to return empty tokens as null */
       private boolean emptyAsNull = false;
       /** Whether to ignore empty tokens */
  -    private boolean ignoreEmptyTokens = false;
  +    private boolean ignoreEmptyTokens = true;
  +
  +    //-----------------------------------------------------------------------
  +    /**
  +     * Get a tokenizer instance which parses Comma Separated Value
  +     * strings.  You must call a "reset" method to set the string which
  +     * you want to parse.
  +     */
  +    public static final Tokenizer getCSVInstance() {
  +        return (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
  +    }
  +
  +    /**
  +     * Get a tokenizer instance which parses Comma Separated Value
  +     * strings, initializing it with the given input.
  +     * 
  +     * @param input  the string to parse
  +     */
  +    public static final Tokenizer getCSVInstance(String input) {
  +        Tokenizer tok = (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
  +        tok.reset(input);
  +        return tok;
  +    }
  +
  +    /**
  +     * Get a tokenizer instance which parses Comma Separated Value
  +     * strings, initializing it with the given input.
  +     * 
  +     * @param input  the text to parse
  +     */
  +    public static final Tokenizer getCSVInstance(char[] input) {
  +        Tokenizer tok = (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
  +        tok.reset(input);
  +        return tok;
  +    }
  +
  +    /**
  +     * Get a tokenizer instance which parses Tab Separated Value
  +     * strings.  You must call a "reset" method to set the string which
  +     * you want to parse.
  +     */
  +    public static final Tokenizer getTSVInstance() {
  +        return (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
  +    }
  +
  +    /**
  +     * Get a tokenizer instance which parses Tab Separated Value
  +     * strings, initializing it with the given input.
  +     * 
  +     * @param input  the string to parse
  +     */
  +    public static final Tokenizer getTSVInstance(String input) {
  +        Tokenizer tok = (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
  +        tok.reset(input);
  +        return tok;
  +    }
  +
  +    /**
  +     * Get a tokenizer instance which parses Tab Separated Value
  +     * strings, initializing it with the given input.
  +     * 
  +     * @param input  the text to parse
  +     */
  +    public static final Tokenizer getTSVInstance(char[] input) {
  +        Tokenizer tok = (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
  +        tok.reset(input);
  +        return tok;
  +    }
   
       //-----------------------------------------------------------------------
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on space, tab, newline, carriage return
  +     * and formfeed, as per StringTokenizer.
        * 
        * @param input  the string which is to be parsed
        */
       public Tokenizer(String input) {
  -        this(input.toCharArray());
  +        super();
  +        this.text = input;
  +        this.chars = input.toCharArray();  // no clone as toCharArray() clones
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on the specified delimiter character,
  +     * leaving all other settings at their defaults.
        * 
        * @param input  the string which is to be parsed
        * @param delim  the field delimiter character
        */
       public Tokenizer(String input, char delim) {
  -        this(input.toCharArray(), delim);
  +        this(input);
  +        setDelimiterChar(delim);
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on the specified delimiter matcher,
  +     * leaving all other settings at their defaults.
        * 
        * @param input  the string which is to be parsed
        * @param delim  the field delimiter character
        */
       public Tokenizer(String input, CharSetMatcher delim) {
  -        this(input.toCharArray(), delim);
  +        this(input);
  +        setDelimiterMatcher(delim);
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on the specified delimiter character
  +     * and handling quoted text using the specified quote character.
        * 
        * @param input  the string which is to be parsed
        * @param delim  the field delimiter character
        * @param quote  the field quoted string character
        */
       public Tokenizer(String input, char delim, char quote) {
  -        this(input.toCharArray(), delim, quote);
  +        this(input, delim);
  +        setQuoteChar(quote);
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on the specified delimiter matcher
  +     * and handling quoted text using the specified quote matcher.
        * 
        * @param input  the string which is to be parsed
        * @param delim  the field delimiter character
        * @param quote  the field quoted string character
        */
       public Tokenizer(String input, CharSetMatcher delim, CharSetMatcher quote) {
  -        this(input.toCharArray(), delim, quote);
  +        this(input, delim);
  +        setQuoteMatcher(quote);
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on space, tab, newline, carriage return
  +     * and formfeed, as per StringTokenizer.
        * 
  -     * @param input  the string which is to be parsed
  +     * @param input  the string which is to be parsed, cloned
        */
       public Tokenizer(char[] input) {
           super();
  +        this.text = null;
           this.chars = (char[]) input.clone();
  -        this.tokenPos = 0;
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on space, tab, newline and formfeed
  +     * as per StringTokenizer.
        * 
  -     * @param input  the string which is to be parsed
  +     * @param input  the string which is to be parsed, cloned
        * @param delim the field delimiter character
        */
       public Tokenizer(char[] input, char delim) {
  @@ -215,9 +343,10 @@
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on space, tab, newline and formfeed
  +     * as per StringTokenizer.
        * 
  -     * @param input  the string which is to be parsed
  +     * @param input  the string which is to be parsed, cloned
        * @param delim  the field delimiter character
        */
       public Tokenizer(char[] input, CharSetMatcher delim) {
  @@ -226,9 +355,10 @@
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on space, tab, newline and formfeed
  +     * as per StringTokenizer.
        * 
  -     * @param input  the string which is to be parsed
  +     * @param input  the string which is to be parsed, cloned
        * @param delim  the field delimiter character
        * @param quote  the field quoted string character
        */
  @@ -238,9 +368,10 @@
       }
   
       /**
  -     * Constructor.
  +     * Constructs a tokenizer splitting on space, tab, newline and formfeed
  +     * as per StringTokenizer.
        * 
  -     * @param input  the string which is to be parsed
  +     * @param input  the string which is to be parsed, cloned
        * @param delim  the field delimiter character
        * @param quote  the field quoted string character
        */
  @@ -307,6 +438,32 @@
           tokens = null;
       }
   
  +    /**
  +     * Reset this tokenizer, giving it a new input string to parse.
  +     * In this manner you can re-use a tokenizer with the same settings
  +     * on multiple input lines.
  +     * 
  +     * @param input  the new string to tokenize
  +     */
  +    public void reset(String input) {
  +        reset();
  +        this.text = input;
  +        chars = input.toCharArray();  // no clone as toCharArray() clones
  +    }
  +
  +    /**
  +     * Reset this tokenizer, giving it a new input string to parse.
  +     * In this manner you can re-use a tokenizer with the same settings
  +     * on multiple input lines.
  +     * 
  +     * @param input  the new character array to tokenize, cloned
  +     */
  +    public void reset(char [] input) {
  +        reset();
  +        this.text = null;
  +        chars = (char[]) input.clone();
  +    }
  +
       // ListIterator
       //-----------------------------------------------------------------------
       /**
  @@ -473,15 +630,18 @@
           token.setLength(0);
           int len = chars.length;
   
  -        // skip all leading whitespace, unless it is the
  +        // Skip all leading whitespace, unless it is the
           // field delimiter or the quote character
  -        while (start < len &&
  -                ignored.isMatch(chars[start]) &&
  -                !delim.isMatch(chars[start]) &&
  -                !quote.isMatch(chars[start])) {
  -            start++;
  +        int current = start;
  +        while (current < len &&
  +                ignored.isMatch(chars[current]) &&
  +                !delim.isMatch(chars[current]) &&
  +                !quote.isMatch(chars[current])) {
  +            current++;
           }
   
  +        start = current;
  +
           // Read the token depending on what the first
           // character is like
           if (delim.isMatch(chars[start])) {
  @@ -763,6 +923,36 @@
           this.ignoreEmptyTokens = ignoreEmptyTokens;
       }
   
  +    //-----------------------------------------------------------------------
  +    /**
  +     * Gets the String content that the tokenizer is parsing.
  +     * 
  +     * @return the string content being parsed
  +     */
  +    public String getContent() {
  +        if (text == null) {
  +            text = new String(chars);
  +        }
  +        return text;
  +    }
  +    
  +    //-----------------------------------------------------------------------
  +    /**
  +     * Create a new instance of this Tokenizer.
  +     * The new instance is reset so that it will be at the start of the token list.
  +     */
  +    public Object clone() {
  +        try {
  +            Tokenizer cloned = (Tokenizer) super.clone();
  +            // chars[] does not need additional clone as it is treated as immutable
  +            cloned.reset();
  +            return cloned;
  +            
  +        } catch (CloneNotSupportedException ex) {
  +            return null;
  +        }
  +    }
  +
       //-----------------------------------------------------------------------    
       /**
        * Class used to define a set of characters for matching purposes.
  @@ -801,7 +991,9 @@
            * @param chars  the characters to match, must not be null
            */
           public CharSetMatcher(String chars) {
  -            this(chars.toCharArray());
  +            super();
  +            this.chars = chars.toCharArray();
  +            Arrays.sort(this.chars);
           }
   
           /**
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-commons/lang/src/java/org/apache/commons/lang Tokenizer.java

Reply via email to