cutting 02/01/24 11:02:52
Modified: src/java/org/apache/lucene/analysis LetterTokenizer.java
LowerCaseTokenizer.java
Added: src/java/org/apache/lucene/analysis CharTokenizer.java
WhitespaceAnalyzer.java WhitespaceTokenizer.java
Removed: src/java/org/apache/lucene/analysis NullAnalyzer.java
NullTokenizer.java
Log:
Renamed NullTokenizer and NullAnalyzer to WhitespaceTokenizer and
WhitespaceAnalyzer. Also re-structured the implementation of several
tokenizers so that they share code, basing them on the new class CharTokenizer.
Revision Changes Path
1.2 +7 -44
jakarta-lucene/src/java/org/apache/lucene/analysis/LetterTokenizer.java
Index: LetterTokenizer.java
===================================================================
RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/LetterTokenizer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- LetterTokenizer.java 18 Sep 2001 16:29:49 -0000 1.1
+++ LetterTokenizer.java 24 Jan 2002 19:02:52 -0000 1.2
@@ -63,52 +63,15 @@
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
-public final class LetterTokenizer extends Tokenizer {
+public class LetterTokenizer extends CharTokenizer {
+ /** Construct a new LetterTokenizer. */
public LetterTokenizer(Reader in) {
- input = in;
+ super(in);
}
- private int offset = 0, bufferIndex=0, dataLen=0;
- private final static int MAX_WORD_LEN = 255;
- private final static int IO_BUFFER_SIZE = 1024;
- private final char[] buffer = new char[MAX_WORD_LEN];
- private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
- public final Token next() throws java.io.IOException {
- int length = 0;
- int start = offset;
- while (true) {
- final char c;
-
- offset++;
- if (bufferIndex >= dataLen) {
- dataLen = input.read(ioBuffer);
- bufferIndex = 0;
- };
- if (dataLen == -1) {
- if (length > 0)
- break;
- else
- return null;
- }
- else
- c = (char) ioBuffer[bufferIndex++];
-
- if (Character.isLetter(c)) { // if it's a letter
-
- if (length == 0) // start of token
- start = offset-1;
-
- buffer[length++] = c; // buffer it
-
- if (length == MAX_WORD_LEN) // buffer overflow!
- break;
-
- } else if (length > 0) // at non-Letter w/ chars
- break; // return 'em
-
- }
-
- return new Token(new String(buffer, 0, length), start, start+length);
+ /** Collects only characters which satisfy
+ * {@link Character#isLetter(char)}.*/
+ protected boolean isTokenChar(char c) {
+ return Character.isLetter(c);
}
}
1.2 +7 -44
jakarta-lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
Index: LowerCaseTokenizer.java
===================================================================
RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- LowerCaseTokenizer.java 18 Sep 2001 16:29:50 -0000 1.1
+++ LowerCaseTokenizer.java 24 Jan 2002 19:02:52 -0000 1.2
@@ -65,52 +65,15 @@
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
-public final class LowerCaseTokenizer extends Tokenizer {
+public final class LowerCaseTokenizer extends LetterTokenizer {
+ /** Construct a new LowerCaseTokenizer. */
public LowerCaseTokenizer(Reader in) {
- input = in;
+ super(in);
}
- private int offset = 0, bufferIndex=0, dataLen=0;
- private final static int MAX_WORD_LEN = 255;
- private final static int IO_BUFFER_SIZE = 1024;
- private final char[] buffer = new char[MAX_WORD_LEN];
- private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
- public final Token next() throws java.io.IOException {
- int length = 0;
- int start = offset;
- while (true) {
- final char c;
-
- offset++;
- if (bufferIndex >= dataLen) {
- dataLen = input.read(ioBuffer);
- bufferIndex = 0;
- };
- if (dataLen == -1) {
- if (length > 0)
- break;
- else
- return null;
- }
- else
- c = (char) ioBuffer[bufferIndex++];
-
- if (Character.isLetter(c)) { // if it's a letter
-
- if (length == 0) // start of token
- start = offset-1;
-
- buffer[length++] = Character.toLowerCase(c);
- // buffer it
- if (length == MAX_WORD_LEN) // buffer overflow!
- break;
-
- } else if (length > 0) // at non-Letter w/ chars
- break; // return 'em
-
- }
-
- return new Token(new String(buffer, 0, length), start, start+length);
+ /** Converts each token character to lower case using
+ * {@link Character#toLowerCase(char)}.*/
+ protected char normalize(char c) {
+ return Character.toLowerCase(c);
}
}
1.1
jakarta-lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java
Index: CharTokenizer.java
===================================================================
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact [EMAIL PROTECTED]
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/**
 * Abstract base class for simple tokenizers that decide, one character at a
 * time, whether that character belongs to a token. Subclasses supply the
 * predicate via {@link #isTokenChar(char)} and may optionally rewrite each
 * accepted character via {@link #normalize(char)}.
 */
public abstract class CharTokenizer extends Tokenizer {

  /** Largest number of characters collected into a single token. */
  private static final int MAX_WORD_LEN = 255;

  /** Number of characters requested from the reader per read call. */
  private static final int IO_BUFFER_SIZE = 1024;

  /** Incremented once per pass through the scanning loop in {@link #next()}. */
  private int offset = 0;

  /** Index of the next unread character in {@code ioBuffer}. */
  private int bufferIndex = 0;

  /** Result of the most recent read: characters available, or -1 at end of input. */
  private int dataLen = 0;

  /** Accumulates the (normalized) characters of the token being built. */
  private final char[] buffer = new char[MAX_WORD_LEN];

  /** Holds the raw characters most recently read from the input. */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /**
   * Creates a tokenizer that reads its characters from the given reader.
   *
   * @param input the source of characters to tokenize
   */
  public CharTokenizer(Reader input) {
    this.input = input;
  }

  /**
   * Decides whether a character is part of a token. Maximal runs of
   * characters for which this returns true become tokens; characters for
   * which it returns false separate tokens and are discarded.
   *
   * @param c the character to test
   * @return true if {@code c} belongs inside a token
   */
  protected abstract boolean isTokenChar(char c);

  /**
   * Hook applied to every token character before it is stored. This default
   * returns the character unchanged; a subclass can override it, for
   * example, to lowercase tokens.
   *
   * @param c a character accepted by {@link #isTokenChar(char)}
   * @return the character to store in the token
   */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Scans forward for the next token.
   *
   * @return the next token, or null once the input is exhausted and no
   *         token characters are pending
   * @throws java.io.IOException if reading from the input fails
   */
  public final Token next() throws java.io.IOException {
    int length = 0;
    int start = offset;

    for (;;) {
      offset++;

      // Refill the I/O buffer once everything in it has been consumed.
      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }

      // End of input: emit whatever has been collected, or signal the end.
      if (dataLen == -1) {
        if (length == 0) {
          return null;
        }
        break;
      }

      final char c = ioBuffer[bufferIndex++];

      if (!isTokenChar(c)) {
        if (length > 0) {
          break;                      // a separator ends the current token
        }
        continue;                     // still skipping leading separators
      }

      if (length == 0) {
        start = offset - 1;           // position of the token's first character
      }
      buffer[length++] = normalize(c);

      if (length == MAX_WORD_LEN) {
        break;                        // token buffer is full; emit what we have
      }
    }

    // The last two arguments are presumably the token's start and end
    // offsets -- confirm against the Token constructor.
    return new Token(new String(buffer, 0, length), start, start + length);
  }
}
1.1
jakarta-lucene/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
Index: WhitespaceAnalyzer.java
===================================================================
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact [EMAIL PROTECTED]
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** An Analyzer whose token streams are produced by a WhitespaceTokenizer. */
public final class WhitespaceAnalyzer extends Analyzer {

  /**
   * Builds a token stream over the supplied reader.
   *
   * @param fieldName name of the field being analyzed; not used here
   * @param reader the source of the text to tokenize
   * @return a new WhitespaceTokenizer reading from {@code reader}
   */
  public final TokenStream tokenStream(String fieldName, Reader reader) {
    final TokenStream stream = new WhitespaceTokenizer(reader);
    return stream;
  }
}
1.1
jakarta-lucene/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
Index: WhitespaceTokenizer.java
===================================================================
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact [EMAIL PROTECTED]
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/**
 * A tokenizer that splits text at whitespace: every maximal run of
 * non-whitespace characters becomes one token.
 */
public class WhitespaceTokenizer extends CharTokenizer {

  /**
   * Constructs a new WhitespaceTokenizer.
   *
   * @param in the reader handed to the superclass constructor
   */
  public WhitespaceTokenizer(Reader in) {
    super(in);
  }

  /**
   * Accepts every character that does not satisfy
   * {@link Character#isWhitespace(char)}.
   *
   * @param c the character to test
   * @return false for whitespace characters, true for all others
   */
  protected boolean isTokenChar(char c) {
    final boolean whitespace = Character.isWhitespace(c);
    return !whitespace;
  }
}
--
To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>