This would be useful for implementing an N-gram filter. I'd support
adding something like this to the Lucene core.
Doug
Yonik Seeley wrote:
Just brainstorming...
Here's a completely untested prototype of what a BufferingTokenStream
might look like, and a possible implementation of removing duplicate
tokens on top of it.
-Yonik
/**
 * Removes duplicate tokens that occur at the same position
 * (positionIncrement == 0) as an already-queued token with the same
 * term text.
 */
class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
  public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}

  /**
   * Consumes every token sharing the position of {@code t}
   * (positionIncrement == 0), writing each to the output queue unless
   * its term text duplicates a token already in the output queue.
   * The first token at the next position is pushed back unread.
   *
   * NOTE(review): tokens are only compared against output(); they are
   * never compared against {@code t} itself, so a duplicate of the head
   * token is kept — confirm whether that is intended.
   *
   * @param t the head token of the current position
   * @return {@code t}, which remains the head of the output stream
   */
  public Token process(Token t) throws IOException {
    Token tok = read();
    while (tok != null && tok.getPositionIncrement() == 0) {
      boolean dup = false;
      for (Token outTok : output()) {
        if (outTok.termText().equals(tok.termText())) {
          dup = true;
          break;
        }
      }
      if (!dup) write(tok);
      // BUG FIX: the original never advanced `tok`, so the loop spun
      // forever on the first zero-increment token.
      tok = read();
    }
    if (tok != null) pushBack(tok);
    return t;
  }
}
/**
 * Handles input and output buffering of a TokenStream.
 *
 * <pre>
 * // Example of a class implementing the rule "A" "B" => "Q"
 * class MyTokenStream extends BufferedTokenStream {
 *   public MyTokenStream(TokenStream input) {super(input);}
 *   public Token process(Token t) throws IOException {
 *     if ("A".equals(t.termText())) {
 *       Token t2 = read();
 *       if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
 *       if (t2!=null) pushBack(t2);
 *     }
 *     return t;
 *   }
 * }
 * </pre>
 *
 * <pre>
 * // Example of a class implementing "A" "B" => "A" "A" "B"
 * class MyTokenStream extends BufferedTokenStream {
 *   public MyTokenStream(TokenStream input) {super(input);}
 *   public Token process(Token t) throws IOException {
 *     if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
 *       write(t);
 *     return t;
 *   }
 * }
 * </pre>
 *
 * @author yonik
 * @version $Id$
 */
abstract class BufferedTokenStream extends TokenStream {
// in the futute, might be faster if we implemented as an array
based CircularQueue
private final LinkedList<Token> inQueue = new LinkedList<Token>();
private final LinkedList<Token> outQueue = new LinkedList<Token>();
private final TokenStream input;
public BufferedTokenStream(TokenStream input) {
this.input = input;
}
/** Process a token. Subclasses may read more tokens from the input
stream,
* write more tokens to the output stream, or simply return the next token
* to be output. Subclasses may return null if the token is to be
dropped.
* If a subclass writes tokens to the output stream and returns a
non-null Token,
* the returned Token is considered to be at the head of the token
output stream.
*/
public abstract Token process(Token t) throws IOException;
public final Token next() throws IOException {
while (true) {
if (!outQueue.isEmpty()) return outQueue.removeFirst();
Token t = inQueue.isEmpty() ? input.next() : inQueue.removeFirst();
if (t==null) return null;
Token out = process(t);
if (out!=null) return out;
// loop back to top in case process() put something on the output
queue
}
}
/** read a token from the input stream */
public Token read() throws IOException {
if (inQueue.size()==0) {
Token t = input.next();
return t;
}
return inQueue.getFirst();
}
/** push a token back into the input stream */
public void pushBack(Token t) {
inQueue.addFirst(t);
}
/** peek n tokens ahead in the stream (1 based... 0 is invalid) */
public Token peek(int n) throws IOException {
int fillCount = n-inQueue.size();
for (int i=0; i<fillCount; i++) {
Token t = input.next();
if (t==null) return null;
inQueue.add(t);
}
return inQueue.get(n-1);
}
/** write a token to the output stream */
public void write(Token t) {
outQueue.add(t);
}
Iterable<Token> output() {
return outQueue;
}
}