This would be useful for implementing an N-gram filter. I'd support adding something like this to the Lucene core.

Doug

Yonik Seeley wrote:
Just brainstorming...
Here's a completely untested prototype of what a BufferingTokenStream
might look like, and a possible implementation of removing duplicate
tokens on top of it.


-Yonik



/**
 * Removes duplicate tokens that occur at the same position: for each head
 * token, the run of following tokens with positionIncrement == 0 is scanned,
 * and any whose term text matches a token already emitted for this position
 * is dropped.
 */
class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
  public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}

  /**
   * @param t the head token of the current position (non-zero increment)
   * @return t unchanged; same-position duplicates are filtered from the
   *         buffered output stream as a side effect
   */
  public Token process(Token t) throws IOException {
    Token tok = read();
    while (tok!=null && tok.getPositionIncrement()==0) {
      // Duplicate if it matches the head token t (which, per the
      // BufferedTokenStream contract, sits at the head of the output
      // position) or any token already written for this position.
      boolean dup = t.termText().equals(tok.termText());
      if (!dup) {
        for (Token outTok : output()) {
          if (outTok.termText().equals(tok.termText())) {
            dup=true;
            break;
          }
        }
      }
      if (!dup) write(tok);
      // BUGFIX: the original never advanced tok here, so the loop spun
      // forever on the first zero-increment token.
      tok = read();
    }
    // First token with a non-zero increment begins the next position;
    // push it back so it is processed on the next call.
    if (tok != null) pushBack(tok);
    return t;
  }
}


/**
* Handles input and output buffering of TokenStream
*
*
* // Example of a class implementing the rule "A" "B" => "Q"
* class MyTokenStream extends BufferedTokenStream {
* public MyTokenStream(TokenStream input) {super(input);}
* public Token process(Token t) throws IOException {
*   if ("A".equals(t.termText())) {
*     Token t2 = read();
*     if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
*     if (t2!=null) pushBack(t2);
*   }
*   return t;
* }
*}
*
* <pre>
* // Example of a class implementing "A" "B" => "A" "A" "B"
*class MyTokenStream extends BufferedTokenStream {
*  public MyTokenStream(TokenStream input) {super(input);}
*  public Token process(Token t) throws IOException {
* if ("A".equals(t.termText()) && "B".equals(peek(1).termText()) write(t);
*    return t;
*  }
*}
* </pre>
*
*
* @author yonik
* @version $Id$
*/
abstract class BufferedTokenStream extends TokenStream {
 // in the futute, might be faster if we implemented as an array
based CircularQueue
 private final LinkedList<Token> inQueue = new LinkedList<Token>();
 private final LinkedList<Token> outQueue = new LinkedList<Token>();
 private final TokenStream input;

 public BufferedTokenStream(TokenStream input) {
   this.input = input;
 }

/** Process a token. Subclasses may read more tokens from the input stream,
  * write more tokens to the output stream, or simply return the next token
* to be output. Subclasses may return null if the token is to be dropped.
  * If a subclass writes tokens to the output stream and returns a
non-null Token,
  * the returned Token is considered to be at the head of the token
output stream.
  */
 public abstract Token process(Token t) throws IOException;

 public final Token next() throws IOException {
   while (true) {
     if (!outQueue.isEmpty()) return outQueue.removeFirst();
     Token t = inQueue.isEmpty() ? input.next() : inQueue.removeFirst();
     if (t==null) return null;
     Token out = process(t);
     if (out!=null) return out;
// loop back to top in case process() put something on the output queue
   }
 }

 /** read a token from the input stream */
 public Token read() throws IOException {
   if (inQueue.size()==0) {
     Token t = input.next();
     return t;
   }
   return inQueue.getFirst();
 }

 /** push a token back into the input stream */
 public void pushBack(Token t) {
   inQueue.addFirst(t);
 }

 /** peek n tokens ahead in the stream (1 based... 0 is invalid) */
 public Token peek(int n) throws IOException {
   int fillCount = n-inQueue.size();
   for (int i=0; i<fillCount; i++) {
     Token t = input.next();
     if (t==null) return null;
     inQueue.add(t);
   }
   return inQueue.get(n-1);
 }

 /** write a token to the output stream */
 public void write(Token t) {
   outQueue.add(t);
 }

 Iterable<Token> output() {
   return outQueue;
 }

}

Reply via email to