Howdy... Not sure if anyone else wants this, but here is my first attempt at writing an analyzer for email addresses. Modifications, updates, and fixes are welcome.

-------------- EmailAnalyzer

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer intended for fields that hold email addresses.
 *
 * The reader is tokenized with a StandardTokenizer, the tokens are passed
 * through a LowerCaseFilter, and finally through an EmailFilter.
 */
public class EmailAnalyzer extends Analyzer {

   /** Creates the analyzer; there is no state to set up. */
   public EmailAnalyzer() {
   }

   /**
    * Builds the token stream for the given reader.
    *
    * @param fieldName name of the field being analyzed (not consulted here)
    * @param reader    source of the text to analyze
    * @return the EmailFilter stream wrapping the lower-cased standard tokens
    */
   public TokenStream tokenStream(String fieldName, Reader reader) {
       // Build the chain one stage at a time, innermost first.
       TokenStream tokenized = new StandardTokenizer(reader);
       TokenStream lowerCased = new LowerCaseFilter(tokenized);
       return new EmailFilter(lowerCased);
   }
}

-------------- end EmailAnalyzer

-------------- EmailFilter

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Stack;

/**
 * Token filter that returns every input token unchanged and, when the token
 * text contains an '@', additionally emits one token per part of the email
 * address (user name, host name, host name pieces, and domain).
 *
 * The extra tokens share the offsets of the original token, have type
 * {@link #TOKEN_TYPE_EMAIL} and a position increment of 0, so they occupy the
 * same position as the token they were derived from.
 *
 * Tokens whose text is not of the form user@host are passed through without
 * generating any extra tokens.
 */
public class EmailFilter extends TokenFilter {
   /** Token type assigned to every extra token generated from an email address. */
   public static final String TOKEN_TYPE_EMAIL = "EMAILPART";

   /**
    * Generated tokens waiting to be handed out by later calls to next().
    * Being a stack, the parts come out in the reverse of the order in which
    * getEmailParts listed them.
    */
   private Stack emailTokenStack;

   /**
    * @param in the token stream to read the original tokens from
    */
   public EmailFilter(TokenStream in) {
       super(in);
       emailTokenStack = new Stack();
   }

   /**
    * Returns the next token: first any pending email-part tokens, otherwise
    * the next token from the wrapped stream (after queuing its email parts).
    *
    * @return the next token, or null when the wrapped stream is exhausted
    * @throws IOException if the wrapped stream fails
    */
   public Token next() throws IOException {
       // Drain the parts of the previously returned token first.
       if (!emailTokenStack.isEmpty()) {
           return (Token) emailTokenStack.pop();
       }

       Token token = input.next();
       if (token == null) {
           return null;
       }

       addEmailPartsToStack(token);

       return token;
   }

   /**
    * Pushes one zero-increment token per email part of the given token onto
    * the pending stack. Does nothing when the token is not an email address.
    */
   private void addEmailPartsToStack(Token token) throws IOException {
       String[] parts = getEmailParts(token.termText());

       // null means the token text is not an email address
       if (parts == null) {
           return;
       }

       for (int i = 0; i < parts.length; i++) {
           Token partToken = new Token(parts[i],
                                       token.startOffset(),
                                       token.endOffset(),
                                       TOKEN_TYPE_EMAIL);
           // same position as the original token
           partToken.setPositionIncrement(0);

           emailTokenStack.push(partToken);
       }
   }

   /**
    * Parses an email address into its parts for tokenization.
    * For example john@mail.foo.com is broken into
    *
    *    [john]
    *    [mail.foo.com]
    *    [mail]
    *    [foo]
    *    [com]
    *    [foo.com]
    *
    * The full address itself is not part of the result; it is the original
    * token that next() returns.
    *
    * @param email the token text to examine
    * @return the parts, without empty or repeated entries, or null when the
    *         text is null or does not contain both a user and a host
    *         separated by '@'
    */
   private String[] getEmailParts(String email) {
       if (email == null) {
           return null;
       }

       // split on the @; anything that is not user@host is not an email
       String[] splitOnAt = email.split("@");
       if (splitOnAt.length < 2) {
           return null;
       }
       String userName = splitOnAt[0];
       String hostName = splitOnAt[1];

       // collected here so they can be turned into an array at the end
       ArrayList partsList = new ArrayList();

       addPart(partsList, userName);
       addPart(partsList, hostName);

       // split the host name into pieces and add each of them
       String[] splitOnDot = hostName.split("\\.");
       for (int i = 0; i < splitOnDot.length; i++) {
           addPart(partsList, splitOnDot[i]);
       }

       /*
        * If there are more than 2 pieces the host has a sub-domain, so also
        * add the domain name, which should be the last two pieces.
        */
       if (splitOnDot.length > 2) {
           String domain = splitOnDot[splitOnDot.length - 2] + "."
                   + splitOnDot[splitOnDot.length - 1];
           addPart(partsList, domain);
       }

       return (String[]) partsList.toArray(new String[partsList.size()]);
   }

   /**
    * Appends part to the list unless it is empty or already present, so no
    * empty or duplicate tokens are generated for one address.
    */
   private void addPart(ArrayList partsList, String part) {
       if (part.length() > 0 && !partsList.contains(part)) {
           partsList.add(part);
       }
   }

}

------------ end EmailFilter

Let me know...
-Michael

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to