[Nutch-general] RE: Nutch does not use stemmers?

Howie Wang Tue, 21 Jun 2005 13:07:20 -0700

I've gotten a couple of questions off-list about stemming,
so I thought I'd just post my changes here. Sorry that
some of the changes are in the main code and not in a plugin; it
seemed more efficient to put them in the main analyzer. It
would be nice if later releases could add support for plugging
in a custom stemmer/analyzer.


The first change I made is in NutchDocumentAnalyzer.java.

Import the following classes at the top of the file:
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;

Change tokenStream to:

  /**
   * Builds the token stream for a document field.  Every field is tokenized
   * and passed through the common-grams filter; the "content" and "title"
   * fields are additionally lower-cased and Porter-stemmed.
   */
  public TokenStream tokenStream(String field, Reader reader) {
    TokenStream grams =
        CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field);

    boolean stemmedField = field.equals("content") || field.equals("title");
    if (!stemmedField) {
      return grams;                     // all other fields stay unstemmed
    }

    return new PorterStemFilter(new LowerCaseFilter(grams));
  }

The second change is in CommonGrams.java.
Import the following classes near the top:

import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;

In optimizePhrase, after this line:

  TokenStream ts = getFilter(new ArrayTokens(phrase), field);

Add:

  ts = new PorterStemFilter(new LowerCaseFilter(ts));

And the rest is a new QueryFilter plugin that I'm calling query-stemmer.
Here's the full source for the Java file. You can copy the build.xml
and plugin.xml from query-basic, and alter the names for query-stemmer.

/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package org.apache.nutch.searcher.stemmer;

import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;

import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;

import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;

import java.io.IOException;
import java.util.HashSet;
import java.io.StringReader;

/**
 * Query filter that expands clauses in the default query field into searches
 * over the url, anchor, content and title document fields, stemming single
 * terms with the Porter stemmer and passing phrases through
 * {@link CommonGrams#optimizePhrase}.
 *
 * <p>The boost and slop settings are plain static state shared by every
 * instance; no synchronization is performed here.</p>
 */
public class StemmerQueryFilter implements QueryFilter {

  private static final java.util.logging.Logger LOG =
      java.util.logging.Logger.getLogger(StemmerQueryFilter.class.getName());

  /** Document fields searched for each default-field clause. */
  private static final String[] FIELDS = {"url", "anchor", "content", "title"};

  /** Positions of the fields whose boosts can be changed through setters. */
  private static final int URL_INDEX = 0;
  private static final int ANCHOR_INDEX = 1;

  /**
   * Per-field boosts, parallel to {@link #FIELDS}.  The setters write into
   * this array directly: keeping separate boost variables that are only
   * copied in at class initialisation would make the setters have no effect.
   */
  private static final float[] FIELD_BOOSTS = {4.0f, 2.0f, 1.0f, 2.0f};

  private static int SLOP = Integer.MAX_VALUE;
  private static float PHRASE_BOOST = 1.0f;

  /** Set the boost factor for url matches, relative to the other fields. */
  public static void setUrlBoost(float boost) {
    FIELD_BOOSTS[URL_INDEX] = boost;
  }

  /**
   * Set the boost factor for anchor matches, relative to the other fields.
   * The title boost is not affected by this setter.
   */
  public static void setAnchorBoost(float boost) {
    FIELD_BOOSTS[ANCHOR_INDEX] = boost;
  }

  /** Set the boost factor for sloppy phrase matches relative to unordered
   * term matches. */
  public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }

  /** Set the maximum number of terms permitted between matching terms in a
   * sloppy phrase match. */
  public static void setSlop(int slop) { SLOP = slop; }

  /**
   * Adds the per-field term/phrase clauses and the sloppy phrase clauses for
   * <code>input</code> to <code>output</code>.
   *
   * @param input  the parsed Nutch query
   * @param output the Lucene query being built; modified in place
   * @return <code>output</code>
   */
  public BooleanQuery filter(Query input, BooleanQuery output) {
    addTerms(input, output);
    addSloppyPhrases(input, output);
    return output;
  }

  /**
   * For every default-field clause, adds one boolean sub-query that matches
   * the clause in any of {@link #FIELDS}.  Single terms are stemmed; phrases
   * are optimized per field by {@link CommonGrams#optimizePhrase}.
   */
  private static void addTerms(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];

      if (!c.getField().equals(Clause.DEFAULT_FIELD))
        continue;                                 // skip non-default fields

      // TODO: I'm a little nervous about stemming for all default fields.
      //       Should keep an eye on this.

      // Stemming a single term does not depend on the field, so do it once
      // per clause rather than once per field.
      String[] stemmedTerm =
          c.isPhrase() ? null : getStemmedWords(c.getTerm().toString());

      BooleanQuery out = new BooleanQuery();
      for (int f = 0; f < FIELDS.length; f++) {
        String[] words = c.isPhrase()
            ? CommonGrams.optimizePhrase(c.getPhrase(), FIELDS[f])
            : stemmedTerm;

        // One resulting word is searched as a term, anything else as an
        // exact phrase.
        org.apache.lucene.search.Query fieldQuery = (words.length == 1)
            ? termQuery(FIELDS[f], new Term(words[0]), FIELD_BOOSTS[f])
            : exactPhrase(new Phrase(words), FIELDS[f], FIELD_BOOSTS[f]);

        out.add(fieldQuery, false, false);
      }
      output.add(out, c.isRequired(), c.isProhibited());
    }

    if (LOG.isLoggable(java.util.logging.Level.FINE)) {
      LOG.fine("query = " + output.toString());
    }
  }

  /**
   * Lower-cases, tokenizes and Porter-stems <code>value</code>.
   *
   * <p>LowerCaseTokenizer divides text at non-letters, so a value containing
   * digits or punctuation can yield several tokens, or none.  When no token
   * is produced, or tokenizing fails, the original value (split on
   * whitespace) is returned instead.</p>
   *
   * @param value the raw query term text
   * @return the stemmed tokens, or the whitespace-split original value as a
   *         fallback
   */
  private static String[] getStemmedWords(String value) {
    TokenStream ts =
        new PorterStemFilter(new LowerCaseTokenizer(new StringReader(value)));

    java.util.ArrayList stems = new java.util.ArrayList();
    try {
      for (Token token = ts.next(); token != null; token = ts.next()) {
        stems.add(token.termText());
      }
    } catch (IOException e) {
      // Best effort: fall back to the unstemmed value, but leave a trace.
      LOG.log(java.util.logging.Level.WARNING,
              "Stemming failed for '" + value + "', using unstemmed term", e);
      stems.clear();
    }

    if (stems.isEmpty()) {
      return value.split("\\s+");
    }

    if (LOG.isLoggable(java.util.logging.Level.FINE)) {
      LOG.fine("stemmedValues = " + stems);
    }
    return (String[]) stems.toArray(new String[stems.size()]);
  }

  /**
   * For each field, adds an optional sloppy phrase made of all non-phrase,
   * non-prohibited default-field terms, provided there is more than one.
   *
   * NOTE(review): the terms added here are NOT stemmed, unlike those in
   * addTerms.  If the searched field is indexed with stemming this clause
   * presumably stops matching for words the stemmer changes - confirm
   * against the analyzer configuration before altering.
   */
  private static void addSloppyPhrases(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int f = 0; f < FIELDS.length; f++) {

      PhraseQuery sloppyPhrase = new PhraseQuery();
      sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
      sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
                           ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
                           : SLOP);
      int sloppyTerms = 0;

      for (int i = 0; i < clauses.length; i++) {
        Clause c = clauses[i];

        if (!c.getField().equals(Clause.DEFAULT_FIELD))
          continue;                               // skip non-default fields

        if (c.isPhrase())                         // skip exact phrases
          continue;

        if (c.isProhibited())                     // skip prohibited terms
          continue;

        sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
        sloppyTerms++;
      }

      if (sloppyTerms > 1)
        output.add(sloppyPhrase, false, false);
    }
  }

  /** Utility to construct a boosted Lucene term query for a Nutch term. */
  private static org.apache.lucene.search.Query
        termQuery(String field, Term term, float boost) {
    TermQuery result = new TermQuery(luceneTerm(field, term));
    result.setBoost(boost);
    return result;
  }

  /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
  private static org.apache.lucene.search.Query
        exactPhrase(Phrase nutchPhrase, String field, float boost) {
    Term[] terms = nutchPhrase.getTerms();
    PhraseQuery exactPhrase = new PhraseQuery();
    for (int i = 0; i < terms.length; i++) {
      exactPhrase.add(luceneTerm(field, terms[i]));
    }
    exactPhrase.setBoost(boost);
    return exactPhrase;
  }

  /** Utility to construct a Lucene Term given a Nutch query term and field. */
  private static org.apache.lucene.index.Term luceneTerm(String field,
                                                         Term term) {
    return new org.apache.lucene.index.Term(field, term.toString());
  }
}




-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
Nutch-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-general

[Nutch-general] RE: Nutch does not use stemmers?

Reply via email to