I haven't had a chance to do much on this lately (BigMultiTermScorer),
so here is some code I had sitting around — unfinished and untested —
but it may stimulate discussion on the direction.
-Yonik
package org.apache.lucene.search;
import org.apache.lucene.index.*;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.util.BitSet;
import junit.framework.TestCase;
class TestWildcardQuery2 extends TestCase {
  /**
   * Populates the index with {@code num} documents whose "id" field cycles
   * through the values {@code 1..range-1, 0}, so each id value appears in
   * roughly {@code num/range} documents.
   *
   * @param writer open writer to add documents to (caller closes it)
   * @param num    total number of documents to add
   * @param range  number of distinct id values to cycle through
   * @throws IOException if the writer fails to add a document
   */
  void addDocs(IndexWriter writer, int num, int range) throws IOException {
    int id = 0;
    for (int i = 0; i < num; i++) {
      if (++id >= range) id = 0;
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(id), Field.Store.NO,
                        Field.Index.UN_TOKENIZED));
      // BUG FIX: the document was built but never added to the index,
      // leaving the index empty.
      writer.addDocument(doc);
    }
  }

  /**
   * Builds a fresh in-memory index of {@code size} documents.
   *
   * @param size number of documents (and distinct id values) to index
   * @return the populated directory
   * @throws IOException on index-write failure
   */
  Directory getIndex(int size) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    // BUG FIX: honor the requested size instead of the hard-coded 1000.
    addDocs(writer, size, size);
    writer.close();
    return dir;
  }

  public void testScore() {
    // TODO(review): unfinished — should build an index via getIndex() and
    // verify WildcardQuery2 scoring once the scorer is finalized.
  }
}
/**
 * A wildcard query that scores its term expansion with
 * {@link BigMultiTermScorer}, avoiding BooleanQuery's maximum-clause limit.
 */
class WildcardQuery2 extends Query {
  protected final Term term;

  public WildcardQuery2(Term term) {
    this.term = term;
  }

  // refactor to MultiTermWeight and share it?
  protected class WildcardWeight implements Weight {
    private Searcher searcher;
    private float queryNorm;
    private float queryWeight;

    public WildcardWeight(Searcher searcher) {
      this.searcher = searcher;
    }

    public Query getQuery() {
      return WildcardQuery2.this;
    }

    public float getValue() {
      return queryWeight;
    }

    public float sumOfSquaredWeights() throws IOException {
      queryWeight = getBoost();
      return queryWeight * queryWeight;
    }

    public void normalize(float norm) {
      this.queryNorm = norm;
      queryWeight *= this.queryNorm;
    }

    public Scorer scorer(IndexReader reader) throws IOException {
      // could analyze the number of terms at this point and only
      // use the Big scorer if the number of terms are high.
      BigMultiTermScorer scorer =
          new BigMultiTermScorer(getSimilarity(searcher), reader);
      scorer.add(new WildcardTermEnum(reader, term),
                 getSimilarity(searcher),
                 this,
                 reader.norms(term.field()),
                 true,   // include idf in the per-term score
                 true    // include tf in the per-document score
      );
      scorer.done();
      return scorer;
    }

    public Explanation explain(IndexReader reader, int doc) throws IOException {
      // TODO(review): placeholder — does not reflect the actual score.
      return new Explanation(1.0f, "WildcardQuery2 dummy explain");
    }
  }

  /**
   * BUG FIX: without this override the search machinery falls back to the
   * default Query weighting and {@link WildcardWeight} is never instantiated,
   * so the custom scorer was dead code.
   */
  protected Weight createWeight(Searcher searcher) throws IOException {
    return new WildcardWeight(searcher);
  }

  public String toString(String field) {
    return "WildCardQuery2(" + term + ")";
  }

  /** Queries are used as cache keys; equality must cover term and boost. */
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof WildcardQuery2)) return false;
    WildcardQuery2 other = (WildcardQuery2) o;
    return getBoost() == other.getBoost() && term.equals(other.term);
  }

  public int hashCode() {
    return term.hashCode() ^ Float.floatToIntBits(getBoost());
  }
}
/**
 * BigMultiTermScorer should be used when the number of terms in an expanded
 * query is larger than the maximum number of clauses in a boolean query.
 *
 * <p>It accumulates per-document scores for every matching term into a
 * compact byte array (SmallFloat 5.2 encoding) and a BitSet of matching
 * docs, then iterates the BitSet to produce hits in doc-id order.
 *
 * @author yonik
 * @version $Id$
 */
class BigMultiTermScorer extends Scorer {
  private final IndexReader reader;
  // decode table mapping an encoded norm byte (0..255) to a float
  private final float[] normDecoder;
  // per-document accumulated score, SmallFloat 5.2-encoded; sized maxDoc
  private final byte[] scores;
  // set bit per document that matched at least one term
  private final BitSet docs;
  // current iteration position; -1 before the first next()
  private int pos = -1;

  // It may be desirable to share one score[] across multiple clauses
  // of a query to save memory... say in the case of
  //   QUERY = title:foo* OR subject:foo*
  //   QUERY = foo* OR bar*
  // Right now, this can be done by instantiating a single scorer and
  // calling add() multiple times. An alternate way could be to pass
  // in the score[] in the constructor, and share across multiple Scorer
  // instances. This might be needed to optimize "foo* AND bar*" since
  // that requires two scorers.
  // Alternate pattern: create a ScoreAccumulator class that could
  // be shared with multiple scorers. That's pretty much what MatchManyScorer
  // is anyway though.

  public BigMultiTermScorer(Similarity similarity, IndexReader reader)
      throws IOException {
    super(similarity);
    this.reader = reader;
    int maxDoc = reader.maxDoc();
    scores = new byte[maxDoc];
    docs = new BitSet(maxDoc);
    normDecoder = Similarity.getNormDecoder();
  }

  // notes: similarity, weight, norms are passed separately for each add()
  // to enable sharing of this scorer for multiple clauses of a query.
  /**
   * Accumulates scores for every term produced by {@code terms}.
   * The caller retains ownership of (and must close) the TermEnum.
   */
  public void add(TermEnum terms, Similarity similarity, Weight w,
                  byte[] norms, boolean include_idf, boolean include_tf)
      throws IOException {
    float weightVal = w.getValue();
    int maxDoc = reader.maxDoc();
    TermDocs tdocs = reader.termDocs();
    try {
      while (terms.next()) {
        tdocs.seek(terms);
        float termScore = weightVal;
        if (include_idf) {
          termScore *= similarity.idf(terms.docFreq(), maxDoc);
        }
        add(tdocs, similarity, termScore, norms, include_tf);
      }
    } finally {
      // FIX: release the TermDocs we opened, even if iteration fails.
      tdocs.close();
    }
  }

  /**
   * Accumulates the score of a single term across all of its documents.
   *
   * @param tdocs      positioned enumeration of the term's documents
   * @param similarity similarity providing the tf() curve
   * @param termScore  all components of the score that are not document
   *                   specific (weight, idf); tf and norm are applied here
   * @param norms      field norms, or null to skip norm scaling
   * @param include_tf whether to multiply in the term-frequency factor
   * @throws IOException on index-read failure
   */
  public void add(TermDocs tdocs, Similarity similarity, float termScore,
                  byte[] norms, boolean include_tf) throws IOException {
    while (tdocs.next()) {
      int doc = tdocs.doc();
      float subscore = termScore;
      if (include_tf) subscore *= similarity.tf(tdocs.freq());
      // BUG FIX: mask the norm byte, not the doc id. The original
      // normDecoder[norms[doc&0xff]] read norms only for docs 0..255 and
      // could index normDecoder with a negative (sign-extended) byte.
      if (norms != null) subscore *= normDecoder[norms[doc] & 0xff];
      add(doc, subscore);
    }
  }

  /** Folds {@code score} into the running total for {@code doc}. */
  public void add(int doc, float score) {
    float curr = SmallFloat.byte52ToFloat(scores[doc]);
    scores[doc] = SmallFloat.floatToByte52(curr + score);
    docs.set(doc);
  }

  /** done should be called after all calls to add() and before the
   * first call to next().
   */
  public void done() {
    // done() isn't really needed in the current implementation, but
    // it may be needed in an alternate implementation.
    pos = -1;
  }

  public boolean next() throws IOException {
    pos = docs.nextSetBit(pos + 1);
    return pos >= 0;
  }

  public int doc() {
    return pos;
  }

  public float score() throws IOException {
    return SmallFloat.byte52ToFloat(scores[pos]);
  }

  public boolean skipTo(int target) throws IOException {
    pos = target - 1;
    return next();
  }

  public Explanation explain(int doc) throws IOException {
    // TODO(review): no per-document explanation yet.
    return null;
  }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]