I haven't had a chance to do much on this lately (BigMultiTermScorer),
so here is some code I had sitting around — unfinished and untested —
but it may stimulate discussion on the direction.
-Yonik
package org.apache.lucene.search;
import org.apache.lucene.index.*;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.util.BitSet;
import junit.framework.TestCase;
class TestWildcardQuery2 extends TestCase {
  /**
   * Populates the index with {@code num} documents whose "id" field cycles
   * through the values {@code 1..range-1, 0}, so each id value appears in
   * roughly {@code num/range} documents.
   *
   * @param writer open writer to add documents to (caller closes it)
   * @param num    total number of documents to add
   * @param range  number of distinct id values to cycle through
   * @throws IOException if the writer fails to add a document
   */
  void addDocs(IndexWriter writer, int num, int range) throws IOException {
    int id = 0;
    for (int i = 0; i < num; i++) {
      if (++id >= range) id = 0;
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(id), Field.Store.NO,
                        Field.Index.UN_TOKENIZED));
      // BUG FIX: the document was built but never added to the index,
      // leaving the index empty.
      writer.addDocument(doc);
    }
  }

  /**
   * Builds a fresh in-memory index of {@code size} documents.
   *
   * @param size number of documents (and distinct id values) to index
   * @return the populated directory
   * @throws IOException on index-write failure
   */
  Directory getIndex(int size) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    // BUG FIX: honor the requested size instead of the hard-coded 1000.
    addDocs(writer, size, size);
    writer.close();
    return dir;
  }

  public void testScore() {
    // TODO(review): unfinished — should build an index via getIndex() and
    // verify WildcardQuery2 scoring once the scorer is finalized.
  }
}
/**
 * A wildcard query that scores its term expansion with
 * {@link BigMultiTermScorer}, avoiding BooleanQuery's maximum-clause limit.
 */
class WildcardQuery2 extends Query {
  protected final Term term;

  public WildcardQuery2(Term term) {
    this.term = term;
  }

  // refactor to MultiTermWeight and share it?
  protected class WildcardWeight implements Weight {
    private Searcher searcher;
    private float queryNorm;
    private float queryWeight;

    public WildcardWeight(Searcher searcher) {
      this.searcher = searcher;
    }

    public Query getQuery() {
      return WildcardQuery2.this;
    }

    public float getValue() {
      return queryWeight;
    }

    public float sumOfSquaredWeights() throws IOException {
      queryWeight = getBoost();
      return queryWeight * queryWeight;
    }

    public void normalize(float norm) {
      this.queryNorm = norm;
      queryWeight *= this.queryNorm;
    }

    public Scorer scorer(IndexReader reader) throws IOException {
      // could analyze the number of terms at this point and only
      // use the Big scorer if the number of terms are high.
      BigMultiTermScorer scorer =
          new BigMultiTermScorer(getSimilarity(searcher), reader);
      scorer.add(new WildcardTermEnum(reader, term),
                 getSimilarity(searcher),
                 this,
                 reader.norms(term.field()),
                 true,   // include idf in the per-term score
                 true    // include tf in the per-document score
      );
      scorer.done();
      return scorer;
    }

    public Explanation explain(IndexReader reader, int doc) throws IOException {
      // TODO(review): placeholder — does not reflect the actual score.
      return new Explanation(1.0f, "WildcardQuery2 dummy explain");
    }
  }

  /**
   * BUG FIX: without this override the search machinery falls back to the
   * default Query weighting and {@link WildcardWeight} is never instantiated,
   * so the custom scorer was dead code.
   */
  protected Weight createWeight(Searcher searcher) throws IOException {
    return new WildcardWeight(searcher);
  }

  public String toString(String field) {
    return "WildCardQuery2(" + term + ")";
  }

  /** Queries are used as cache keys; equality must cover term and boost. */
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof WildcardQuery2)) return false;
    WildcardQuery2 other = (WildcardQuery2) o;
    return getBoost() == other.getBoost() && term.equals(other.term);
  }

  public int hashCode() {
    return term.hashCode() ^ Float.floatToIntBits(getBoost());
  }
}
/**
 * BigMultiTermScorer should be used when the number of terms in an expanded
 * query is larger than the maximum number of clauses in a boolean query.
 *
 * <p>It accumulates per-document scores for every matching term into a
 * compact byte array (SmallFloat 5.2 encoding) and a BitSet of matching
 * docs, then iterates the BitSet to produce hits in doc-id order.
 *
 * @author yonik
 * @version $Id$
 */
class BigMultiTermScorer extends Scorer {
  private final IndexReader reader;
  // decode table mapping an encoded norm byte (0..255) to a float
  private final float[] normDecoder;
  // per-document accumulated score, SmallFloat 5.2-encoded; sized maxDoc
  private final byte[] scores;
  // set bit per document that matched at least one term
  private final BitSet docs;
  // current iteration position; -1 before the first next()
  private int pos = -1;

  // It may be desirable to share one score[] across multiple clauses
  // of a query to save memory... say in the case of
  //   QUERY = title:foo* OR subject:foo*
  //   QUERY = foo* OR bar*
  // Right now, this can be done by instantiating a single scorer and
  // calling add() multiple times. An alternate way could be to pass
  // in the score[] in the constructor, and share across multiple Scorer
  // instances. This might be needed to optimize "foo* AND bar*" since
  // that requires two scorers.
  // Alternate pattern: create a ScoreAccumulator class that could
  // be shared with multiple scorers. That's pretty much what MatchManyScorer
  // is anyway though.

  public BigMultiTermScorer(Similarity similarity, IndexReader reader)
      throws IOException {
    super(similarity);
    this.reader = reader;
    int maxDoc = reader.maxDoc();
    scores = new byte[maxDoc];
    docs = new BitSet(maxDoc);
    normDecoder = Similarity.getNormDecoder();
  }

  // notes: similarity, weight, norms are passed separately for each add()
  // to enable sharing of this scorer for multiple clauses of a query.
  /**
   * Accumulates scores for every term produced by {@code terms}.
   * The caller retains ownership of (and must close) the TermEnum.
   */
  public void add(TermEnum terms, Similarity similarity, Weight w,
                  byte[] norms, boolean include_idf, boolean include_tf)
      throws IOException {
    float weightVal = w.getValue();
    int maxDoc = reader.maxDoc();
    TermDocs tdocs = reader.termDocs();
    try {
      while (terms.next()) {
        tdocs.seek(terms);
        float termScore = weightVal;
        if (include_idf) {
          termScore *= similarity.idf(terms.docFreq(), maxDoc);
        }
        add(tdocs, similarity, termScore, norms, include_tf);
      }
    } finally {
      // FIX: release the TermDocs we opened, even if iteration fails.
      tdocs.close();
    }
  }

  /**
   * Accumulates the score of a single term across all of its documents.
   *
   * @param tdocs      positioned enumeration of the term's documents
   * @param similarity similarity providing the tf() curve
   * @param termScore  all components of the score that are not document
   *                   specific (weight, idf); tf and norm are applied here
   * @param norms      field norms, or null to skip norm scaling
   * @param include_tf whether to multiply in the term-frequency factor
   * @throws IOException on index-read failure
   */
  public void add(TermDocs tdocs, Similarity similarity, float termScore,
                  byte[] norms, boolean include_tf) throws IOException {
    while (tdocs.next()) {
      int doc = tdocs.doc();
      float subscore = termScore;
      if (include_tf) subscore *= similarity.tf(tdocs.freq());
      // BUG FIX: mask the norm byte, not the doc id. The original
      // normDecoder[norms[doc&0xff]] read norms only for docs 0..255 and
      // could index normDecoder with a negative (sign-extended) byte.
      if (norms != null) subscore *= normDecoder[norms[doc] & 0xff];
      add(doc, subscore);
    }
  }

  /** Folds {@code score} into the running total for {@code doc}. */
  public void add(int doc, float score) {
    float curr = SmallFloat.byte52ToFloat(scores[doc]);
    scores[doc] = SmallFloat.floatToByte52(curr + score);
    docs.set(doc);
  }

  /** done should be called after all calls to add() and before the
   * first call to next().
   */
  public void done() {
    // done() isn't really needed in the current implementation, but
    // it may be needed in an alternate implementation.
    pos = -1;
  }

  public boolean next() throws IOException {
    pos = docs.nextSetBit(pos + 1);
    return pos >= 0;
  }

  public int doc() {
    return pos;
  }

  public float score() throws IOException {
    return SmallFloat.byte52ToFloat(scores[pos]);
  }

  public boolean skipTo(int target) throws IOException {
    pos = target - 1;
    return next();
  }

  public Explanation explain(int doc) throws IOException {
    // TODO(review): no per-document explanation yet.
    return null;
  }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]