Hello, this is the BM25 algorithm I implemented in Lucene.
It doesn't work: I have compared my results with the results of MG4J
(with the same document set) and they differ.
I don't know whether my formula is wrong or there is another mistake.
Could you help me?
--------------------------------------------------------------------------------------------------------------------------------
public class BM25Scorer extends Scorer {
private final static double EPSILON_SCORE = 1.000000082240371E-9;
private final static double DEFAULT_K1 = 0.75d;
private final static double DEFAULT_B = 0.95d;
private double b = DEFAULT_B;
private double k1 = DEFAULT_K1;
private IndexReader reader;
private Term term;
private Hits hits;
private int position; // document position in hits
private IndexSearcher searcher;
private int cooc = 0; // How many times a term appears in the
document
private float idf;
public float score() throws IOException {
TermFreqVector tfv = reader.getTermFreqVector( hits.id(position),
term.field() );
String[] terms = tfv.getTerms();
int[] freqs = tfv.getTermFrequencies();
for (int i = 0 ; i < terms.length ; i++) {
if( terms[i].equalsIgnoreCase(term.text()) ){
cooc = freqs[i];
}
}
idf = searcher.getSimilarity().idf(term, searcher);
Document document = (Document)hits.doc(position);
String[] values = document.getValues("DOCUMENT_LENGTH"); //
document length is a field of my index
long docLength = Long.valueOf(values[0]).longValue(); // document
lenght (number of words)
long averageLength = 200;
double loga = Math.max( EPSILON_SCORE, new Float(idf
).doubleValue());
double score = ( loga * (k1 + 1) * cooc ) / (cooc + k1*( (1-b) +
(b*docLength/averageLength) ) );
return new Float(score).floatValue();
}