I'm just now starting to play with the scoring algorithm.  The first change
I want to make is to have the score ignore term frequency.  I created this
test script to validate my understanding of the API, but my custom
Similarity class doesn't seem to affect the tf values in the output, and I
can't figure out why.  I've looked at the docs, the scoring page on the
lucene site, and various archived posts, and I don't see anything I've done
wrong.

The print statement in tf() is there to test whether the overridden method is
even getting called.  It's not.

---
import PyLucene

def main():
    """Search the sample index for 'foo' and print every hit.

    A SimilaritySansTF instance is installed on the searcher before the
    query runs.  For each hit, the stored '_all_' text and its score are
    printed, followed by the searcher's explanation of that score.
    """
    directory = build_index()

    searcher = PyLucene.IndexSearcher(directory)
    searcher.setSimilarity(SimilaritySansTF())

    query = PyLucene.QueryParser('_all_',
                                 PyLucene.StandardAnalyzer()).parse('foo')
    hits = searcher.search(query)

    for rank, hit in hits:
        # One summary line per hit: position, stored text, score.
        print('[%02d] %s (%0.2f)' % (rank, hit.get('_all_'),
                                     hits.score(rank)))
        # Then the scoring breakdown for that same document.
        print('\t%s' % (searcher.explain(query, hits.id(rank))))

def build_index():
    """Create an in-memory index holding three small sample documents.

    Each document carries a single stored, tokenized '_all_' field.  The
    index is optimized and the writer closed before the RAMDirectory is
    returned to the caller.
    """
    directory = PyLucene.RAMDirectory()
    index_writer = PyLucene.IndexWriter(directory,
                                        PyLucene.StandardAnalyzer(),
                                        True)

    # The texts differ only in how often 'foo' and 'bar' occur.
    for text in ('foo bar bar', 'foo foo bar', 'foo bar'):
        document = PyLucene.Document()
        document.add(PyLucene.Field('_all_', text,
                                    PyLucene.Field.Store.YES,
                                    PyLucene.Field.Index.TOKENIZED))
        index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()

    return directory

class SimilaritySansTF(PyLucene.DefaultSimilarity):
    """DefaultSimilarity subclass meant to take term frequency out of scoring."""

    def tf(self, freq):
        """Return a constant term-frequency factor of 1, whatever *freq* is.

        The method must accept ``self``: declared as ``tf(freq)`` it cannot
        be called on an instance with a frequency argument without raising
        TypeError.
        """
        # NOTE(review): if this message still never shows up, the binding may
        # not route Java-side calls to Python subclass overrides -- confirm
        # against the PyLucene documentation on extending Java classes.
        print('freak out!')
        return 1

# Run the test script: build the index, search it, and print the scored hits.
main()
---

-ofer

_______________________________________________
pylucene-dev mailing list
[email protected]
http://lists.osafoundation.org/mailman/listinfo/pylucene-dev

Reply via email to