I'm just now starting to play with the scoring algorithm. The first change
I want to make is to have the score ignore term frequency. I created this
test script to validate my understanding of the API, but my custom
Similarity class doesn't seem to affect the tf values in the output, and I
can't figure out why. I've looked at the docs, the scoring page on the
lucene site, and various archived posts, and I don't see anything I've done
wrong.
The print statement in tf() was to test if the overridden method is even
getting called. It's not.
---
import PyLucene
def main():
    """Query the demo index for 'foo' and print every hit with its explanation.

    Builds the index via build_index(), installs SimilaritySansTF on the
    searcher, parses the query 'foo' against the '_all_' field, and then
    prints one summary line plus one explanation line per hit.
    """
    directory = build_index()
    searcher = PyLucene.IndexSearcher(directory)
    # The custom similarity is installed before the search is executed.
    searcher.setSimilarity(SimilaritySansTF())

    analyzer = PyLucene.StandardAnalyzer()
    query = PyLucene.QueryParser('_all_', analyzer).parse('foo')
    hits = searcher.search(query)

    # Iterating the hits object is unpacked as (index, document) pairs.
    for rank, document in hits:
        text = document.get('_all_')
        score = hits.score(rank)
        print('[%02d] %s (%0.2f)' % (rank, text, score))
        # Ask the searcher how the score of this particular hit was derived.
        explanation = searcher.explain(query, hits.id(rank))
        print('\t%s' % explanation)
def build_index():
    """Build a three-document in-memory index and return its directory.

    Each document has a single stored, tokenized '_all_' field. The texts
    differ in how many times 'foo' occurs (once, twice, once).

    Returns:
        The PyLucene.RAMDirectory the documents were written to, after the
        writer has been optimized and closed.
    """
    directory = PyLucene.RAMDirectory()
    # NOTE(review): the third argument (True) presumably means "create a new
    # index" -- confirm against the IndexWriter constructor documentation.
    writer = PyLucene.IndexWriter(directory, PyLucene.StandardAnalyzer(), True)

    for text in ('foo bar bar', 'foo foo bar', 'foo bar'):
        document = PyLucene.Document()
        field = PyLucene.Field('_all_', text, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED)
        document.add(field)
        writer.addDocument(document)

    writer.optimize()
    writer.close()
    return directory
class SimilaritySansTF(PyLucene.DefaultSimilarity):
    """DefaultSimilarity subclass whose tf() ignores the term frequency.

    NOTE(review): the original method was declared as ``tf(freq)`` without
    ``self``, so it could not be called as a bound method at all. Whether
    PyLucene routes scoring calls to a Python-level override of a subclass
    is not shown in this script -- confirm against the PyLucene
    documentation on extending Lucene classes from Python.
    """

    def tf(self, freq):
        """Return the constant 1 regardless of *freq*.

        Args:
            freq: the term frequency supplied by the caller; deliberately
                unused so that it has no influence on the returned factor.
        """
        # Debug probe: shows on stdout whether this override is ever invoked.
        print('freak out!')
        return 1
# Script entry point: run the demo search implemented by main().
main()
---
-ofer
_______________________________________________
pylucene-dev mailing list
[email protected]
http://lists.osafoundation.org/mailman/listinfo/pylucene-dev