cutting 2002/07/29 12:11:15 Modified: . CHANGES.txt src/java/org/apache/lucene/document Document.java Field.java src/java/org/apache/lucene/index DocumentWriter.java IndexReader.java src/java/org/apache/lucene/search PhrasePrefixQuery.java PhraseScorer.java Similarity.java TermScorer.java Added: src/test/org/apache/lucene/search TestDocBoost.java Log: msg.txt Revision Changes Path 1.28 +12 -1 jakarta-lucene/CHANGES.txt Index: CHANGES.txt =================================================================== RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v retrieving revision 1.27 retrieving revision 1.28 diff -u -r1.27 -r1.28 --- CHANGES.txt 26 Jul 2002 17:32:54 -0000 1.27 +++ CHANGES.txt 29 Jul 2002 19:11:14 -0000 1.28 @@ -48,6 +48,17 @@ stems from nouns and verbs derived from the same word. (gschwarz) + 12. Added support for boosting the score of documents and fields via + the new methods Document.setBoost(float) and Field.setBoost(float). + + Note: This changes the encoding of an indexed value. Indexes + should be re-created from scratch in order for search scores to + be correct. With the new code and an old index, searches will + yield very large scores for shorter fields, and very small scores + for longer fields. Once the index is re-created, scores will be + as before. (cutting) + + 1.2 RC6 1. Changed QueryParser.jj to have "?" be a special character which 1.3 +32 -0 jakarta-lucene/src/java/org/apache/lucene/document/Document.java Index: Document.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Document.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Document.java 17 Jul 2002 21:54:38 -0000 1.2 +++ Document.java 29 Jul 2002 19:11:14 -0000 1.3 @@ -55,6 +55,8 @@ */ import java.util.Enumeration; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Hits; /** Documents are the unit of indexing and search. 
* @@ -66,9 +68,39 @@ public final class Document implements java.io.Serializable { DocumentFieldList fieldList = null; + private float boost = 1.0f; /** Constructs a new document with no fields. */ public Document() {} + + + /** Sets a boost factor for hits on any field of this document. This value + * will be multiplied into the score of all hits on this document. + * + * <p>Values are multiplied into the value of {@link Field#getBoost()} of + * each field in this document. Thus, this method in effect sets a default + * boost for the fields of this document. + * + * @see Field#setBoost(float) + */ + public void setBoost(float boost) { + this.boost = boost; + } + + /** Returns the boost factor for hits on any field of this document. + * + * <p>The default value is 1.0. + * + * <p>Note: This value is not stored directly with the document in the index. + * Documents returned from {@link IndexReader#document(int)} and {@link + * Hits#doc(int)} may thus not have the same value present as when this + * document was indexed. + * + * @see #setBoost(float) + */ + public float getBoost() { + return boost; + } /** Adds a field to a document. Several fields may be added with * the same name. In this case, if the fields are indexed, their text is 1.7 +40 -0 jakarta-lucene/src/java/org/apache/lucene/document/Field.java Index: Field.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- Field.java 17 Jul 2002 21:54:38 -0000 1.6 +++ Field.java 29 Jul 2002 19:11:14 -0000 1.7 @@ -56,6 +56,9 @@ import java.io.Reader; import java.util.Date; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.Hits; /** A field is a section of a Document. 
Each field has two parts, a name and a @@ -72,6 +75,43 @@ private boolean isStored = false; private boolean isIndexed = true; private boolean isTokenized = true; + + private float boost = 1.0f; + + /** Sets the boost factor for hits on this field. This value will be + * multiplied into the score of all hits on this field of this + * document. + * + * <p>The boost is multiplied by {@link Document#getBoost()} of the document + * containing this field. If a document has multiple fields with the same + * name, all such values are multiplied together. This product is then + * multiplied by the value {@link Similarity#normalizeLength(int)}, and + * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the + * index. One should attempt to ensure that this product does not overflow + * the range of that encoding. + * + * @see Document#setBoost(float) + * @see Similarity#normalizeLength(int) + * @see Similarity#encodeNorm(float) + */ + public void setBoost(float boost) { + this.boost = boost; + } + + /** Returns the boost factor for hits on this field. + * + * <p>The default value is 1.0. + * + * <p>Note: this value is not stored directly with the document in the index. + * Documents returned from {@link IndexReader#document(int)} and {@link + * Hits#doc(int)} may thus not have the same value present as when this field + * was indexed. + * + * @see #setBoost(float) + */ + public float getBoost() { + return boost; + } /** Constructs a String-valued Field that is not tokenized, but is indexed and stored. Useful for non-text fields, e.g. date or url. 
*/ 1.2 +13 -4 jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java Index: DocumentWriter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- DocumentWriter.java 18 Sep 2001 16:29:52 -0000 1.1 +++ DocumentWriter.java 29 Jul 2002 19:11:15 -0000 1.2 @@ -59,6 +59,7 @@ import java.io.StringReader; import java.util.Hashtable; import java.util.Enumeration; +import java.util.Arrays; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -100,6 +101,10 @@ // invert doc into postingTable postingTable.clear(); // clear postingTable fieldLengths = new int[fieldInfos.size()]; // init fieldLengths + + fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts + Arrays.fill(fieldBoosts, doc.getBoost()); + invertDocument(doc); // sort postingTable into an array @@ -130,6 +135,7 @@ // Used to buffer a document before it is written to the index. private final Hashtable postingTable = new Hashtable(); private int[] fieldLengths; + private float[] fieldBoosts; // Tokenizes the fields of a document into Postings. 
private final void invertDocument(Document doc) @@ -168,6 +174,7 @@ } fieldLengths[fieldNumber] = position; // save field length + fieldBoosts[fieldNumber] *= field.getBoost(); } } } @@ -310,12 +317,14 @@ while (fields.hasMoreElements()) { Field field = (Field)fields.nextElement(); if (field.isIndexed()) { - int fieldNumber = fieldInfos.fieldNumber(field.name()); - OutputStream norm = directory.createFile(segment + ".f" + fieldNumber); + int n = fieldInfos.fieldNumber(field.name()); + float norm = + fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]); + OutputStream norms = directory.createFile(segment + ".f" + n); try { - norm.writeByte(Similarity.norm(fieldLengths[fieldNumber])); + norms.writeByte(Similarity.encodeNorm(norm)); } finally { - norm.close(); + norms.close(); } } } 1.10 +5 -3 jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java Index: IndexReader.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- IndexReader.java 15 Feb 2002 18:59:42 -0000 1.9 +++ IndexReader.java 29 Jul 2002 19:11:15 -0000 1.10 @@ -60,6 +60,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Lock; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; /** IndexReader is an abstract class, providing an interface for accessing an index. Search of an index is done entirely through this abstract interface, @@ -177,9 +178,10 @@ abstract public boolean isDeleted(int n); /** Returns the byte-encoded normalization factor for the named field of - every document. This is used by the search code to score documents. - @see org.apache.lucene.search.Similarity#norm - */ + * every document. This is used by the search code to score documents. 
+ * + * @see Field#setBoost(float) + */ abstract public byte[] norms(String field) throws IOException; /** Returns an enumeration of all the terms in the index. 1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java Index: PhrasePrefixQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- PhrasePrefixQuery.java 18 Jul 2002 14:39:58 -0000 1.1 +++ PhrasePrefixQuery.java 29 Jul 2002 19:11:15 -0000 1.2 @@ -66,7 +66,7 @@ /** * PhrasePrefixQuery is a generalized version of PhraseQuery, with an added - * method {@link add(Term[])}. + * method {@link #add(Term[])}. * To use this class, to search for the phrase "Microsoft app*" first use * add(Term) on the term "Microsoft", then find all terms that has "app" as * prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[] 1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java Index: PhraseScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- PhraseScorer.java 18 Sep 2001 16:29:57 -0000 1.1 +++ PhraseScorer.java 29 Jul 2002 19:11:15 -0000 1.2 @@ -93,7 +93,7 @@ if (freq > 0.0) { float score = Similarity.tf(freq)*weight; // compute score - score *= Similarity.norm(norms[first.doc]); // normalize + score *= Similarity.decodeNorm(norms[first.doc]); // normalize results.collect(first.doc, score); // add to results } last.next(); // resume scanning 1.2 +69 -20 jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java Index: Similarity.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java,v 
retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- Similarity.java 18 Sep 2001 16:29:58 -0000 1.1 +++ Similarity.java 29 Jul 2002 19:11:15 -0000 1.2 @@ -56,6 +56,7 @@ import java.io.IOException; import org.apache.lucene.index.Term; +import org.apache.lucene.document.Field; /** Internal class used for scoring. * <p>Public only so that the indexing code can compute and store the @@ -63,32 +64,80 @@ public final class Similarity { private Similarity() {} // no public constructor - /** Computes the normalization byte for a document given the total number of - * terms contained in the document. These values are stored in an index and - * used by the search code. */ - public static final byte norm(int numTerms) { - // Scales 1/sqrt(numTerms) into a byte, i.e. 256/sqrt(numTerms). - // Math.ceil is used to ensure that even very long documents don't get a - // zero norm byte, as that is reserved for zero-lengthed documents and - // deleted documents. - return (byte) Math.ceil(255.0 / Math.sqrt(numTerms)); + static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = byteToFloat((byte)i); } + /** Computes the normalization value for a document given the total number of + * terms contained in a field. These values are stored in an index and used + * by the search code. + * + * <p>The formula used is: <code>1.0f / Math.sqrt(numTerms)</code> + * + * @see Field#setBoost(float) + */ + public static float normalizeLength(int numTerms) { + return (float)(1.0 / Math.sqrt(numTerms)); + } + + /** Decodes a normalization factor stored in an index. + * @see #encodeNorm(float) + */ + public static float decodeNorm(byte b) { + return NORM_TABLE[b & 0xFF]; + } - private static final float[] makeNormTable() { - float[] result = new float[256]; - for (int i = 0; i < 256; i++) - result[i] = i / 255.0F; - return result; + /** Encodes a normalization factor for storage in an index. 
+ * + * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * + * @see Field#setBoost(float) + */ + public static byte encodeNorm(float f) { + return floatToByte(f); } - static final float[] NORM_TABLE = makeNormTable(); - - static final float norm(byte normByte) { - // Un-scales from the byte encoding of a norm into a float, i.e., - // approximately 1/sqrt(numTerms). - return NORM_TABLE[normByte & 0xFF]; + private static float byteToFloat(byte b) { + if (b == 0) // zero is a special case + return 0.0f; + int mantissa = b & 7; + int exponent = (b >> 3) & 31; + int bits = ((exponent+(63-15)) << 24) | (mantissa << 21); + return Float.intBitsToFloat(bits); } + + private static byte floatToByte(float f) { + if (f < 0.0f) // round negatives up to zero + f = 0.0f; + + if (f == 0.0f) // zero is a special case + return 0; + + int bits = Float.floatToIntBits(f); // parse float into parts + int mantissa = (bits & 0xffffff) >> 21; + int exponent = (((bits >> 24) & 0x7f) - 63) + 15; + + if (exponent > 31) { // overflow: use max value + exponent = 31; + mantissa = 7; + } + + if (exponent < 1) { // underflow: use min value + exponent = 1; + mantissa = 0; + } + + return (byte)((exponent << 3) | mantissa); // pack into a byte + } static final float tf(int freq) { return (float)Math.sqrt(freq); 1.2 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java Index: TermScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v retrieving revision 1.1 retrieving revision 1.2 
diff -u -r1.1 -r1.2 --- TermScorer.java 18 Sep 2001 16:29:58 -0000 1.1 +++ TermScorer.java 29 Jul 2002 19:11:15 -0000 1.2 @@ -98,7 +98,7 @@ ? scoreCache[f] // cache hit : Similarity.tf(f)*weight; // cache miss - score *= Similarity.norm(norms[d]); // normalize for field + score *= Similarity.decodeNorm(norms[d]); // normalize for field c.collect(d, score); // collect score 1.1 jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java Index: TestDocBoost.java =================================================================== package org.apache.lucene.search; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. 
Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Query; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import junit.framework.TestCase; /** Document boost unit test. 
*
 * <p>Indexes four one-word documents whose combined document and field
 * boosts are 1, 2, 3 and 4, then checks that a term query scores them in
 * strictly increasing order.
 *
 * @author Doug Cutting
 * @version $Revision: 1.1 $
 */
public class TestDocBoost extends TestCase {

  public TestDocBoost(String name) {
    super(name);
  }

  /** Verifies that document and field boosts are multiplied into hit scores.
   *
   * <p>Two fields (boost 1 and boost 2) are combined with documents of
   * boost 1, 1, 3 and 2, so the documents are added in order of increasing
   * effective boost. Each hit's score is recorded at the index of its
   * document number, and the recorded scores must be strictly increasing.
   *
   * @throws Exception if indexing or searching fails
   */
  public static void test() throws Exception {
    RAMDirectory store = new RAMDirectory();

    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
    try {
      Field f1 = Field.Text("field", "word");
      Field f2 = Field.Text("field", "word");
      f2.setBoost(2.0f);

      Document d1 = new Document();
      Document d2 = new Document();
      Document d3 = new Document();
      Document d4 = new Document();
      d3.setBoost(3.0f);
      d4.setBoost(2.0f);

      d1.add(f1);                                 // boost = 1
      d2.add(f2);                                 // boost = 2
      d3.add(f1);                                 // boost = 3
      d4.add(f2);                                 // boost = 4

      writer.addDocument(d1);
      writer.addDocument(d2);
      writer.addDocument(d3);
      writer.addDocument(d4);
      writer.optimize();
    } finally {
      writer.close();                             // release the writer even if indexing fails
    }

    final float[] scores = new float[4];

    IndexSearcher searcher = new IndexSearcher(store);
    try {
      searcher.search
        (new TermQuery(new Term("field", "word")),
         new HitCollector() {
           public final void collect(int doc, float score) {
             scores[doc] = score;                 // slot index is the document number
           }
         });
    } finally {
      searcher.close();                           // previously never closed
    }

    // A document that was not hit keeps score 0.0 and therefore fails the
    // strict comparison as well.
    float lastScore = 0.0f;
    for (int i = 0; i < 4; i++) {
      assertTrue("score of doc " + i + " (" + scores[i]
                 + ") should exceed " + lastScore,
                 scores[i] > lastScore);
      lastScore = scores[i];
    }
  }
}
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>