search TestDocBoost.java

cutting Mon, 29 Jul 2002 11:49:35 -0700

cutting     2002/07/29 12:11:15

  Modified:    .        CHANGES.txt
               src/java/org/apache/lucene/document Document.java Field.java
               src/java/org/apache/lucene/index DocumentWriter.java
                        IndexReader.java
               src/java/org/apache/lucene/search PhrasePrefixQuery.java
                        PhraseScorer.java Similarity.java TermScorer.java
  Added:       src/test/org/apache/lucene/search TestDocBoost.java
  Log:
  msg.txt
  
  Revision  Changes    Path
  1.28      +12 -1     jakarta-lucene/CHANGES.txt
  
  Index: CHANGES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
  retrieving revision 1.27
  retrieving revision 1.28
  diff -u -r1.27 -r1.28
  --- CHANGES.txt       26 Jul 2002 17:32:54 -0000      1.27
  +++ CHANGES.txt       29 Jul 2002 19:11:14 -0000      1.28
  @@ -48,6 +48,17 @@
        stems from nouns and verbs derived from the same word.
        (gschwarz)
   
  + 12. Added support for boosting the score of documents and fields via
  +     the new methods Document.setBoost(float) and Field.setBoost(float).
  +
  +     Note: This changes the encoding of an indexed value.  Indexes
  +     should be re-created from scratch in order for search scores to
  +     be correct.  With the new code and an old index, searches will
  +     yield very large scores for shorter fields, and very small scores
  +     for longer fields.  Once the index is re-created, scores will be
  +     as before. (cutting)
  +
  +
   1.2 RC6
   
    1. Changed QueryParser.jj to have "?" be a special character which
  
  
  
  1.3       +32 -0     jakarta-lucene/src/java/org/apache/lucene/document/Document.java
  
  Index: Document.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Document.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Document.java     17 Jul 2002 21:54:38 -0000      1.2
  +++ Document.java     29 Jul 2002 19:11:14 -0000      1.3
  @@ -55,6 +55,8 @@
    */
   
   import java.util.Enumeration;
  +import org.apache.lucene.index.IndexReader;
  +import org.apache.lucene.search.Hits;
   
   /** Documents are the unit of indexing and search.
    *
  @@ -66,9 +68,39 @@
   
   public final class Document implements java.io.Serializable {
     DocumentFieldList fieldList = null;
  +  private float boost = 1.0f;
   
     /** Constructs a new document with no fields. */
     public Document() {}
  +
  +
  +  /** Sets a boost factor for hits on any field of this document.  This value
  +   * will be multiplied into the score of all hits on this document.
  +   *
  +   * <p>Values are multiplied into the value of {@link Field#getBoost()} of
  +   * each field in this document.  Thus, this method in effect sets a default
  +   * boost for the fields of this document.
  +   *
  +   * @see Field#setBoost(float)
  +   */
  +  public void setBoost(float boost) {
  +    this.boost = boost;
  +  }
  +
  +  /** Returns the boost factor for hits on any field of this document.
  +   *
  +   * <p>The default value is 1.0.
  +   *
  +   * <p>Note: This value is not stored directly with the document in the index.
  +   * Documents returned from {@link IndexReader#document(int)} and {@link
  +   * Hits#doc(int)} may thus not have the same value present as when this
  +   * document was indexed.
  +   *
  +   * @see #setBoost(float)
  +   */
  +  public float getBoost() {
  +    return boost;
  +  }
   
     /** Adds a field to a document.  Several fields may be added with
      * the same name.  In this case, if the fields are indexed, their text is
  
  
  
  1.7       +40 -0     jakarta-lucene/src/java/org/apache/lucene/document/Field.java
  
  Index: Field.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- Field.java        17 Jul 2002 21:54:38 -0000      1.6
  +++ Field.java        29 Jul 2002 19:11:14 -0000      1.7
  @@ -56,6 +56,9 @@
   
   import java.io.Reader;
   import java.util.Date;
  +import org.apache.lucene.index.IndexReader;
  +import org.apache.lucene.search.Similarity;
  +import org.apache.lucene.search.Hits;
   
   /**
     A field is a section of a Document.  Each field has two parts, a name and a
  @@ -72,6 +75,43 @@
     private boolean isStored = false;
     private boolean isIndexed = true;
     private boolean isTokenized = true;
  +
  +  private float boost = 1.0f;
  +
  +  /** Sets the boost factor for hits on this field.  This value will be
  +   * multiplied into the score of all hits on this field of this
  +   * document.
  +   *
  +   * <p>The boost is multiplied by {@link Document#getBoost()} of the document
  +   * containing this field.  If a document has multiple fields with the same
  +   * name, all such values are multiplied together.  This product is then
  +   * multiplied by the value {@link Similarity#normalizeLength(int)}, and
  +   * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
  +   * index.  One should attempt to ensure that this product does not overflow
  +   * the range of that encoding.
  +   *
  +   * @see Document#setBoost(float)
  +   * @see Similarity#normalizeLength(int)
  +   * @see Similarity#encodeNorm(float)
  +   */
  +  public void setBoost(float boost) {
  +    this.boost = boost;
  +  }
  +
  +  /** Returns the boost factor for hits on this field.
  +   *
  +   * <p>The default value is 1.0.
  +   *
  +   * <p>Note: this value is not stored directly with the document in the index.
  +   * Documents returned from {@link IndexReader#document(int)} and {@link
  +   * Hits#doc(int)} may thus not have the same value present as when this field
  +   * was indexed.
  +   *
  +   * @see #setBoost(float)
  +   */
  +  public float getBoost() {
  +    return boost;
  +  }
   
     /** Constructs a String-valued Field that is not tokenized, but is indexed
       and stored.  Useful for non-text fields, e.g. date or url.  */
  
  
  
  1.2       +13 -4     
jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java
  
  Index: DocumentWriter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DocumentWriter.java       18 Sep 2001 16:29:52 -0000      1.1
  +++ DocumentWriter.java       29 Jul 2002 19:11:15 -0000      1.2
  @@ -59,6 +59,7 @@
   import java.io.StringReader;
   import java.util.Hashtable;
   import java.util.Enumeration;
  +import java.util.Arrays;
   
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
  @@ -100,6 +101,10 @@
       // invert doc into postingTable
       postingTable.clear();                      // clear postingTable
       fieldLengths = new int[fieldInfos.size()];         // init fieldLengths
  +
  +    fieldBoosts = new float[fieldInfos.size()];        // init fieldBoosts
  +    Arrays.fill(fieldBoosts, doc.getBoost());
  +
       invertDocument(doc);
   
       // sort postingTable into an array
  @@ -130,6 +135,7 @@
     // Used to buffer a document before it is written to the index.
     private final Hashtable postingTable = new Hashtable();
     private int[] fieldLengths;
  +  private float[] fieldBoosts;
   
     // Tokenizes the fields of a document into Postings.
     private final void invertDocument(Document doc)
  @@ -168,6 +174,7 @@
        }
   
        fieldLengths[fieldNumber] = position;     // save field length
  +        fieldBoosts[fieldNumber] *= field.getBoost();
         }
       }
     }
  @@ -310,12 +317,14 @@
       while (fields.hasMoreElements()) {
         Field field = (Field)fields.nextElement();
         if (field.isIndexed()) {
  -     int fieldNumber = fieldInfos.fieldNumber(field.name());
  -     OutputStream norm = directory.createFile(segment + ".f" + fieldNumber);
  +     int n = fieldInfos.fieldNumber(field.name());
  +        float norm =
  +          fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]);
  +     OutputStream norms = directory.createFile(segment + ".f" + n);
        try {
  -       norm.writeByte(Similarity.norm(fieldLengths[fieldNumber]));
  +       norms.writeByte(Similarity.encodeNorm(norm));
        } finally {
  -       norm.close();
  +       norms.close();
        }
         }
       }
  
  
  
  1.10      +5 -3      jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java
  
  Index: IndexReader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- IndexReader.java  15 Feb 2002 18:59:42 -0000      1.9
  +++ IndexReader.java  29 Jul 2002 19:11:15 -0000      1.10
  @@ -60,6 +60,7 @@
   import org.apache.lucene.store.FSDirectory;
   import org.apache.lucene.store.Lock;
   import org.apache.lucene.document.Document;
  +import org.apache.lucene.document.Field;
   
   /** IndexReader is an abstract class, providing an interface for accessing an
     index.  Search of an index is done entirely through this abstract interface,
  @@ -177,9 +178,10 @@
     abstract public boolean isDeleted(int n);
   
     /** Returns the byte-encoded normalization factor for the named field of
  -    every document.  This is used by the search code to score documents.
  -    @see org.apache.lucene.search.Similarity#norm
  -    */
  +   * every document.  This is used by the search code to score documents.
  +   *
  +   * @see Field#setBoost(float)
  +   */
     abstract public byte[] norms(String field) throws IOException;
   
     /** Returns an enumeration of all the terms in the index.
  
  
  
  1.2       +1 -1      
jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
  
  Index: PhrasePrefixQuery.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- PhrasePrefixQuery.java    18 Jul 2002 14:39:58 -0000      1.1
  +++ PhrasePrefixQuery.java    29 Jul 2002 19:11:15 -0000      1.2
  @@ -66,7 +66,7 @@
   
   /**
    * PhrasePrefixQuery is a generalized version of PhraseQuery, with an added
  - * method {@link add(Term[])}.
  + * method {@link #add(Term[])}.
    * To use this class, to search for the phrase "Microsoft app*" first use
    * add(Term) on the term "Microsoft", then find all terms that has "app" as
    * prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[]
  
  
  
  1.2       +1 -1      
jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java
  
  Index: PhraseScorer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- PhraseScorer.java 18 Sep 2001 16:29:57 -0000      1.1
  +++ PhraseScorer.java 29 Jul 2002 19:11:15 -0000      1.2
  @@ -93,7 +93,7 @@
   
         if (freq > 0.0) {
        float score = Similarity.tf(freq)*weight; // compute score
  -     score *= Similarity.norm(norms[first.doc]); // normalize
  +     score *= Similarity.decodeNorm(norms[first.doc]); // normalize
        results.collect(first.doc, score);        // add to results
         }
         last.next();                             // resume scanning
  
  
  
  1.2       +69 -20    jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java
  
  Index: Similarity.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- Similarity.java   18 Sep 2001 16:29:58 -0000      1.1
  +++ Similarity.java   29 Jul 2002 19:11:15 -0000      1.2
  @@ -56,6 +56,7 @@
   
   import java.io.IOException;
   import org.apache.lucene.index.Term;
  +import org.apache.lucene.document.Field;
   
   /** Internal class used for scoring.
    * <p>Public only so that the indexing code can compute and store the
  @@ -63,32 +64,80 @@
   public final class Similarity {
     private Similarity() {}                      // no public constructor
   
  -  /** Computes the normalization byte for a document given the total number of
  -   * terms contained in the document.  These values are stored in an index and
  -   * used by the search code. */
  -  public static final byte norm(int numTerms) {
  -    // Scales 1/sqrt(numTerms) into a byte, i.e. 256/sqrt(numTerms).
  -    // Math.ceil is used to ensure that even very long documents don't get a
  -    // zero norm byte, as that is reserved for zero-lengthed documents and
  -    // deleted documents.
  -    return (byte) Math.ceil(255.0 / Math.sqrt(numTerms));
  +  static final float[] NORM_TABLE = new float[256];
  +
  +  static {
  +    for (int i = 0; i < 256; i++)
  +      NORM_TABLE[i] = byteToFloat((byte)i);
     }
   
  +  /** Computes the normalization value for a document given the total number of
  +   * terms contained in a field.  These values are stored in an index and used
  +   * by the search code.
  +   *
  +   * <p>The formula used is: <code>1.0f / Math.sqrt(numTerms)</code>
  +   *
  +   * @see Field#setBoost(float)
  +   */
  +  public static float normalizeLength(int numTerms) {
  +    return (float)(1.0 / Math.sqrt(numTerms));
  +  }
  +  
  +  /** Decodes a normalization factor stored in an index.
  +   * @see #encodeNorm(float)
  +   */
  +  public static float decodeNorm(byte b) {
  +    return NORM_TABLE[b & 0xFF];
  +  }
   
  -  private static final float[] makeNormTable() {
  -    float[] result = new float[256];
  -    for (int i = 0; i < 256; i++)
  -      result[i] = i / 255.0F;
  -    return result;
  +  /** Encodes a normalization factor for storage in an index.  
  +   *
  +   * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
  +   * representing values from around 7x10^9 to 2x10^-9 with about one
  +   * significant decimal digit of accuracy.  Zero is also represented.
  +   * Negative numbers are rounded up to zero.  Values too large to represent
  +   * are rounded down to the largest representable value.  Positive values too
  +   * small to represent are rounded up to the smallest positive representable
  +   * value.
  +   *
  +   * @see Field#setBoost(float)
  +   */
  +  public static byte encodeNorm(float f) {
  +    return floatToByte(f);
     }
   
  -  static final float[] NORM_TABLE = makeNormTable();
  -    
  -  static final float norm(byte normByte) {
  -    // Un-scales from the byte encoding of a norm into a float, i.e.,
  -    // approximately 1/sqrt(numTerms).
  -    return NORM_TABLE[normByte & 0xFF];
  +  private static float byteToFloat(byte b) {
  +    if (b == 0)                                   // zero is a special case
  +      return 0.0f;
  +    int mantissa = b & 7;
  +    int exponent = (b >> 3) & 31;
  +    int bits = ((exponent+(63-15)) << 24) | (mantissa << 21);
  +    return Float.intBitsToFloat(bits);
     }
  +   
  +  private static byte floatToByte(float f) {
  +    if (f < 0.0f)                                 // round negatives up to zero
  +      f = 0.0f;
  +
  +    if (f == 0.0f)                                // zero is a special case
  +      return 0;
  +
  +    int bits = Float.floatToIntBits(f);           // parse float into parts
  +    int mantissa = (bits & 0xffffff) >> 21;
  +    int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
  +
  +    if (exponent > 31) {                          // overflow: use max value
  +      exponent = 31;
  +      mantissa = 7;
  +    }
  +
  +    if (exponent < 1) {                           // underflow: use min value
  +      exponent = 1;
  +      mantissa = 0;
  +    }
  +
  +    return (byte)((exponent << 3) | mantissa);    // pack into a byte
  +   }
   
     static final float tf(int freq) {
       return (float)Math.sqrt(freq);
  
  
  
  1.2       +1 -1      jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java
  
  Index: TermScorer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TermScorer.java   18 Sep 2001 16:29:58 -0000      1.1
  +++ TermScorer.java   29 Jul 2002 19:11:15 -0000      1.2
  @@ -98,7 +98,7 @@
         ? scoreCache[f]                          // cache hit
         : Similarity.tf(f)*weight;               // cache miss
   
  -      score *= Similarity.norm(norms[d]);      // normalize for field
  +      score *= Similarity.decodeNorm(norms[d]);        // normalize for field
   
         c.collect(d, score);                     // collect score
   
  
  
  
  1.1                  
jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java
  
  Index: TestDocBoost.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.Hits;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.analysis.SimpleAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  import junit.framework.TestCase;
  
   /** Document boost unit test.
    *
    * <p>Indexes four single-term documents whose effective boosts (document
    * boost multiplied by field boost) are 1, 2, 3 and 4, and checks that a
    * term query scores them in strictly increasing order.
    *
    * @author Doug Cutting
    * @version $Revision: 1.1 $
    */
  public class TestDocBoost extends TestCase {
    public TestDocBoost(String name) {
      super(name);
    }

    /** Builds a four-document in-memory index, searches it for the single
     * indexed term, and asserts that each document scores strictly higher
     * than the one added before it.
     *
     * @throws Exception if indexing or searching fails
     */
    public static void test() throws Exception {
      RAMDirectory store = new RAMDirectory();
      IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
      try {
        Field f1 = Field.Text("field", "word");
        Field f2 = Field.Text("field", "word");
        f2.setBoost(2.0f);

        Document d1 = new Document();
        Document d2 = new Document();
        Document d3 = new Document();
        Document d4 = new Document();
        d3.setBoost(3.0f);
        d4.setBoost(2.0f);

        d1.add(f1);                               // boost = 1
        d2.add(f2);                               // boost = 2
        d3.add(f1);                               // boost = 3
        d4.add(f2);                               // boost = 4 (2 doc * 2 field)

        writer.addDocument(d1);
        writer.addDocument(d2);
        writer.addDocument(d3);
        writer.addDocument(d4);
        writer.optimize();
      } finally {
        writer.close();                           // release even on failure
      }

      // Scores are recorded by document number, which follows insertion order.
      final float[] scores = new float[4];

      IndexSearcher searcher = new IndexSearcher(store);
      try {
        searcher.search
          (new TermQuery(new Term("field", "word")),
           new HitCollector() {
             public final void collect(int doc, float score) {
               scores[doc] = score;
             }
           });
      } finally {
        searcher.close();                         // do not leak the reader
      }

      float lastScore = 0.0f;

      for (int i = 0; i < 4; i++) {
        assertTrue("score of doc " + i + " (" + scores[i]
                   + ") should exceed previous score (" + lastScore + ")",
                   scores[i] > lastScore);
        lastScore = scores[i];
      }
    }
  }


--
To unsubscribe, e-mail:   <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>

cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestDocBoost.java

Reply via email to