I implemented the FieldCache implementation for faceted browsing on a
field that is not tokenized (integer, sint, string, etc) and not
multiValued, so there is a max of one token per document.

Before optimization:  10.03 seconds (filtercache not large enough to
hold all possible vals)
After optimization:  0.02 seconds   (500 times faster!)
Cool eh?

The base query matched 166,000 documents out of 25 million.
I faceted on the author field, which I assume has many unique values,
but I don't have exact numbers.


This isn't the last word on faceted optimizations, there are other
common cases that may be optimized, but don't expect results anywhere
near this again ;-)

Here are the changes I'll commit shortly unless there are objections
(sorry about the false-changes... I think it must be trailing
whitespace that my IDE stripped off or something... I'll try to fix
that for the future)

Index: src/java/org/apache/solr/schema/FieldType.java
===================================================================
--- src/java/org/apache/solr/schema/FieldType.java      (revision 443015)
+++ src/java/org/apache/solr/schema/FieldType.java      (working copy)
@@ -57,7 +57,7 @@
  int properties;

  /** Returns true if fields of this type should be tokenized */
-  protected boolean isTokenized() {
+  public boolean isTokenized() {
    return (properties & TOKENIZED) != 0;
  }

Index: src/java/org/apache/solr/request/SimpleFacets.java
===================================================================
--- src/java/org/apache/solr/request/SimpleFacets.java  (revision 443015)
+++ src/java/org/apache/solr/request/SimpleFacets.java  (working copy)
@@ -20,23 +20,20 @@
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.request.SolrParams;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.request.SolrQueryResponse;
-import org.apache.solr.request.DefaultSolrParams;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.BoolField;
import org.apache.solr.search.*;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.BoundedTreeSet;

import java.io.IOException;
import java.util.*;
-import java.util.logging.Level;

/**
 * A class that generates simple Facet information for a request.
@@ -52,15 +49,16 @@
  protected SolrParams params;
  /** Searcher to use for all calculations */
  protected SolrIndexSearcher searcher;
-
-  public SimpleFacets(SolrIndexSearcher searcher,
-                      DocSet docs,
+
+
+  public SimpleFacets(SolrIndexSearcher searcher,
+                      DocSet docs,
                      SolrParams params) {
    this.searcher = searcher;
    this.docs = docs;
    this.params = params;
  }
-
+
  /**
   * Looks at various Params to determing if any simple Facet Constraint count
   * computations are desired.
@@ -73,7 +71,7 @@
  public NamedList getFacetCounts() {

    // if someone called this method, benefit of the doubt: assume true
-    if (!params.getBool(params.FACET,true))
+    if (!params.getBool(params.FACET,true))
      return null;

    NamedList res = new NamedList();
@@ -82,7 +80,7 @@
      res.add("facet_queries", getFacetQueryCounts());

      res.add("facet_fields", getFacetFieldCounts());
-
+
    } catch (Exception e) {
      SolrException.logOnce(SolrCore.log, "Exception during facet counts", e);
      res.add("exception", SolrException.toStr(e));
@@ -97,7 +95,7 @@
   * @see SolrParams#FACET_QUERY
   */
  public NamedList getFacetQueryCounts() throws IOException,ParseException {
-
+
    NamedList res = new NamedList();

    /* Ignore SolrParams.DF - could have init param facet.query assuming
@@ -106,7 +104,7 @@
     * explicit.
     */
    SolrQueryParser qp = new SolrQueryParser(searcher.getSchema(),null);
-
+
    String[] facetQs = params.getParams(SolrParams.FACET_QUERY);
    if (null != facetQs && 0 != facetQs.length) {
      for (String q : facetQs) {
@@ -125,22 +123,33 @@
   * @see #getFacetFieldMissingCount
   * @see #getFacetTermEnumCounts
   */
-  public NamedList getFacetFieldCounts()
-    throws IOException {
-
+  public NamedList getFacetFieldCounts()
+          throws IOException {
+
    NamedList res = new NamedList();
    String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
-    if (null != facetFs && 0 != facetFs.length) {
-
+    if (null != facetFs) {
      for (String f : facetFs) {
+        NamedList counts=null;
+        SchemaField sf = searcher.getSchema().getField(f);
+        FieldType ft = sf.getType();

-        NamedList counts = getFacetTermEnumCounts(f);
-
-        if (params.getFieldBool(f, params.FACET_MISSING, false))
-          counts.add(null, getFacetFieldMissingCount(f));
-
+        int limit = params.getFieldInt(f, params.FACET_LIMIT, 100);
+        boolean zeros = params.getFieldBool(f, params.FACET_ZEROS, true);
+        boolean missing = params.getFieldBool(f, params.FACET_MISSING, false);
+
+
+        if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
+          counts = getFacetTermEnumCounts(f,limit,zeros);
+          if (missing) {
+            counts.add(null, getFacetFieldMissingCount(f));
+          }
+        } else {
+          // TODO: future logic could use filters instead of the fieldcache if
+          // the number of terms in the field is small enough.
+          counts = getFieldCacheCounts(f, limit, zeros, missing);
+        }
        res.add(f, counts);
-
      }
    }
    return res;
@@ -160,6 +169,63 @@
    return docs.andNotSize(hasVal);
  }

+
+  NamedList getFieldCacheCounts(String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
+    // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
+    //  we should use an alternate strategy to avoid
+    //  1) creating another huge int[] for the counts
+    //  2) looping over that huge int[] looking for the rare non-zeros.
+    //
+    // Yet another variation: if docs.size() is small and termvectors are stored,
+    // then use them instead of the FieldCache.
+    //
+
+    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+    int[] count = new int[si.lookup.length];
+    DocIterator iter = docs.iterator();
+    while (iter.hasNext()) {
+      count[si.order[iter.nextDoc()]]++;
+    }
+
+    FieldType ft = searcher.getSchema().getFieldType(fieldName);
+    NamedList res = new NamedList();
+
+    // IDEA: we could also maintain a count of "other"... everything that fell outside
+    // of the top 'N'
+
+    BoundedTreeSet<CountPair<String,Integer>> queue=null;
+
+    if (limit>=0) {
+      // TODO: compare performance of BoundedTreeSet against a priority queue?
+      queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
+    }
+
+    int min=-1;  // the smallest value in the top 'N' values
+    for (int i=1; i<count.length; i++) {
+      int c = count[i];
+      if (c==0 && !zeros) continue;
+      if (limit<0) {
+        res.add(ft.indexedToReadable(si.lookup[i]), c);
+      } else if (c>min) {
+        // NOTE: we use c>min rather than c>=min as an optimization because we are going in
+        // index order, so we already know that the keys are ordered.  This can be very
+        // important if a lot of the counts are repeated (like zero counts would be).
+        queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
+        if (queue.size()>=limit) min=queue.last().val;
+      }
+    }
+
+    if (limit>=0) {
+      for (CountPair<String,Integer> p : queue) {
+        res.add(p.key, p.val);
+      }
+    }
+
+
+    if (missing) res.add(null, count[0]);
+    return res;
+  }
+
  /**
   * Returns a list of terms in the specified field along with the
   * corrisponding count of documents in the set that match that constraint.
@@ -167,50 +233,46 @@
   * @see SolrParams#FACET_LIMIT
   * @see SolrParams#FACET_ZEROS
   */
-  public NamedList getFacetTermEnumCounts(String fieldName)
+  public NamedList getFacetTermEnumCounts(String fieldName, int limit, boolean zeros)
    throws IOException {
-
+
    /* :TODO: potential optimization...
-     * cache the Terms with the highest docFreq and try them first
-     * don't enum if we get our max from them
-     */
-
+    * cache the Terms with the highest docFreq and try them first
+    * don't enum if we get our max from them
+    */
+
    IndexSchema schema = searcher.getSchema();
    IndexReader r = searcher.getReader();
    FieldType ft = schema.getFieldType(fieldName);

-    Set<CountPair<String,Integer>> counts
+    Set<CountPair<String,Integer>> counts
      = new HashSet<CountPair<String,Integer>>();

-    int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
    if (0 <= limit) {
      counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
    }

-    boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
-
    TermEnum te = r.terms(new Term(fieldName,""));
    do {
      Term t = te.term();

-      if (null == t || ! t.field().equals(fieldName))
+      if (null == t || ! t.field().equals(fieldName))
        break;

      if (0 < te.docFreq()) { /* all docs may be deleted */
        int count = searcher.numDocs(new TermQuery(t),
                                     docs);

-        /* :TODO: is indexedToReadable correct? */
-        if (zeros || 0 < count)
+        if (zeros || 0 < count)
          counts.add(new CountPair<String,Integer>
-                     (ft.indexedToReadable(t.text()), count));
+                     (t.text(), count));

      }
    } while (te.next());

    NamedList res = new NamedList();
    for (CountPair<String,Integer> p : counts) {
-      res.add(p.key, p.val);
+      res.add(ft.indexedToReadable(p.key), p.val);
    }
    return res;
  }
@@ -220,7 +282,7 @@
   * <b>higher</b> vals come before lower vals.
   * In case of tie vals, then <b>lower</b> keys come before higher keys.
   */
-  public static class CountPair<K extends Comparable<? super K>, V extends Comparable<? super V>>
+  public static class CountPair<K extends Comparable<? super K>, V extends Comparable<? super V>>
    implements Comparable<CountPair<K,V>> {

    public CountPair(K k, V v) {
@@ -232,7 +294,7 @@
      return key.hashCode() ^ val.hashCode();
    }
    public boolean equals(Object o) {
-      return (o instanceof CountPair)
+      return (o instanceof CountPair)
        && (0 == this.compareTo((CountPair<K,V>) o));
    }
    public int compareTo(CountPair<K,V> o) {

Property changes on: src/java/org/apache/solr/request/SimpleFacets.java
___________________________________________________________________
Name: svn:eol-style
  + native



-Yonik

Reply via email to