I implemented a FieldCache-based implementation of faceted browsing for
fields that are not tokenized (integer, sint, string, etc.) and not
multiValued, so there is a maximum of one token per document.
Before optimization: 10.03 seconds (filtercache not large enough to
hold all possible vals)
After optimization: 0.02 seconds (500 times faster!)
Cool eh?
The base query matched 166,000 documents out of 25 million.
I faceted on the author field, which I assume has many distinct values,
though I don't have exact numbers.
This isn't the last word on faceted optimizations, there are other
common cases that may be optimized, but don't expect results anywhere
near this again ;-)
Here are the changes I'll commit shortly unless there are objections.
(Sorry about the spurious whitespace-only changes... I think it must be
trailing whitespace that my IDE stripped off or something... I'll try to
fix that in the future.)
Index: src/java/org/apache/solr/schema/FieldType.java
===================================================================
--- src/java/org/apache/solr/schema/FieldType.java (revision 443015)
+++ src/java/org/apache/solr/schema/FieldType.java (working copy)
@@ -57,7 +57,7 @@
int properties;
/** Returns true if fields of this type should be tokenized */
- protected boolean isTokenized() {
+ public boolean isTokenized() {
return (properties & TOKENIZED) != 0;
}
Index: src/java/org/apache/solr/request/SimpleFacets.java
===================================================================
--- src/java/org/apache/solr/request/SimpleFacets.java (revision 443015)
+++ src/java/org/apache/solr/request/SimpleFacets.java (working copy)
@@ -20,23 +20,20 @@
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.request.SolrParams;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.request.SolrQueryResponse;
-import org.apache.solr.request.DefaultSolrParams;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.BoolField;
import org.apache.solr.search.*;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.BoundedTreeSet;
import java.io.IOException;
import java.util.*;
-import java.util.logging.Level;
/**
* A class that generates simple Facet information for a request.
@@ -52,15 +49,16 @@
protected SolrParams params;
/** Searcher to use for all calculations */
protected SolrIndexSearcher searcher;
-
- public SimpleFacets(SolrIndexSearcher searcher,
- DocSet docs,
+
+
+ public SimpleFacets(SolrIndexSearcher searcher,
+ DocSet docs,
SolrParams params) {
this.searcher = searcher;
this.docs = docs;
this.params = params;
}
-
+
/**
* Looks at various Params to determing if any simple Facet Constraint count
* computations are desired.
@@ -73,7 +71,7 @@
public NamedList getFacetCounts() {
// if someone called this method, benefit of the doubt: assume true
- if (!params.getBool(params.FACET,true))
+ if (!params.getBool(params.FACET,true))
return null;
NamedList res = new NamedList();
@@ -82,7 +80,7 @@
res.add("facet_queries", getFacetQueryCounts());
res.add("facet_fields", getFacetFieldCounts());
-
+
} catch (Exception e) {
SolrException.logOnce(SolrCore.log, "Exception during facet counts", e);
res.add("exception", SolrException.toStr(e));
@@ -97,7 +95,7 @@
* @see SolrParams#FACET_QUERY
*/
public NamedList getFacetQueryCounts() throws IOException,ParseException {
-
+
NamedList res = new NamedList();
/* Ignore SolrParams.DF - could have init param facet.query assuming
@@ -106,7 +104,7 @@
* explicit.
*/
SolrQueryParser qp = new SolrQueryParser(searcher.getSchema(),null);
-
+
String[] facetQs = params.getParams(SolrParams.FACET_QUERY);
if (null != facetQs && 0 != facetQs.length) {
for (String q : facetQs) {
@@ -125,22 +123,33 @@
* @see #getFacetFieldMissingCount
* @see #getFacetTermEnumCounts
*/
- public NamedList getFacetFieldCounts()
- throws IOException {
-
+ public NamedList getFacetFieldCounts()
+ throws IOException {
+
NamedList res = new NamedList();
String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
- if (null != facetFs && 0 != facetFs.length) {
-
+ if (null != facetFs) {
for (String f : facetFs) {
+ NamedList counts=null;
+ SchemaField sf = searcher.getSchema().getField(f);
+ FieldType ft = sf.getType();
- NamedList counts = getFacetTermEnumCounts(f);
-
- if (params.getFieldBool(f, params.FACET_MISSING, false))
- counts.add(null, getFacetFieldMissingCount(f));
-
+ int limit = params.getFieldInt(f, params.FACET_LIMIT, 100);
+ boolean zeros = params.getFieldBool(f, params.FACET_ZEROS, true);
+ boolean missing = params.getFieldBool(f, params.FACET_MISSING, false);
+
+
+ if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
+ counts = getFacetTermEnumCounts(f,limit,zeros);
+ if (missing) {
+ counts.add(null, getFacetFieldMissingCount(f));
+ }
+ } else {
+ // TODO: future logic could use filters instead of the fieldcache if
+ // the number of terms in the field is small enough.
+ counts = getFieldCacheCounts(f, limit, zeros, missing);
+ }
res.add(f, counts);
-
}
}
return res;
@@ -160,6 +169,63 @@
return docs.andNotSize(hasVal);
}
+
+ NamedList getFieldCacheCounts(String fieldName, int limit, boolean
zeros, boolean missing) throws IOException {
+ // TODO: If the number of terms is high compared to docs.size(),
and zeros==false,
+ // we should use an alternate strategy to avoid
+ // 1) creating another huge int[] for the counts
+ // 2) looping over that huge int[] looking for the rare non-zeros.
+ //
+ // Yet another variation: if docs.size() is small and termvectors
are stored,
+ // then use them instead of the FieldCache.
+ //
+
+ FieldCache.StringIndex si =
FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+ int[] count = new int[si.lookup.length];
+ DocIterator iter = docs.iterator();
+ while (iter.hasNext()) {
+ count[si.order[iter.nextDoc()]]++;
+ }
+
+ FieldType ft = searcher.getSchema().getFieldType(fieldName);
+ NamedList res = new NamedList();
+
+ // IDEA: we could also maintain a count of "other"... everything
that fell outside
+ // of the top 'N'
+
+ BoundedTreeSet<CountPair<String,Integer>> queue=null;
+
+ if (limit>=0) {
+ // TODO: compare performance of BoundedTreeSet compare against
priority queue?
+ queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
+ }
+
+ int min=-1; // the smallest value in the top 'N' values
+ for (int i=1; i<count.length; i++) {
+ int c = count[i];
+ if (c==0 && !zeros) continue;
+ if (limit<0) {
+ res.add(ft.indexedToReadable(si.lookup[i]), c);
+ } else if (c>min) {
+ // NOTE: we use c>min rather than c>=min as an optimization
because we are going in
+ // index order, so we already know that the keys are ordered.
This can be very
+ // important if a lot of the counts are repeated (like zero
counts would be).
+ queue.add(new
CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
+ if (queue.size()>=limit) min=queue.last().val;
+ }
+ }
+
+ if (limit>=0) {
+ for (CountPair<String,Integer> p : queue) {
+ res.add(p.key, p.val);
+ }
+ }
+
+
+ if (missing) res.add(null, count[0]);
+ return res;
+ }
+
/**
* Returns a list of terms in the specified field along with the
* corrisponding count of documents in the set that match that constraint.
@@ -167,50 +233,46 @@
* @see SolrParams#FACET_LIMIT
* @see SolrParams#FACET_ZEROS
*/
- public NamedList getFacetTermEnumCounts(String fieldName)
+ public NamedList getFacetTermEnumCounts(String fieldName, int
limit, boolean zeros)
throws IOException {
-
+
/* :TODO: potential optimization...
- * cache the Terms with the highest docFreq and try them first
- * don't enum if we get our max from them
- */
-
+ * cache the Terms with the highest docFreq and try them first
+ * don't enum if we get our max from them
+ */
+
IndexSchema schema = searcher.getSchema();
IndexReader r = searcher.getReader();
FieldType ft = schema.getFieldType(fieldName);
- Set<CountPair<String,Integer>> counts
+ Set<CountPair<String,Integer>> counts
= new HashSet<CountPair<String,Integer>>();
- int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
if (0 <= limit) {
counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
}
- boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
-
TermEnum te = r.terms(new Term(fieldName,""));
do {
Term t = te.term();
- if (null == t || ! t.field().equals(fieldName))
+ if (null == t || ! t.field().equals(fieldName))
break;
if (0 < te.docFreq()) { /* all docs may be deleted */
int count = searcher.numDocs(new TermQuery(t),
docs);
- /* :TODO: is indexedToReadable correct? */
- if (zeros || 0 < count)
+ if (zeros || 0 < count)
counts.add(new CountPair<String,Integer>
- (ft.indexedToReadable(t.text()), count));
+ (t.text(), count));
}
} while (te.next());
NamedList res = new NamedList();
for (CountPair<String,Integer> p : counts) {
- res.add(p.key, p.val);
+ res.add(ft.indexedToReadable(p.key), p.val);
}
return res;
}
@@ -220,7 +282,7 @@
* <b>higher</b> vals come before lower vals.
* In case of tie vals, then <b>lower</b> keys come before higher keys.
*/
- public static class CountPair<K extends Comparable<? super K>, V
extends Comparable<? super V>>
+ public static class CountPair<K extends Comparable<? super K>, V
extends Comparable<? super V>>
implements Comparable<CountPair<K,V>> {
public CountPair(K k, V v) {
@@ -232,7 +294,7 @@
return key.hashCode() ^ val.hashCode();
}
public boolean equals(Object o) {
- return (o instanceof CountPair)
+ return (o instanceof CountPair)
&& (0 == this.compareTo((CountPair<K,V>) o));
}
public int compareTo(CountPair<K,V> o) {
Property changes on: src/java/org/apache/solr/request/SimpleFacets.java
___________________________________________________________________
Name: svn:eol-style
+ native
-Yonik