Manybubbles has submitted this change and it was merged. Change subject: Phrase optimization ......................................................................
Phrase optimization If the query is just phrases and the field contains no phrases then skip trying to highlight it. This works for matched_fields as well. Change-Id: Ieacf9abb3be9dd7ba1bab89ad754e8ba5cef27f6 --- M experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java M experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java M experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighterTest.java M experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java 4 files changed, 179 insertions(+), 82 deletions(-) Approvals: Chad: Looks good to me, but someone else must approve Manybubbles: Looks good to me, approved jenkins-bot: Verified diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java index 286a8a4..3fc545c 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java @@ -30,6 +30,7 @@ import org.wikimedia.search.highlighter.experimental.SnippetChooser; import org.wikimedia.search.highlighter.experimental.SnippetFormatter; import org.wikimedia.search.highlighter.experimental.SnippetWeigher; +import org.wikimedia.search.highlighter.experimental.hit.EmptyHitEnum; import org.wikimedia.search.highlighter.experimental.hit.MergingHitEnum; import org.wikimedia.search.highlighter.experimental.hit.OverlapMergingHitEnumWrapper; import org.wikimedia.search.highlighter.experimental.snippet.BasicScoreBasedSnippetChooser; @@ -184,6 +185,9 @@ private HitEnum buildHitFindingHitEnum() throws IOException { Set<String> matchedFields = context.field.fieldOptions().matchedFields(); if (matchedFields == null) { + if (!defaultField.canProduceHits()) { + return EmptyHitEnum.INSTANCE; + } return defaultField.buildHitEnum(); } List<HitEnum> toMerge = new ArrayList<HitEnum>(matchedFields.size()); @@ -195,9 +199,17 @@ } else { wrapper = new FieldWrapper(this, context, weigher, field); } - toMerge.add(wrapper.buildHitEnum()); + if (wrapper.canProduceHits()) { + toMerge.add(wrapper.buildHitEnum()); + } extraFields.add(wrapper); } + if (toMerge.size() == 0) { + return EmptyHitEnum.INSTANCE; + } + if (toMerge.size() == 1) { + return toMerge.get(0); + } return new MergingHitEnum(toMerge, HitEnum.LessThans.OFFSETS); } diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java index 911823a..84d57c6 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java @@ -29,9 +29,9 @@ import org.wikimedia.search.highlighter.experimental.SourceExtracter; import org.wikimedia.search.highlighter.experimental.hit.ConcatHitEnum; import org.wikimedia.search.highlighter.experimental.hit.PositionBoostingHitEnumWrapper; -import org.wikimedia.search.highlighter.experimental.hit.WeightFilteredHitEnumWrapper; import org.wikimedia.search.highlighter.experimental.hit.ReplayingHitEnum.HitEnumAndLength; import org.wikimedia.search.highlighter.experimental.hit.TermWeigher; +import org.wikimedia.search.highlighter.experimental.hit.WeightFilteredHitEnumWrapper; import org.wikimedia.search.highlighter.experimental.hit.weight.CachingTermWeigher; import org.wikimedia.search.highlighter.experimental.hit.weight.MultiplyingTermWeigher; import org.wikimedia.search.highlighter.experimental.snippet.MultiSegmenter; @@ -110,8 +110,6 @@ } public SourceExtracter<String> buildSourceExtracter() throws IOException { - // TODO loading source is expensive if you don't need it. Delay - // this. List<String> fieldValues = getFieldValues(); switch (fieldValues.size()) { case 0: @@ -156,6 +154,15 @@ } } + /** + * Can this field produce hits? This'll return false when there are no terms + * worth anything that aren't in phrases AND there aren't any phrases on + * this field. + */ + public boolean canProduceHits() { + return weigher.maxTermWeight() > 0 || weigher.areTherePhrasesOnField(context.fieldName); + } + public HitEnum buildHitEnum() throws IOException { HitEnum e = buildHitEnumForSource(); diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighterTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighterTest.java index 19a723d..0e357e9 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighterTest.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ExperimentalHighlighterTest.java @@ -25,6 +25,7 @@ import static org.hamcrest.Matchers.both; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.lessThan; import java.io.IOException; import java.util.ArrayList; @@ -108,6 +109,53 @@ } } + /** + * Makes sure we skip fields if the query is just phrases and none of them are on the field. + */ + @Test + public void phraseQueryNoPossiblePhrasesSpeed() throws IOException { + buildIndex(); + // Size has to be big enough to make it really expensive to highlight + // the field to exaggerate the issue + int size = 100000; + int iterations = 100; + StringBuilder b = new StringBuilder(5 * size); + for (int i = 0; i < size; i++) { + b.append("test "); + } + client().prepareIndex("test", "test", "1").setSource("test", b.toString(), "test2", "simple test").get(); + refresh(); + + SearchRequestBuilder search = setHitSource(testSearch(matchPhraseQuery("test2", "simple test")), "analyze"); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + SearchResponse response = search.get(); + assertNotHighlighted(response, 0, "test"); + } + long total = System.currentTimeMillis() - start; + // Without the optimization this runs about 7 seconds, with, about 1.8. + assertThat(total, lessThan(3000l)); + + search.addHighlightedField(new HighlightBuilder.Field("test2").matchedFields("test.english", "test.whitespace")); + start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + SearchResponse response = search.get(); + assertNotHighlighted(response, 0, "test"); + } + total = System.currentTimeMillis() - start; + // Without the optimization this runs about 7 seconds, with, about 1.8. + assertThat(total, lessThan(3000l)); + + search.addHighlightedField(new HighlightBuilder.Field("test2").matchedFields("test", "test2.whitespace")); + start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + assertNoFailures(search.get()); + } + total = System.currentTimeMillis() - start; + // Without the optimization this runs about 7 seconds, with, about 1.8. + assertThat(total, lessThan(3000l)); + } + @Test public void phraseWithSeparations() throws IOException { buildIndex(); diff --git a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java index 64da7e2..128a34d 100644 --- a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java +++ b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java @@ -16,7 +16,6 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.wikimedia.highlighter.experimental.lucene.QueryFlattener; -import org.wikimedia.highlighter.experimental.lucene.QueryFlattener.Callback; import org.wikimedia.search.highlighter.experimental.HitEnum; import org.wikimedia.search.highlighter.experimental.hit.PhraseHitEnumWrapper; import org.wikimedia.search.highlighter.experimental.hit.TermSourceFinder; @@ -32,6 +31,7 @@ private final List<AutomatonSourceInfo> automata = new ArrayList<AutomatonSourceInfo>(); private final List<BytesRef> terms = new ArrayList<BytesRef>(); private final TermInfos termInfos; + private final float maxTermWeight; private Map<String, List<PhraseInfo>> phrases; private CompiledAutomaton acceptable; @@ -41,83 +41,9 @@ public BasicQueryWeigher(QueryFlattener flattener, final TermInfos termInfos, IndexReader reader, Query query) { this.termInfos = termInfos; - flattener.flatten(query, reader, new Callback() { - private int[][] phrase; - private int phrasePosition; - private int phraseTerm; - - @Override - public void flattened(BytesRef term, float boost, Object rewritten) { - int source = rewritten == null ? term.hashCode() : rewritten.hashCode(); - SourceInfo info = termInfos.get(term); - if (info == null) { - info = new SourceInfo(); - info.source = source; - info.weight = boost; - termInfos.put(term, info); - terms.add(BytesRef.deepCopyOf(term)); - } else { - /* - * If both terms can't be traced back to the same source we - * declare that they are from a new source by merging the - * hashes. This might not be ideal, but it has the advantage - * of being consistent. - */ - if (info.source != source) { - info.source = source * 31 + source; - } - info.weight = Math.max(info.weight, boost); - } - if (phrase != null) { - phrase[phrasePosition][phraseTerm++] = info.source; - } - } - - @Override - public void flattened(Automaton automaton, float boost, int source) { - AutomatonSourceInfo info = new AutomatonSourceInfo(automaton); - // Automata don't have a hashcode so we always use the source - info.source = source; - info.weight = boost; - automata.add(info); - if (phrase != null) { - phrase[phrasePosition][phraseTerm++] = info.source; - } - } - - @Override - public void startPhrase(int termCount) { - phrase = new int[termCount][]; - phrasePosition = -1; - } - @Override - public void startPhrasePosition(int termCount) { - phrasePosition++; - phrase[phrasePosition] = new int[termCount]; - phraseTerm = 0; - } - @Override - public void endPhrasePosition() { - } - @Override - public void endPhrase(String field, int slop, float weight) { - List<PhraseInfo> phraseList; - if (phrases == null) { - phrases = new HashMap<String, List<PhraseInfo>>(); - phraseList = new ArrayList<PhraseInfo>(); - phrases.put(field, phraseList); - } else { - phraseList = phrases.get(field); - if (phraseList == null) { - phraseList = new ArrayList<PhraseInfo>(); - phrases.put(field, phraseList); - } - } - phraseList.add(new PhraseInfo(phrase, slop, weight)); - phrase = null; - } - }); - + FlattenerCallback callback = new FlattenerCallback(); + flattener.flatten(query, reader, callback); + maxTermWeight = callback.maxTermWeight; } public boolean singleTerm() { @@ -151,6 +77,27 @@ e = new PhraseHitEnumWrapper(e, phrase.phrase, phrase.weight, phrase.slop); } return e; + } + + /** + * The maximum weight of a single term without phrase queries. + */ + public float maxTermWeight() { + return maxTermWeight; + } + + /** + * Are there phrases on the provided field? + */ + public boolean areTherePhrasesOnField(String field) { + if (phrases == null) { + return false; + } + List<PhraseInfo> phraseList = phrases.get(field); + if (phraseList == null) { + return false; + } + return !phraseList.isEmpty(); } public CompiledAutomaton acceptableTerms() { @@ -267,4 +214,87 @@ return b.toString(); } } + + private class FlattenerCallback implements QueryFlattener.Callback { + private float maxTermWeight = 0; + private int[][] phrase; + private int phrasePosition; + private int phraseTerm; + + @Override + public void flattened(BytesRef term, float boost, Object rewritten) { + maxTermWeight = Math.max(maxTermWeight, boost); + int source = rewritten == null ? term.hashCode() : rewritten.hashCode(); + SourceInfo info = termInfos.get(term); + if (info == null) { + info = new SourceInfo(); + info.source = source; + info.weight = boost; + termInfos.put(term, info); + terms.add(BytesRef.deepCopyOf(term)); + } else { + /* + * If both terms can't be traced back to the same source we + * declare that they are from a new source by merging the + * hashes. This might not be ideal, but it has the advantage + * of being consistent. + */ + if (info.source != source) { + info.source = source * 31 + source; + } + info.weight = Math.max(info.weight, boost); + } + if (phrase != null) { + phrase[phrasePosition][phraseTerm++] = info.source; + } + } + + @Override + public void flattened(Automaton automaton, float boost, int source) { + maxTermWeight = Math.max(maxTermWeight, boost); + AutomatonSourceInfo info = new AutomatonSourceInfo(automaton); + // Automata don't have a hashcode so we always use the source + info.source = source; + info.weight = boost; + automata.add(info); + if (phrase != null) { + phrase[phrasePosition][phraseTerm++] = info.source; + } + } + + @Override + public void startPhrase(int termCount) { + phrase = new int[termCount][]; + phrasePosition = -1; + } + + @Override + public void startPhrasePosition(int termCount) { + phrasePosition++; + phrase[phrasePosition] = new int[termCount]; + phraseTerm = 0; + } + + @Override + public void endPhrasePosition() { + } + + @Override + public void endPhrase(String field, int slop, float weight) { + List<PhraseInfo> phraseList; + if (phrases == null) { + phrases = new HashMap<String, List<PhraseInfo>>(); + phraseList = new ArrayList<PhraseInfo>(); + phrases.put(field, phraseList); + } else { + phraseList = phrases.get(field); + if (phraseList == null) { + phraseList = new ArrayList<PhraseInfo>(); + phrases.put(field, phraseList); + } + } + phraseList.add(new PhraseInfo(phrase, slop, weight)); + phrase = null; + } + } } -- To view, visit https://gerrit.wikimedia.org/r/132964 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ieacf9abb3be9dd7ba1bab89ad754e8ba5cef27f6 Gerrit-PatchSet: 1 Gerrit-Project: search/highlighter Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: BryanDavis <bda...@wikimedia.org> Gerrit-Reviewer: Chad <ch...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits