jenkins-bot has submitted this change and it was merged. Change subject: Fix function_score and common_terms queries ......................................................................
Fix function_score and common_terms queries also add a the max_expanded_terms option and change its default from 100 to 1024. function_score is pretty much a passthrough for highlighting. common_terms queries are handled by only highlighting the _uncommon_ terms by default but the remove_high_freq_terms_from_common_terms option can be set to false to leave them in. Change-Id: I722f9a18eafb9f1c0af36ae03eaacbd227cbee32 --- M README.md M experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java M experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java M experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java M experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java M experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java M experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java R experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java A experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java M experimental-highlighter-lucene/pom.xml M experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java M experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java M experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java 13 files changed, 488 insertions(+), 56 deletions(-) Approvals: Jdouglas: Looks good to me, approved jenkins-bot: Verified diff --git a/README.md b/README.md index 8127a32..d1d1555 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,20 @@ * category will only be highlighted if there isn't a match in section_heading, redirect, or title. +The ```remove_high_freq_terms_from_common_terms``` option can be used to +highlight common terms when using the ```common_terms``` query. It defaults to +```true``` meaning common terms will not be highlighted. Setting it to +```false``` will highlight common terms in ```common_terms``` queries. Note +that this behavior was added in 1.3.1, 1.4.3, and 1.5.0 and before that common +terms were always highlighted by the ```common_terms``` query. + +The ```max_expanded_terms``` option can be used to control how many terms the +highlighter expands multi term queries into. The default is 1024 which is the +same as the ```fvh```'s default. Note that the highlighter doesn't need to +expand all multi term queries because it has special handling for many of them. +But when it does, this is how many terms it expands them into. This was added +in 1.3.1, 1.4.3, and 1.5.0 and before the value was hard coded to 100. + Offsets in postings or term vectors ----------------------------------- Since adding offsets to the postings (set ```index_options``` to ```offsets``` diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java index 91c18b6..075fb6d 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/ExperimentalHighlighter.java @@ -91,10 +91,15 @@ static class QueryCacheKey { private final Query query; + private final int maxExpandedTerms; private final boolean phraseAsTerms; - public QueryCacheKey(Query query, boolean phraseAsTerms) { + private final boolean removeHighFrequencyTermsFromCommonTerms; + + public QueryCacheKey(Query query, int maxExpandedTerms, boolean phraseAsTerms, boolean removeHighFrequencyTermsFromCommonTerms) { this.query = query; + this.maxExpandedTerms = maxExpandedTerms; this.phraseAsTerms = phraseAsTerms; + this.removeHighFrequencyTermsFromCommonTerms = removeHighFrequencyTermsFromCommonTerms; } @Override @@ -102,6 +107,8 @@ final int prime = 31; int result = 1; result = prime * result + (phraseAsTerms ? 1231 : 1237); + result = prime * result + maxExpandedTerms; + result = prime * result + (removeHighFrequencyTermsFromCommonTerms ? 1231 : 1237); result = prime * result + ((query == null) ? 0 : query.hashCode()); return result; } @@ -115,7 +122,11 @@ if (getClass() != obj.getClass()) return false; QueryCacheKey other = (QueryCacheKey) obj; + if (maxExpandedTerms != other.maxExpandedTerms) + return false; if (phraseAsTerms != other.phraseAsTerms) + return false; + if (removeHighFrequencyTermsFromCommonTerms != other.removeHighFrequencyTermsFromCommonTerms) return false; if (query == null) { if (other.query != null) @@ -231,7 +242,13 @@ } return context.field.fieldOptions().options().get(key); } - + + <T> T getOption(String key, T defaultValue) { + @SuppressWarnings("unchecked") + T value = (T) getOption(key); + return value == null ? defaultValue : value; + } + boolean scoreMatters() { return scoreMatters; } @@ -240,13 +257,11 @@ if (weigher != null) { return; } - boolean phraseAsTerms = false; + boolean phraseAsTerms = getOption("phrase_as_terms", false); + boolean removeHighFrequencyTermsFromCommonTerms = getOption("remove_high_freq_terms_from_common_terms", true); + int maxExpandedTerms = getOption("max_expanded_terms", 1024); // TODO simplify - Boolean phraseAsTermsOption = (Boolean) getOption("phrase_as_terms"); - if (phraseAsTermsOption != null) { - phraseAsTerms = phraseAsTermsOption; - } - QueryCacheKey key = new QueryCacheKey(context.query.originalQuery(), phraseAsTerms); + QueryCacheKey key = new QueryCacheKey(context.query.originalQuery(), maxExpandedTerms, phraseAsTerms, removeHighFrequencyTermsFromCommonTerms); weigher = cache.queryWeighers.get(key); if (weigher != null) { return; @@ -257,13 +272,13 @@ BigArrays.NON_RECYCLING_INSTANCE); // context.context.addReleasable(infos); weigher = new BasicQueryWeigher( - new ElasticsearchQueryFlattener(100, phraseAsTerms), infos, + new ElasticsearchQueryFlattener(maxExpandedTerms, phraseAsTerms, removeHighFrequencyTermsFromCommonTerms), infos, context.hitContext.topLevelReader(), context.query.originalQuery()); // Build the QueryWeigher with the top level reader to get all // the frequency information cache.queryWeighers.put(key, weigher); } - + /** * Builds the hit enum including any required wrappers. */ @@ -307,13 +322,13 @@ } Boolean caseInsensitiveOption = (Boolean) getOption("regex_case_insensitive"); boolean caseInsensitive = caseInsensitiveOption == null ? false : caseInsensitiveOption; - + List<HitEnum> hitEnums = new ArrayList<>(); List<String> fieldValues = defaultField.getFieldValues(); if (fieldValues.size() == 0) { return hitEnums; } - + for (String regex : getRegexes()) { if (luceneRegex) { if (caseInsensitive) { diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java index 925adef..c7d778b 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/elasticsearch/search/highlight/FieldWrapper.java @@ -28,6 +28,7 @@ import org.wikimedia.search.highlighter.experimental.Segmenter; import org.wikimedia.search.highlighter.experimental.SourceExtracter; import org.wikimedia.search.highlighter.experimental.hit.ConcatHitEnum; +import org.wikimedia.search.highlighter.experimental.hit.EmptyHitEnum; import org.wikimedia.search.highlighter.experimental.hit.PositionBoostingHitEnumWrapper; import org.wikimedia.search.highlighter.experimental.hit.ReplayingHitEnum.HitEnumAndLength; import org.wikimedia.search.highlighter.experimental.hit.TermWeigher; @@ -55,7 +56,7 @@ /** * Build a wrapper around the default field in the context. */ - public FieldWrapper(HighlightExecutionContext executionContext, HighlighterContext context, + public FieldWrapper(HighlightExecutionContext executionContext, HighlighterContext context, BasicQueryWeigher weigher) { this.executionContext = executionContext; this.context = context; @@ -282,7 +283,10 @@ List<String> fieldValues = getFieldValues(); switch (fieldValues.size()) { case 0: - return buildTokenStreamHitEnum(analyzer, ""); + // If there isn't any data then we assume there can't be any hits. + // This is more right than building the token stream hit enum + // against empty string. + return EmptyHitEnum.INSTANCE; case 1: return buildTokenStreamHitEnum(analyzer, fieldValues.get(0)); default: @@ -350,7 +354,7 @@ } return new ConstantTermWeigher<BytesRef>(); } - + public int getPositionGap() { if (positionGap < 0) { if (context.mapper instanceof StringFieldMapper) { diff --git a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java index 3c42ef6..5513966 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java +++ b/experimental-highlighter-elasticsearch-plugin/src/main/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattener.java @@ -7,11 +7,20 @@ import org.apache.lucene.search.Query; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; import org.elasticsearch.common.lucene.search.XFilteredQuery; +import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery; +import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; import org.wikimedia.highlighter.experimental.lucene.QueryFlattener; public class ElasticsearchQueryFlattener extends QueryFlattener { - public ElasticsearchQueryFlattener(int maxMultiTermQueryTerms, boolean phraseAsTerms) { - super(maxMultiTermQueryTerms, phraseAsTerms); + /** + * Default configuration. + */ + public ElasticsearchQueryFlattener() { + super(); + } + + public ElasticsearchQueryFlattener(int maxMultiTermQueryTerms, boolean phraseAsTerms, boolean removeHighFrequencyTermsFromCommonTerms) { + super(maxMultiTermQueryTerms, phraseAsTerms, removeHighFrequencyTermsFromCommonTerms); } @Override @@ -23,6 +32,16 @@ } if (query instanceof MultiPhrasePrefixQuery) { flattenQuery((MultiPhrasePrefixQuery) query, pathBoost, sourceOverride, reader, + callback); + return true; + } + if (query instanceof FunctionScoreQuery) { + flattenQuery((FunctionScoreQuery) query, pathBoost, sourceOverride, reader, + callback); + return true; + } + if (query instanceof FiltersFunctionScoreQuery) { + flattenQuery((FiltersFunctionScoreQuery) query, pathBoost, sourceOverride, reader, callback); return true; } @@ -77,4 +96,18 @@ callback.endPhrase(query.getField(), query.getSlop(), boost); } } + + protected void flattenQuery(FunctionScoreQuery query, float pathBoost, + Object sourceOverride, IndexReader reader, Callback callback) { + if (query.getSubQuery() != null) { + flatten(query.getSubQuery(), pathBoost * query.getBoost(), sourceOverride, reader, callback); + } + } + + protected void flattenQuery(FiltersFunctionScoreQuery query, float pathBoost, + Object sourceOverride, IndexReader reader, Callback callback) { + if (query.getSubQuery() != null) { + flatten(query.getSubQuery(), pathBoost * query.getBoost(), sourceOverride, reader, callback); + } + } } diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java index b73cf0a..8ccc782 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/AbstractExperimentalHighlighterIntegrationTestBase.java @@ -51,6 +51,7 @@ throws IOException { XContentBuilder mapping = jsonBuilder().startObject(); mapping.startObject("test").startObject("properties"); + mapping.startObject("bar").field("type", "integer").endObject(); addField(mapping, "test", offsetsInPostings, fvhLikeTermVectors); addField(mapping, "test2", offsetsInPostings, fvhLikeTermVectors); mapping.startObject("foo").field("type").value("object").startObject("properties"); diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java index bc5ce5b..2d3ca22 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/ElasticsearchQueryFlattenerTest.java @@ -16,13 +16,22 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.automaton.Automaton; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; +import org.elasticsearch.common.lucene.search.function.FieldValueFactorFunction; +import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery; +import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery.FilterFunction; +import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; +import org.elasticsearch.common.lucene.search.function.ScoreFunction; import org.junit.Test; import org.mockito.ArgumentCaptor; import org.wikimedia.highlighter.experimental.lucene.QueryFlattener.Callback; public class ElasticsearchQueryFlattenerTest { + private final Term bar = new Term("foo", "bar"); + private final ScoreFunction scoreFunction = new FieldValueFactorFunction("foo", 1, FieldValueFactorFunction.Modifier.LN, null); + @Test public void phrasePrefixQueryPhraseAsPhrase() { phrasePrefixQueryTestCase(false); @@ -31,6 +40,21 @@ @Test public void phrasePrefixQueryPhraseAsTerms() { phrasePrefixQueryTestCase(true); + } + + @Test + public void functionScoreQuery() { + Callback callback = mock(Callback.class); + new ElasticsearchQueryFlattener().flatten(new FunctionScoreQuery(new TermQuery(bar), scoreFunction), null, callback); + verify(callback).flattened(bar.bytes(), 1f, null); + } + + @Test + public void filtersFunctionScoreQuery() { + Callback callback = mock(Callback.class); + Query query = new FiltersFunctionScoreQuery(new TermQuery(bar), null, new FilterFunction[] {}, 1); + new ElasticsearchQueryFlattener().flatten(query, null, callback); + verify(callback).flattened(bar.bytes(), 1f, null); } private void phrasePrefixQueryTestCase(boolean phraseAsTerms) { @@ -45,7 +69,7 @@ query.add(new Term[] { bar, anoth }); Callback callback = mock(Callback.class); - new ElasticsearchQueryFlattener(1, phraseAsTerms).flatten(query, null, callback); + new ElasticsearchQueryFlattener(1, phraseAsTerms, true).flatten(query, null, callback); // The first positions are sent as terms verify(callback).flattened(foo.bytes(), phraseAsTerms ? 1f : 0, null); diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java index 59235c8..37f3506 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/BasicQueriesTest.java @@ -1,8 +1,9 @@ package org.wikimedia.highlighter.experimental.elasticsearch.integration; +import static org.elasticsearch.index.query.FilterBuilders.termFilter; import static org.elasticsearch.index.query.QueryBuilders.boolQuery; +import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery; import static org.elasticsearch.index.query.QueryBuilders.fuzzyQuery; -import static org.elasticsearch.index.query.QueryBuilders.queryString; import static org.elasticsearch.index.query.QueryBuilders.rangeQuery; import static org.elasticsearch.index.query.QueryBuilders.regexpQuery; import static org.elasticsearch.index.query.QueryBuilders.spanFirstQuery; @@ -12,6 +13,7 @@ import static org.elasticsearch.index.query.QueryBuilders.spanTermQuery; import static org.elasticsearch.index.query.QueryBuilders.termQuery; import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery; +import static org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders.fieldValueFactorFunction; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; import static org.hamcrest.Matchers.equalTo; @@ -182,4 +184,30 @@ equalTo("<em>tests</em> very simple <em>test</em>")); } } + + @Test + public void functionScoreQueryWithoutFilter() throws IOException { + buildIndex(); + client().prepareIndex("test", "test", "1").setSource("test", "test", "bar", 2).get(); + refresh(); + + SearchRequestBuilder search = testSearch(functionScoreQuery(termQuery("test", "test")).add(fieldValueFactorFunction("bar"))); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("<em>test</em>")); + } + } + + @Test + public void functionScoreQueryWithFilter() throws IOException { + buildIndex(); + client().prepareIndex("test", "test", "1").setSource("test", "test", "bar", 2).get(); + refresh(); + + SearchRequestBuilder search = testSearch(functionScoreQuery(termQuery("test", "test")).add(termFilter("test", "test"), fieldValueFactorFunction("bar"))); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("<em>test</em>")); + } + } } diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscelaneousTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java similarity index 91% rename from experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscelaneousTest.java rename to experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java index 8545fa9..92c74bc 100644 --- a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscelaneousTest.java +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MiscellaneousTest.java @@ -1,10 +1,14 @@ package org.wikimedia.highlighter.experimental.elasticsearch.integration; +import static org.elasticsearch.index.query.FilterBuilders.idsFilter; import static org.elasticsearch.index.query.QueryBuilders.boolQuery; +import static org.elasticsearch.index.query.QueryBuilders.filteredQuery; import static org.elasticsearch.index.query.QueryBuilders.fuzzyQuery; import static org.elasticsearch.index.query.QueryBuilders.matchPhrasePrefixQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchQuery; import static org.elasticsearch.index.query.QueryBuilders.prefixQuery; import static org.elasticsearch.index.query.QueryBuilders.queryString; +import static org.elasticsearch.index.query.QueryBuilders.rangeQuery; import static org.elasticsearch.index.query.QueryBuilders.regexpQuery; import static org.elasticsearch.index.query.QueryBuilders.termQuery; import static org.elasticsearch.index.query.QueryBuilders.wildcardQuery; @@ -19,8 +23,10 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; import java.util.concurrent.ExecutionException; import org.elasticsearch.action.admin.indices.optimize.OptimizeResponse; @@ -43,7 +49,7 @@ /** * Miscelaneous integration test that don't really have a good home. */ -public class MiscelaneousTest extends AbstractExperimentalHighlighterIntegrationTestBase { +public class MiscellaneousTest extends AbstractExperimentalHighlighterIntegrationTestBase { @Test public void mixOfAutomataAndNotQueries() throws IOException { buildIndex(); @@ -340,6 +346,42 @@ watch.stop(); } + /** + * There was a time when highlighting * would blow up because of _size being an empty numeric field. + */ + @Test + public void highlightStar() throws IOException { + buildIndex(); + indexTestData(); + + SearchResponse response = client().prepareSearch("test").setTypes("test") + .setQuery(matchQuery("_all", "very")).setHighlighterType("experimental") + .addHighlightedField("*").get(); + assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em> simple test")); + } + + /** + * max_expanded_terms should control how many terms we expand multi term + * queries into when we expand multi term queries. + */ + @Test + public void singleRangeQueryWithSmallRewrites() throws IOException { + buildIndex(true, true, 1); + client().prepareIndex("test", "test", "2").setSource("test", "test").get(); + indexTestData(); + + Map<String, Object> options = new HashMap<String, Object>(); + options.put("max_expanded_terms", 1); + SearchRequestBuilder search = testSearch(filteredQuery(rangeQuery("test").from("teso").to("tesz"), idsFilter("test").addIds("1"))) + .setHighlighterOptions(options); + for (String hitSource : HIT_SOURCES) { + options.put("hit_source", hitSource); + SearchResponse response = search.get(); + assertHighlight(response, 0, "test", 0, + equalTo("tests very simple <em>test</em>")); + } + } + // TODO matched_fields with different hit source - // TODO infer proper hit source + // TODO infer proper hit source } diff --git a/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java new file mode 100644 index 0000000..bbc83e5 --- /dev/null +++ b/experimental-highlighter-elasticsearch-plugin/src/test/java/org/wikimedia/highlighter/experimental/elasticsearch/integration/MultimatchTest.java @@ -0,0 +1,98 @@ +package org.wikimedia.highlighter.experimental.elasticsearch.integration; + +import static org.elasticsearch.index.query.FilterBuilders.idsFilter; +import static org.elasticsearch.index.query.QueryBuilders.filteredQuery; +import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; +import static org.hamcrest.Matchers.equalTo; + +import java.io.IOException; + +import org.elasticsearch.action.search.SearchRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.index.query.MultiMatchQueryBuilder; +import org.junit.Test; +import org.wikimedia.highlighter.experimental.elasticsearch.AbstractExperimentalHighlighterIntegrationTestBase; + +public class MultimatchTest extends AbstractExperimentalHighlighterIntegrationTestBase { + @Test + public void multiMatch() throws IOException { + buildIndex(); + indexTestData(); + + SearchRequestBuilder search = testSearch(multiMatchQuery("test", "test")); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("tests very simple <em>test</em>")); + } + } + + @Test + public void multiMatchCutoffAllLow() throws IOException { + buildIndex(true, true, 1); + indexTestData(); + + SearchRequestBuilder search = testSearch(multiMatchQuery("very test", "test").cutoffFrequency(1f)); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em> simple <em>test</em>")); + } + } + + @Test + public void multiMatchCutoffHighAndLow() throws IOException { + buildIndex(true, true, 1); + client().prepareIndex("test", "test", "2").setSource("test", "test").get(); + indexTestData(); + + SearchRequestBuilder search = testSearch(multiMatchQuery("very test", "test").cutoffFrequency(1f)); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em> simple test")); + } + } + + @Test + public void multiMatchPhraseCutoffHighAndLow() throws IOException { + buildIndex(true, true, 1); + client().prepareIndex("test", "test", "2").setSource("test", "simple").get(); + indexTestData(); + + // Looks like phrase doesn't respect cutoff in multimatch + SearchRequestBuilder search = testSearch(multiMatchQuery("very simple", "test").type(MultiMatchQueryBuilder.Type.PHRASE) + .cutoffFrequency(1)); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em> <em>simple</em> test")); + } + } + + @Test + public void multiMatchPhrasePrefixCutoffHighAndLow() throws IOException { + buildIndex(true, true, 1); + client().prepareIndex("test", "test", "2").setSource("test", "simple").get(); + indexTestData(); + + // Looks like phrase doesn't respect cutoff in multimatch + SearchRequestBuilder search = testSearch(multiMatchQuery("very simple", "test").type(MultiMatchQueryBuilder.Type.PHRASE_PREFIX) + .cutoffFrequency(1)); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em> <em>simple</em> test")); + } + } + + @Test + public void multiMatchCutoffAllHigh() throws IOException { + buildIndex(true, true, 1); + client().prepareIndex("test", "test", "2").setSource("test", "very test").get(); + indexTestData(); + + SearchRequestBuilder search = testSearch(filteredQuery(multiMatchQuery("very test", "test").cutoffFrequency(1f), + idsFilter("test").addIds("1"))); + for (String hitSource : HIT_SOURCES) { + SearchResponse response = setHitSource(search, hitSource).get(); + assertHighlight(response, 0, "test", 0, equalTo("tests <em>very</em> simple <em>test</em>")); + } + } +} diff --git a/experimental-highlighter-lucene/pom.xml b/experimental-highlighter-lucene/pom.xml index 94fc1b1..bd89266 100644 --- a/experimental-highlighter-lucene/pom.xml +++ b/experimental-highlighter-lucene/pom.xml @@ -30,6 +30,11 @@ </dependency> <dependency> <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queries</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> <artifactId>lucene-memory</artifactId> <version>${lucene.version}</version> <scope>test</scope> diff --git a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java index f4f3db1..93ab1f1 100644 --- a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java +++ b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattener.java @@ -7,7 +7,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DisjunctionMaxQuery; @@ -44,16 +46,25 @@ private final Set<Object> sentAutomata = new HashSet<Object>(); private final int maxMultiTermQueryTerms; private final boolean phraseAsTerms; + private final boolean removeHighFrequencyTermsFromCommonTerms; - public QueryFlattener(int maxMultiTermQueryTerms, boolean phraseAsTerms) { + /** + * Default configuration. + */ + public QueryFlattener() { + this(1000, false, true); + } + + public QueryFlattener(int maxMultiTermQueryTerms, boolean phraseAsTerms, boolean removeHighFrequencyTermsFromCommonTerms) { this.maxMultiTermQueryTerms = maxMultiTermQueryTerms; this.phraseAsTerms = phraseAsTerms; + this.removeHighFrequencyTermsFromCommonTerms = removeHighFrequencyTermsFromCommonTerms; } public interface Callback { /** * Called once per query containing the term. - * + * * @param term the term * @param boost weight of the term * @param sourceOverride null if the source of the term is the query @@ -65,7 +76,7 @@ /** * Called with each new automaton. QueryFlattener makes an effort to * only let the first copy of any duplicate automata through. - * + * * @param automaton automaton from the query * @param boost weight of terms matchign the automaton * @param source hashcode of the source. Automata don't have a hashcode @@ -94,7 +105,7 @@ /** * Should phrase queries be returned as terms? - * + * * @return true mean skip startPhrase and endPhrase and give the terms in a * phrase the weight of the whole phrase */ @@ -130,19 +141,10 @@ flattenQuery((WildcardQuery) query, pathBoost, sourceOverride, reader, callback); } else if (query instanceof PrefixQuery) { flattenQuery((PrefixQuery) query, pathBoost, sourceOverride, reader, callback); + } else if (query instanceof CommonTermsQuery) { + flattenQuery((CommonTermsQuery) query, pathBoost, sourceOverride, reader, callback); } else if (!flattenUnknown(query, pathBoost, sourceOverride, reader, callback)) { - if (query instanceof MultiTermQuery) { - MultiTermQuery copy = (MultiTermQuery) query.clone(); - copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite( - maxMultiTermQueryTerms)); - query = copy; - } - Query newRewritten; - try { - newRewritten = query.rewrite(reader); - } catch (IOException e) { - throw new WrappedExceptionFromLucene(e); - } + Query newRewritten = rewriteQuery(query, pathBoost, sourceOverride, reader); if (newRewritten != query) { // only rewrite once and then flatten again - the rewritten // query could have a special treatment @@ -377,6 +379,71 @@ callback.flattened(automaton, boost, source.hashCode()); } + protected void flattenQuery(CommonTermsQuery query, float pathBoost, Object sourceOverride, + IndexReader reader, Callback callback) { + Query rewritten = rewriteQuery(query, pathBoost, sourceOverride, reader); + if (!removeHighFrequencyTermsFromCommonTerms) { + flatten(rewritten, pathBoost, sourceOverride, reader, callback); + return; + } + /* + * Try to figure out if the query was rewritten into a list of low and + * high frequency terms. If it was, remove the high frequency terms. + * + * Note that this only works if high frequency terms are set to + * Occur.SHOULD and low frequency terms are set to Occur.MUST. That is + * usually the way it is done though. + */ + if (!(rewritten instanceof BooleanQuery)) { + // Nope - its a term query or something more exotic + flatten(rewritten, pathBoost, sourceOverride, reader, callback); + } + BooleanQuery bq = (BooleanQuery) rewritten; + BooleanClause[] clauses = bq.getClauses(); + if (clauses.length != 2) { + // Nope - its just a list of terms. + flattenQuery(bq, pathBoost, sourceOverride, reader, callback); + return; + } + if (clauses[0].getOccur() != Occur.SHOULD || clauses[1].getOccur() != Occur.MUST) { + // Nope - just a two term query + flattenQuery(bq, pathBoost, sourceOverride, reader, callback); + return; + } + if (!(clauses[0].getQuery() instanceof BooleanQuery && clauses[1].getQuery() instanceof BooleanQuery)) { + // Nope - terms of the wrong type. not sure how that happened. + flattenQuery(bq, pathBoost, sourceOverride, reader, callback); + return; + } + BooleanQuery lowFrequency = (BooleanQuery) clauses[1].getQuery(); + flattenQuery(lowFrequency, pathBoost, sourceOverride, reader, callback); + } + + protected Query rewriteQuery(MultiTermQuery query, float pathBoost, Object sourceOverride, IndexReader reader) { + query = (MultiTermQuery) query.clone(); + query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite( + maxMultiTermQueryTerms)); + return rewritePreparedQuery(query, pathBoost, sourceOverride, reader); + } + + protected Query rewriteQuery(Query query, float pathBoost, Object sourceOverride, IndexReader reader) { + if (query instanceof MultiTermQuery) { + return rewriteQuery((MultiTermQuery) query, pathBoost, sourceOverride, reader); + } + return rewritePreparedQuery(query, pathBoost, sourceOverride, reader); + } + + /** + * Rewrites a query that's already ready for rewriting. + */ + protected Query rewritePreparedQuery(Query query, float pathBoost, Object sourceOverride, IndexReader reader) { + try { + return query.rewrite(reader); + } catch (IOException e) { + throw new WrappedExceptionFromLucene(e); + } + } + private static class FuzzyQueryInfo { private final String term; private final int maxEdits; diff --git a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java index 706a723..f539adf 100644 --- a/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java +++ b/experimental-highlighter-lucene/src/main/java/org/wikimedia/highlighter/experimental/lucene/hit/weight/BasicQueryWeigher.java @@ -37,7 +37,7 @@ private CompiledAutomaton acceptable; public BasicQueryWeigher(IndexReader reader, Query query) { - this(new QueryFlattener(1000, false), new HashMapTermInfos(), reader, query); + this(new QueryFlattener(1000, false, true), new HashMapTermInfos(), reader, query); } public BasicQueryWeigher(QueryFlattener flattener, final TermInfos termInfos, IndexReader reader, Query query) { diff --git a/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java b/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java index cad5462..3e3da49 100644 --- a/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java +++ b/experimental-highlighter-lucene/src/test/java/org/wikimedia/highlighter/experimental/lucene/QueryFlattenerTest.java @@ -1,7 +1,6 @@ package org.wikimedia.highlighter.experimental.lucene; import static org.hamcrest.Matchers.not; -import static org.junit.Assert.assertThat; import static org.mockito.Matchers.any; import static org.mockito.Matchers.anyFloat; import static org.mockito.Matchers.anyInt; @@ -15,9 +14,20 @@ import static org.mockito.Mockito.when; import static org.wikimedia.highlighter.experimental.lucene.LuceneMatchers.recognises; +import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; @@ -28,20 +38,27 @@ import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.Version; import org.apache.lucene.util.automaton.Automaton; import org.hamcrest.Matcher; import org.junit.Test; import org.mockito.ArgumentCaptor; import org.wikimedia.highlighter.experimental.lucene.QueryFlattener.Callback; -public class QueryFlattenerTest { +import com.google.common.collect.Lists; + +public class QueryFlattenerTest extends LuceneTestCase { + private final List<Closeable> toClose = new ArrayList<>(); private final Term bar = new Term("foo", "bar"); private final Term baz = new Term("foo", "baz"); @Test public void termQuery() { Callback callback = mock(Callback.class); - new QueryFlattener(1, false).flatten(new TermQuery(bar), null, callback); + new QueryFlattener().flatten(new TermQuery(bar), null, callback); verify(callback).flattened(bar.bytes(), 1f, null); } @@ -60,7 +77,7 @@ PhraseQuery q = new PhraseQuery(); q.add(bar); q.add(baz); - new QueryFlattener(1, phraseAsTerms).flatten(q, null, callback); + new QueryFlattener(1, phraseAsTerms, true).flatten(q, null, callback); verify(callback).flattened(bar.bytes(), phraseAsTerms ? 1f : 0, null); verify(callback).flattened(baz.bytes(), phraseAsTerms ? 1f : 0, null); if (phraseAsTerms) { @@ -82,7 +99,7 @@ BooleanQuery bq = new BooleanQuery(); bq.add(new BooleanClause(new TermQuery(bar), Occur.MUST)); bq.add(new BooleanClause(new TermQuery(baz), Occur.MUST_NOT)); - new QueryFlattener(1, false).flatten(bq, null, callback); + new QueryFlattener().flatten(bq, null, callback); verify(callback).flattened(bar.bytes(), 1f, null); verify(callback, never()).flattened(eq(baz.bytes()), anyFloat(), isNull(Query.class)); } @@ -92,53 +109,137 @@ Callback callback = mock(Callback.class); Query rewritten = mock(Query.class); when(rewritten.rewrite(null)).thenReturn(new TermQuery(bar)); - new QueryFlattener(1, false).flatten(rewritten, null, callback); + new QueryFlattener().flatten(rewritten, null, callback); verify(callback).flattened(bar.bytes(), 1f, rewritten); } @Test public void fuzzyQuery() { - flattenedToAutomatonThatMatches(new FuzzyQuery(bar), recognises(bar), recognises(baz), - recognises("barr"), recognises("bor"), not(recognises("barrrr"))); + flattenedToAutomatonThatMatches(new FuzzyQuery(bar), recognises(bar), recognises(baz), recognises("barr"), recognises("bor"), + not(recognises("barrrr"))); } @Test public void fuzzyQueryShorterThenPrefix() { Callback callback = mock(Callback.class); - new QueryFlattener(1, false).flatten(new FuzzyQuery(bar, 2, 100), null, callback); + new QueryFlattener().flatten(new FuzzyQuery(bar, 2, 100), null, callback); verify(callback).flattened(bar.bytes(), 1f, null); verify(callback, never()).flattened(any(Automaton.class), anyFloat(), anyInt()); } @Test public void regexpQuery() { - flattenedToAutomatonThatMatches(new RegexpQuery(new Term("test", "ba[zr]")), - recognises(bar), recognises(baz), not(recognises("barr"))); + flattenedToAutomatonThatMatches(new RegexpQuery(new Term("test", "ba[zr]")), recognises(bar), recognises(baz), + not(recognises("barr"))); } @Test public void wildcardQuery() { - flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test", "ba?")), - recognises(bar), recognises(baz), not(recognises("barr"))); + flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test", "ba?")), recognises(bar), recognises(baz), + not(recognises("barr"))); - flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test", "ba*")), - recognises(bar), recognises(baz), recognises("barr"), not(recognises("bor"))); + flattenedToAutomatonThatMatches(new WildcardQuery(new Term("test", "ba*")), recognises(bar), recognises(baz), recognises("barr"), + not(recognises("bor"))); } @Test public void prefixQuery() { - flattenedToAutomatonThatMatches(new PrefixQuery(new Term("test", "ba")), recognises(bar), - recognises(baz), recognises("barr"), not(recognises("bor"))); + flattenedToAutomatonThatMatches(new PrefixQuery(new Term("test", "ba")), recognises(bar), recognises(baz), recognises("barr"), + not(recognises("bor"))); + } + + @Test + public void commonTermsQueryNoRemove() { + IndexReader reader = readerWithTerms(bar, randomIntBetween(1, 20), baz, randomIntBetween(1, 20)); + Callback callback = mock(Callback.class); + CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, 10f); + q.add(bar); + q.add(baz); + new QueryFlattener(100, false, false).flatten(q, reader, callback); + verify(callback).flattened(bar.bytes(), 1f, null); + verify(callback).flattened(baz.bytes(), 1f, null); + } + + @Test + public void commonTermsQueryAllCommon() { + IndexReader reader = readerWithTerms(bar, randomIntBetween(11, 20), baz, randomIntBetween(11, 20)); + Callback callback = mock(Callback.class); + CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, 10f); + q.add(bar); + q.add(baz); + new QueryFlattener().flatten(q, reader, callback); + verify(callback).flattened(bar.bytes(), 1f, null); + verify(callback).flattened(baz.bytes(), 1f, null); + } + + @Test + public void commonTermsQueryAllUncommon() { + IndexReader reader = readerWithTerms(bar, randomIntBetween(1, 10), baz, randomIntBetween(1, 10)); + Callback callback = mock(Callback.class); + CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, 10f); + q.add(bar); + q.add(baz); + new QueryFlattener().flatten(q, reader, callback); + verify(callback).flattened(bar.bytes(), 1f, null); + verify(callback).flattened(baz.bytes(), 1f, null); + } + + @Test + public void commonTermsQueryOneUncommon() { + IndexReader reader = readerWithTerms(bar, randomIntBetween(1, 10), baz, randomIntBetween(11, 20)); + Callback callback = mock(Callback.class); + CommonTermsQuery q = new CommonTermsQuery(Occur.SHOULD, Occur.MUST, 10f); + q.add(bar); + q.add(baz); + new QueryFlattener().flatten(q, reader, callback); + verify(callback).flattened(bar.bytes(), 1f, null); + verify(callback, never()).flattened(eq(baz.bytes()), anyFloat(), any(Object.class)); } @SafeVarargs private final void flattenedToAutomatonThatMatches(Query query, Matcher<Automaton>... matchers) { Callback callback = mock(Callback.class); - new QueryFlattener(1, false).flatten(query, null, callback); + new QueryFlattener().flatten(query, null, callback); ArgumentCaptor<Automaton> a = ArgumentCaptor.forClass(Automaton.class); verify(callback).flattened(a.capture(), eq(1f), anyInt()); for (Matcher<Automaton> matcher : matchers) { assertThat(a.getValue(), matcher); } } + + private IndexReader readerWithTerms(Object... termsAndFreqs) { + try { + assertEquals("Expected an even number of terms and freqs", 0, termsAndFreqs.length % 2); + Directory dir = newDirectory(); + toClose.add(dir); + try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(Version.LUCENE_4_9, new KeywordAnalyzer()))) { + for (int i = 0; i < termsAndFreqs.length; i += 2) { + Term term = (Term) termsAndFreqs[i]; + int freq = ((Number) termsAndFreqs[i + 1]).intValue(); + for (int f = 0; f < freq; f++) { + writer.addDocument(Collections.singleton(new TextField(term.field(), term.text(), Field.Store.NO))); + } + } + } + IndexReader reader = DirectoryReader.open(dir); + toClose.add(reader); + return reader; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private int randomIntBetween(int min, int max) { + return TestUtil.nextInt(random(), min, max); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + + for (Closeable c : Lists.reverse(toClose)) { + c.close(); + } + toClose.clear(); + } } -- To view, visit https://gerrit.wikimedia.org/r/208127 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I722f9a18eafb9f1c0af36ae03eaacbd227cbee32 Gerrit-PatchSet: 1 Gerrit-Project: search/highlighter Gerrit-Branch: 1.3 Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Chad <ch...@wikimedia.org> Gerrit-Reviewer: Jdouglas <jdoug...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits