This is an automated email from the ASF dual-hosted git repository. mkhl pushed a commit to branch branch_9x in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push: new 8171a5367c3 SOLR-16682: transfer MLT Component queries via {!bool} (#1408) 8171a5367c3 is described below commit 8171a5367c3aae579660d5f4eb0c6cf3dc1ee175 Author: Mikhail Khludnev <mkhlud...@users.noreply.github.com> AuthorDate: Tue Feb 28 18:50:03 2023 +0300 SOLR-16682: transfer MLT Component queries via {!bool} (#1408) transfer MLT Component queries via {!bool} (#1260) Co-authored-by: David Smiley <dsmi...@apache.org> --- solr/CHANGES.txt | 3 + .../apache/solr/handler/MoreLikeThisHandler.java | 93 ++++++--------- .../handler/component/MoreLikeThisComponent.java | 126 +++++++++++++-------- .../component/DistributedMLTComponentTest.java | 28 ++++- 4 files changed, 142 insertions(+), 108 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e708dff7561..c8855eb016f 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -201,6 +201,9 @@ Bug Fixes * SOLR-16679: Fix solr.jetty.ssl.verifyClientHostName logging (Kevin Risden) +* SOLR-16682: MoreLikeThis Component fails with SyntaxError: Cannot parse if document terms contains symbols from query parser syntax + (Mikhail Khludnev) + Build --------------------- * Upgrade forbiddenapis to 3.4 (Uwe Schindler) diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java index 2192aa9cabe..47ca8e3108b 100644 --- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java @@ -52,7 +52,6 @@ import org.apache.solr.handler.component.ResponseBuilder; import org.apache.solr.request.SimpleFacets; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; -import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocList; @@ -130,8 +129,6 @@ public class MoreLikeThisHandler extends RequestHandlerBase { // Hold on to the interesting terms if relevant TermStyle termStyle = TermStyle.get(params.get(MoreLikeThisParams.INTERESTING_TERMS)); - List<InterestingTerm> interesting = - (termStyle == TermStyle.NONE) ? null : new ArrayList<>(mlt.mlt.getMaxQueryTerms()); DocListAndSet mltDocs = null; @@ -159,7 +156,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase { // Find documents MoreLikeThis - either with a reader or a query // -------------------------------------------------------------------------------- if (reader != null) { - mltDocs = mlt.getMoreLikeThis(reader, start, rows, filters, interesting, flags); + mltDocs = mlt.getMoreLikeThis(reader, start, rows, filters, flags); } else if (q != null) { // Matching options boolean includeMatch = params.getBool(MoreLikeThisParams.MATCH_INCLUDE, true); @@ -177,7 +174,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase { if (iterator.hasNext()) { // do a MoreLikeThis query for each document in results int id = iterator.nextDoc(); - mltDocs = mlt.getMoreLikeThis(id, start, rows, filters, interesting, flags); + mltDocs = mlt.getMoreLikeThis(id, start, rows, filters, flags); } } else { throw new SolrException( @@ -195,7 +192,9 @@ public class MoreLikeThisHandler extends RequestHandlerBase { } rsp.addResponse(mltDocs.docList); - if (interesting != null) { + if (termStyle != TermStyle.NONE) { + final List<InterestingTerm> interesting = + mlt.getInterestingTerms(mlt.getBoostedMLTQuery(), mlt.mlt.getMaxQueryTerms()); if (termStyle == TermStyle.DETAILS) { NamedList<Float> it = new NamedList<>(); for (InterestingTerm t : interesting) { @@ -351,14 +350,14 @@ public class MoreLikeThisHandler extends RequestHandlerBase { } private Query rawMLTQuery; - private Query boostedMLTQuery; + private BooleanQuery boostedMLTQuery; private BooleanQuery realMLTQuery; public Query getRawMLTQuery() { return rawMLTQuery; } - public Query getBoostedMLTQuery() { + public BooleanQuery getBoostedMLTQuery() { return boostedMLTQuery; } @@ -366,7 +365,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase { return realMLTQuery; } - private Query getBoostedQuery(Query mltquery) { + private BooleanQuery getBoostedQuery(Query mltquery) { BooleanQuery boostedQuery = (BooleanQuery) mltquery; if (boostFields.size() > 0) { BooleanQuery.Builder newQ = new BooleanQuery.Builder(); @@ -392,18 +391,13 @@ public class MoreLikeThisHandler extends RequestHandlerBase { } public DocListAndSet getMoreLikeThis( - int id, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags) - throws IOException { + int id, int start, int rows, List<Query> filters, int flags) throws IOException { Document doc = reader.document(id); - rawMLTQuery = mlt.like(id); - boostedMLTQuery = getBoostedQuery(rawMLTQuery); - if (terms != null) { - fillInterestingTermsFromMLTQuery(boostedMLTQuery, terms); - } + final Query boostedQuery = getBoostedMLTQuery(id); // exclude current document from results BooleanQuery.Builder realMLTQuery = new BooleanQuery.Builder(); - realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST); + realMLTQuery.add(boostedQuery, BooleanClause.Occur.MUST); realMLTQuery.add( new TermQuery( new Term( @@ -423,14 +417,15 @@ public class MoreLikeThisHandler extends RequestHandlerBase { return results; } + /** Sets {@link #boostedMLTQuery} and returns it */ + public BooleanQuery getBoostedMLTQuery(int docNum) throws IOException { + rawMLTQuery = mlt.like(docNum); + boostedMLTQuery = getBoostedQuery(rawMLTQuery); + return boostedMLTQuery; + } + public DocListAndSet getMoreLikeThis( - Reader reader, - int start, - int rows, - List<Query> filters, - List<InterestingTerm> terms, - int flags) - throws IOException { + Reader reader, int start, int rows, List<Query> filters, int flags) throws IOException { // SOLR-5351: if only check against a single field, use the reader directly. Otherwise we // repeat the stream's content for multiple fields so that query terms can be pulled from any // of those fields. @@ -450,14 +445,9 @@ public class MoreLikeThisHandler extends RequestHandlerBase { for (String field : fields) { multifieldDoc.put(field, streamValue); } - rawMLTQuery = mlt.like(multifieldDoc); } - boostedMLTQuery = getBoostedQuery(rawMLTQuery); - if (terms != null) { - fillInterestingTermsFromMLTQuery(boostedMLTQuery, terms); - } DocListAndSet results = new DocListAndSet(); if (this.needDocSet) { results = searcher.getDocListAndSet(boostedMLTQuery, filters, null, start, rows, flags); @@ -466,37 +456,19 @@ public class MoreLikeThisHandler extends RequestHandlerBase { } return results; } - - public NamedList<BooleanQuery> getMoreLikeTheseQuery(DocList docs) throws IOException { - IndexSchema schema = searcher.getSchema(); - NamedList<BooleanQuery> result = new NamedList<>(); - DocIterator iterator = docs.iterator(); - while (iterator.hasNext()) { - int id = iterator.nextDoc(); - String uniqueId = schema.printableUniqueKey(reader.document(id)); - - BooleanQuery mltquery = (BooleanQuery) mlt.like(id); - if (mltquery.clauses().size() == 0) { - return result; - } - mltquery = (BooleanQuery) getBoostedQuery(mltquery); - - // exclude current document from results - BooleanQuery.Builder mltQuery = new BooleanQuery.Builder(); - mltQuery.add(mltquery, BooleanClause.Occur.MUST); - - mltQuery.add( - new TermQuery(new Term(uniqueKeyField.getName(), uniqueId)), - BooleanClause.Occur.MUST_NOT); - result.add(uniqueId, mltQuery.build()); - } - - return result; - } - - private void fillInterestingTermsFromMLTQuery(Query query, List<InterestingTerm> terms) { - Collection<BooleanClause> clauses = ((BooleanQuery) query).clauses(); + /** + * Yields terms with boosts from the boosted MLT query. + * + * @param maxTerms how many terms to return, a negative value means all terms are returned + */ + public List<InterestingTerm> getInterestingTerms(BooleanQuery boostedMLTQuery, int maxTerms) { + assert boostedMLTQuery != null : "strictly expecting it's set"; + Collection<BooleanClause> clauses = boostedMLTQuery.clauses(); + List<InterestingTerm> output = new ArrayList<>(maxTerms < 0 ? clauses.size() : maxTerms); for (BooleanClause o : clauses) { + if (maxTerms > -1 && output.size() >= maxTerms) { + break; + } Query q = o.getQuery(); float boost = 1f; if (q instanceof BoostQuery) { @@ -507,10 +479,11 @@ public class MoreLikeThisHandler extends RequestHandlerBase { InterestingTerm it = new InterestingTerm(); it.boost = boost; it.term = ((TermQuery) q).getTerm(); - terms.add(it); + output.add(it); } // alternatively we could use // mltquery.extractTerms( terms ); + return output; } public MoreLikeThis getMoreLikeThis() { diff --git a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java index 0f220e0d99a..e272c646116 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/MoreLikeThisComponent.java @@ -23,13 +23,16 @@ import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; -import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; -import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CommonParams; @@ -95,24 +98,24 @@ public class MoreLikeThisComponent extends SearchComponent { rb.rsp.add("moreLikeThis", new NamedList<DocList>()); return; } - MoreLikeThisHandler.MoreLikeThisHelper mlt = new MoreLikeThisHandler.MoreLikeThisHelper(params, searcher); - - NamedList<BooleanQuery> bQuery = mlt.getMoreLikeTheseQuery(rb.getResults().docList); - - NamedList<String> temp = new NamedList<>(); - Iterator<Entry<String, BooleanQuery>> idToQueryIt = bQuery.iterator(); - - while (idToQueryIt.hasNext()) { - Entry<String, BooleanQuery> idToQuery = idToQueryIt.next(); - String s = idToQuery.getValue().toString(); - - log.debug("MLT Query:{}", s); - temp.add(idToQuery.getKey(), idToQuery.getValue().toString()); + NamedList<NamedList<?>> mltQueryByDocKey = new NamedList<>(); + for (DocIterator results = rb.getResults().docList.iterator(); results.hasNext(); ) { + int docId = results.nextDoc(); + final List<MoreLikeThisHandler.InterestingTerm> interestingTerms = + mlt.getInterestingTerms(mlt.getBoostedMLTQuery(docId), -1); + if (interestingTerms.isEmpty()) { + continue; + } + final String uniqueKey = rb.req.getSchema().getUniqueKeyField().getName(); + final Document document = rb.req.getSearcher().doc(docId); + final String uniqueVal = rb.req.getSchema().printableUniqueKey(document); + final NamedList<String> mltQ = + mltViaQueryParams(rb.req.getSchema(), interestingTerms, uniqueKey, uniqueVal); + mltQueryByDocKey.add(uniqueVal, mltQ); } - - rb.rsp.add("moreLikeThis", temp); + rb.rsp.add("moreLikeThis", mltQueryByDocKey); } else { NamedList<DocList> sim = getMoreLikeThese(rb, rb.req.getSearcher(), rb.getResults().docList, flags); @@ -127,6 +130,53 @@ public class MoreLikeThisComponent extends SearchComponent { } } + private static NamedList<String> mltViaQueryParams( + IndexSchema schema, + List<MoreLikeThisHandler.InterestingTerm> terms, + String uniqueField, + String uniqueVal) { + final NamedList<String> mltQ = new NamedList<>(); + StringBuilder q = new StringBuilder("{!bool"); + q.append(" must_not=$"); + int cnt = 0; + String param = "mltq" + (cnt++); + q.append(param); + mltQ.add(param, "{!field f=" + uniqueField + "}" + uniqueVal); + final StringBuilder reuseStr = new StringBuilder(); + final CharsRefBuilder reuseChar = new CharsRefBuilder(); + for (MoreLikeThisHandler.InterestingTerm term : terms) { + param = "mltq" + (cnt++); + q.append(" should=$"); + q.append(param); + mltQ.add(param, toParserParam(schema, term.term, term.boost, reuseStr, reuseChar)); + } + q.append("}"); + mltQ.add(CommonParams.Q, q.toString()); + return mltQ; + } + + private static String toParserParam( + IndexSchema schema, + Term term1, + float boost, + StringBuilder reuseStr, + CharsRefBuilder reuseChar) { + reuseStr.setLength(0); + if (boost != 1f) { + reuseStr.append("{!boost b="); + reuseStr.append(boost); + reuseStr.append("}"); + } + final String field = term1.field(); + final CharsRef val = + schema.getField(field).getType().indexedToReadable(term1.bytes(), reuseChar); + reuseStr.append("{!term f="); + reuseStr.append(ClientUtils.encodeLocalParamVal(field)); + reuseStr.append("}"); + reuseStr.append(val); + return reuseStr.toString(); + } + @Override public void handleResponses(ResponseBuilder rb, ShardRequest sreq) { if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0 @@ -139,17 +189,18 @@ public class MoreLikeThisComponent extends SearchComponent { // This should only happen in case of using shards.tolerant=true. Omit this ShardResponse continue; } - NamedList<?> moreLikeThisReponse = - (NamedList<?>) r.getSolrResponse().getResponse().get("moreLikeThis"); + @SuppressWarnings("unchecked") + NamedList<NamedList<String>> moreLikeThisReponse = + (NamedList<NamedList<String>>) r.getSolrResponse().getResponse().get("moreLikeThis"); if (log.isDebugEnabled()) { log.debug("ShardRequest.response.shard: {}", r.getShard()); } if (moreLikeThisReponse != null) { - for (Entry<String, ?> entry : moreLikeThisReponse) { + for (Entry<String, NamedList<String>> entry : moreLikeThisReponse) { if (log.isDebugEnabled()) { log.debug("id: '{}' Query: '{}'", entry.getKey(), entry.getValue()); } - ShardRequest s = buildShardQuery(rb, (String) entry.getValue(), entry.getKey()); + ShardRequest s = buildShardQuery(rb, entry.getValue(), entry.getKey()); rb.addRequest(this, s); } } @@ -309,7 +360,7 @@ public class MoreLikeThisComponent extends SearchComponent { return result; } - ShardRequest buildShardQuery(ResponseBuilder rb, String q, String key) { + ShardRequest buildShardQuery(ResponseBuilder rb, NamedList<String> q, String key) { ShardRequest s = new ShardRequest(); s.params = new ModifiableSolrParams(rb.req.getParams()); s.purpose |= ShardRequest.PURPOSE_GET_MLT_RESULTS; @@ -337,24 +388,9 @@ public class MoreLikeThisComponent extends SearchComponent { s.params.set(CommonParams.FL, "score," + id); s.params.set(SORT, "score desc"); // MLT Query is submitted as normal query to shards. - s.params.set(CommonParams.Q, q); - - return s; - } - - ShardRequest buildMLTQuery(ResponseBuilder rb, String q) { - ShardRequest s = new ShardRequest(); - s.params = new ModifiableSolrParams(); - - s.params.set(CommonParams.START, 0); - - String id = rb.req.getSchema().getUniqueKeyField().getName(); - - s.params.set(CommonParams.FL, "score," + id); - // MLT Query is submitted as normal query to shards. - s.params.set(CommonParams.Q, q); + s.params.remove(CommonParams.Q); + q.forEach((k, v) -> s.params.add(k, v)); - s.shards = ShardRequest.ALL_SHARDS; return s; } @@ -375,12 +411,8 @@ public class MoreLikeThisComponent extends SearchComponent { SimpleOrderedMap<Object> interestingTermsResponse = null; MoreLikeThisParams.TermStyle interestingTermsConfig = MoreLikeThisParams.TermStyle.get(p.get(MoreLikeThisParams.INTERESTING_TERMS)); - List<MoreLikeThisHandler.InterestingTerm> interestingTerms = - (interestingTermsConfig == MoreLikeThisParams.TermStyle.NONE) - ? null - : new ArrayList<>(mltHelper.getMoreLikeThis().getMaxQueryTerms()); - if (interestingTerms != null) { + if (interestingTermsConfig != MoreLikeThisParams.TermStyle.NONE) { interestingTermsResponse = new SimpleOrderedMap<>(); } @@ -388,8 +420,7 @@ public class MoreLikeThisComponent extends SearchComponent { int id = iterator.nextDoc(); int rows = p.getInt(MoreLikeThisParams.DOC_COUNT, 5); - DocListAndSet similarDocuments = - mltHelper.getMoreLikeThis(id, 0, rows, null, interestingTerms, flags); + DocListAndSet similarDocuments = mltHelper.getMoreLikeThis(id, 0, rows, null, flags); String name = schema.printableUniqueKey(searcher.doc(id)); mltResponse.add(name, similarDocuments.docList); @@ -410,6 +441,9 @@ public class MoreLikeThisComponent extends SearchComponent { } if (interestingTermsResponse != null) { + List<MoreLikeThisHandler.InterestingTerm> interestingTerms = + mltHelper.getInterestingTerms( + mltHelper.getBoostedMLTQuery(), mltHelper.getMoreLikeThis().getMaxQueryTerms()); if (interestingTermsConfig == MoreLikeThisParams.TermStyle.DETAILS) { SimpleOrderedMap<Float> interestingTermsWithScore = new SimpleOrderedMap<>(); for (MoreLikeThisHandler.InterestingTerm interestingTerm : interestingTerms) { diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedMLTComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedMLTComponentTest.java index bf803e3b2b1..2d5de7e3bd3 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedMLTComponentTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedMLTComponentTest.java @@ -91,7 +91,7 @@ public class DistributedMLTComponentTest extends BaseDistributedSearchTestCase { id, "9", "lowerfilt", - "The quick red fox jumped over the lazy big and large brown dogs.", + "The quick red:fox jumped over the lazy big and large brown dogs.", "lowerfilt1", "x"); index(id, "10", "lowerfilt", "blue", "lowerfilt1", "x"); @@ -100,7 +100,7 @@ public class DistributedMLTComponentTest extends BaseDistributedSearchTestCase { id, "13", "lowerfilt", - "The quote red fox jumped over the lazy brown dogs.", + "The quote RED)FOX jumped over the lazy brown dogs.", "lowerfilt1", "y"); index( @@ -389,5 +389,29 @@ public class DistributedMLTComponentTest extends BaseDistributedSearchTestCase { Long actual = ((SolrDocumentList) entry.getValue()).getNumFound(); assertEquals("MLT mismatch for id=" + key, expected, actual); } + // test boost mlt.qf + query( + "q", + "lowerfilt:moon", + "fl", + id, + MoreLikeThisParams.MIN_TERM_FREQ, + 2, + MoreLikeThisParams.MIN_DOC_FREQ, + 1, + "sort", + "id_i1 desc", + "mlt", + "true", + "mlt.fl", + "lowerfilt1,lowerfilt", + "mlt.qf", + "lowerfilt1^1.2 lowerfilt^3.4", + "qt", + requestHandlerName, + "shards.qt", + requestHandlerName, + "mlt.count", + "20"); } }