DCausse has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/328644 )
Change subject: Workaround timeout limitation ...................................................................... Workaround timeout limitation Timeout checks are passive with Lucene, it means that the timeout is only checked when a doc matches or when we switch to a new segment. The problem is that the timeout can be very inaccurate when scanning a large segment where no docs match. Workaround this issue by a horrible hack: - inspect elastic SearchContect.current() to determine if a timeout is used - Build a fake collector to be able to throw TimeExceededException (private constructor) This hack will be certainly hard to port to future elastic version (esp. if the SearchContext becomes hard to access at this stage) Bug: T152895 Change-Id: I1ce0378e475bf5c122cb8586916722992f82bed3 --- M src/main/java/org/wikimedia/search/extra/regex/AcceleratedSourceRegexQuery.java M src/main/java/org/wikimedia/search/extra/regex/UnacceleratedSourceRegexQuery.java M src/test/java/org/wikimedia/search/extra/regex/SourceRegexQueryTest.java 3 files changed, 68 insertions(+), 11 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/extra refs/changes/44/328644/1 diff --git a/src/main/java/org/wikimedia/search/extra/regex/AcceleratedSourceRegexQuery.java b/src/main/java/org/wikimedia/search/extra/regex/AcceleratedSourceRegexQuery.java index 097b2ad..aae2e77 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/AcceleratedSourceRegexQuery.java +++ b/src/main/java/org/wikimedia/search/extra/regex/AcceleratedSourceRegexQuery.java @@ -44,6 +44,7 @@ // TODO: Get rid of this shared mutable state, we should be able to use // the generic timeout system. private final MutableValueInt inspected = new MutableValueInt(); + private final TimeoutChecker timeoutChecker = new TimeoutChecker(); @Override public Scorer scorer(final LeafReaderContext context) throws IOException { @@ -51,7 +52,8 @@ if(approxScorer == null) { return null; } - return new ConstantScoreScorer(this, 1f, new RegexTwoPhaseIterator(approxScorer.iterator(), context, inspected)); + timeoutChecker.nextSegment(context); + return new ConstantScoreScorer(this, 1f, new RegexTwoPhaseIterator(approxScorer.iterator(), context, inspected, timeoutChecker)); } }; } diff --git a/src/main/java/org/wikimedia/search/extra/regex/UnacceleratedSourceRegexQuery.java b/src/main/java/org/wikimedia/search/extra/regex/UnacceleratedSourceRegexQuery.java index 0dee8aa..7968a3b 100644 --- a/src/main/java/org/wikimedia/search/extra/regex/UnacceleratedSourceRegexQuery.java +++ b/src/main/java/org/wikimedia/search/extra/regex/UnacceleratedSourceRegexQuery.java @@ -5,15 +5,10 @@ import lombok.EqualsAndHashCode; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.ConstantScoreWeight; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TwoPhaseIterator; -import org.apache.lucene.search.Weight; +import org.apache.lucene.search.*; import org.apache.lucene.util.mutable.MutableValueInt; +import org.elasticsearch.search.SearchService; +import org.elasticsearch.search.internal.SearchContext; import org.wikimedia.search.extra.regex.SourceRegexQuery.Rechecker; import org.wikimedia.search.extra.regex.SourceRegexQuery.Settings; import org.wikimedia.search.extra.util.FieldValues; @@ -56,9 +51,11 @@ // TODO: Get rid of this shared mutable state, we should be able to use // the generic timeout system. private final MutableValueInt inspected = new MutableValueInt(); + private final TimeoutChecker timeoutChecker = new TimeoutChecker(); @Override public Scorer scorer(final LeafReaderContext context) throws IOException { + timeoutChecker.nextSegment(context); // We can stop matching early if we are allowed to inspect less // doc than the number of docs available in this segment. // This is because we use a DocIdSetIterator.all. @@ -66,25 +63,29 @@ if(remaining < 0) { remaining = 0; } + context.reader().getLiveDocs(); int maxDoc = remaining > context.reader().maxDoc() ? context.reader().maxDoc() : remaining; final DocIdSetIterator approximation = DocIdSetIterator.all(maxDoc); - return new ConstantScoreScorer(this, 1f, new RegexTwoPhaseIterator(approximation, context, inspected)); + return new ConstantScoreScorer(this, 1f, new RegexTwoPhaseIterator(approximation, context, inspected, timeoutChecker)); } }; } protected class RegexTwoPhaseIterator extends TwoPhaseIterator { private final LeafReaderContext context; + private final TimeoutChecker timeoutChecker; private MutableValueInt inspected; - protected RegexTwoPhaseIterator(DocIdSetIterator approximation, LeafReaderContext context, MutableValueInt inspected) { + protected RegexTwoPhaseIterator(DocIdSetIterator approximation, LeafReaderContext context, MutableValueInt inspected, TimeoutChecker timeoutChecker) { super(approximation); this.context = context; this.inspected = inspected; + this.timeoutChecker = timeoutChecker; } @Override public boolean matches() throws IOException { + timeoutChecker.check(approximation.docID()); if (inspected.value >= settings.getMaxInspect()) { return false; } @@ -105,4 +106,36 @@ } } + /** + * Horrible hack to workaround the fact that TimeLimitingCollector.TimeExceededException has a private ctor + * FIXME: find proper solutions to handle timeouts + */ + protected static class TimeoutChecker { + private LeafCollector collector; + private final Collector topCollector; + public TimeoutChecker() throws IOException { + long timeout = SearchContext.current().timeoutInMillis(); + if(timeout != SearchService.NO_TIMEOUT.millis()) { + topCollector = new TimeLimitingCollector(new NullCollector(), SearchContext.current().timeEstimateCounter(), timeout); + } else { + topCollector = new NullCollector(); + } + } + public void check(int docId) throws IOException { + collector.collect(docId); + } + + public void nextSegment(LeafReaderContext context) throws IOException { + collector = topCollector.getLeafCollector(context); + } + + private static class NullCollector extends SimpleCollector { + @Override + public void collect(int doc) throws IOException {} + @Override + public boolean needsScores() { + return true; + } + } + } } diff --git a/src/test/java/org/wikimedia/search/extra/regex/SourceRegexQueryTest.java b/src/test/java/org/wikimedia/search/extra/regex/SourceRegexQueryTest.java index 5ae46c9..283572a 100644 --- a/src/test/java/org/wikimedia/search/extra/regex/SourceRegexQueryTest.java +++ b/src/test/java/org/wikimedia/search/extra/regex/SourceRegexQueryTest.java @@ -20,6 +20,7 @@ import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.rest.RestStatus; +import org.junit.Assert; import org.junit.Test; import org.wikimedia.search.extra.AbstractPluginIntegrationTest; @@ -155,6 +156,27 @@ } @Test + public void testTimeout() throws InterruptedException, ExecutionException, + IOException { + setup(); + indexRandom(false, doc("findmefound0", "findmefound")); + for(int i = 1; i < 10000; i++) { + indexRandom(false, doc("findmenotfound"+i, "findmenotfound ")); + indexRandom(false, doc("findmefound"+i, "findmenot")); + } + client().admin().indices().prepareForceMerge("test").setMaxNumSegments(1).get(); + refresh(); + + // warmup + SearchResponse resp = search(filter("findme")).get(); + Assert.assertTrue(resp.getHits().getTotalHits() > 0); + + resp = search(filter("(((f.){1,20}n.){1,20}m.){1,20}")).setSize(10).setTimeout("500ms").get(); + Assert.assertTrue(resp.getHits().getTotalHits() > 0); + Assert.assertTrue(resp.isTimedOut()); + } + + @Test public void caseInsensitiveMatching() throws InterruptedException, ExecutionException, IOException { setup(); indexRandom(true, doc("findme", "I have the test in me.")); -- To view, visit https://gerrit.wikimedia.org/r/328644 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I1ce0378e475bf5c122cb8586916722992f82bed3 Gerrit-PatchSet: 1 Gerrit-Project: search/extra Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits