jpountz commented on code in PR #13036: URL: https://github.com/apache/lucene/pull/13036#discussion_r1474129857
########## lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java: ########## @@ -962,6 +962,118 @@ public void testDisjunctionMatchesCount() throws IOException { dir.close(); } + public void testTwoClauseTermDisjunctionCountOptimization() throws Exception { + int largerTermCount = RandomNumbers.randomIntBetween(random(), 11, 100); + int smallerTermCount = RandomNumbers.randomIntBetween(random(), 1, (largerTermCount - 1) / 10); + + List<String[]> docContent = new ArrayList<>(largerTermCount + smallerTermCount); + + for (int i = 0; i < largerTermCount; i++) { + docContent.add(new String[] {"large"}); + } + + for (int i = 0; i < smallerTermCount; i++) { + docContent.add(new String[] {"small", "also small"}); + } + + try (Directory dir = newDirectory()) { + try (IndexWriter w = + new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) { + + for (String[] values : docContent) { + Document doc = new Document(); + for (String value : values) { + doc.add(new StringField("foo", value, Field.Store.NO)); + } + w.addDocument(doc); + } + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + final int[] countInvocations = new int[] {0}; + IndexSearcher countingIndexSearcher = + new IndexSearcher(reader) { + @Override + public int count(Query query) throws IOException { + countInvocations[0]++; + return super.count(query); + } + }; + + { + // Test no matches in either term + countInvocations[0] = 0; + BooleanQuery query = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("foo", "also no match")), BooleanClause.Occur.SHOULD) + .build(); + + assertEquals(0, countingIndexSearcher.count(query)); + assertEquals(3, countInvocations[0]); + } + { + // Test match in one term + countInvocations[0] = 0; + BooleanQuery query = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD) + .build(); + + assertEquals(smallerTermCount, countingIndexSearcher.count(query)); + assertEquals(3, countInvocations[0]); + } Review Comment: Can you do the same with clauses in the opposite order? ########## lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java: ########## @@ -962,6 +962,118 @@ public void testDisjunctionMatchesCount() throws IOException { dir.close(); } + public void testTwoClauseTermDisjunctionCountOptimization() throws Exception { + int largerTermCount = RandomNumbers.randomIntBetween(random(), 11, 100); + int smallerTermCount = RandomNumbers.randomIntBetween(random(), 1, (largerTermCount - 1) / 10); + + List<String[]> docContent = new ArrayList<>(largerTermCount + smallerTermCount); + + for (int i = 0; i < largerTermCount; i++) { + docContent.add(new String[] {"large"}); + } + + for (int i = 0; i < smallerTermCount; i++) { + docContent.add(new String[] {"small", "also small"}); + } + + try (Directory dir = newDirectory()) { + try (IndexWriter w = + new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) { + + for (String[] values : docContent) { + Document doc = new Document(); + for (String value : values) { + doc.add(new StringField("foo", value, Field.Store.NO)); + } + w.addDocument(doc); + } + w.forceMerge(1); + } + + try (IndexReader reader = DirectoryReader.open(dir)) { + final int[] countInvocations = new int[] {0}; + IndexSearcher countingIndexSearcher = + new IndexSearcher(reader) { + @Override + public int count(Query query) throws IOException { + countInvocations[0]++; + return super.count(query); + } + }; + + { + // Test no matches in either term + countInvocations[0] = 0; + BooleanQuery query = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("foo", "also no match")), BooleanClause.Occur.SHOULD) + .build(); + + assertEquals(0, countingIndexSearcher.count(query)); + assertEquals(3, countInvocations[0]); + } + { + // Test match in one term + countInvocations[0] = 0; + BooleanQuery query = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD) + .build(); + + assertEquals(smallerTermCount, countingIndexSearcher.count(query)); + assertEquals(3, countInvocations[0]); + } + { + // Test match in both terms that hits optimization threshold + countInvocations[0] = 0; + boolean smallFirst = randomBoolean(); Review Comment: Can you test both queries explicitly instead of relying on randomization? ########## lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java: ########## @@ -179,6 +180,36 @@ boolean isPureDisjunction() { return clauses.size() == getClauses(Occur.SHOULD).size() && minimumNumberShouldMatch <= 1; } + /** Whether this query is a two clause disjunction with two term query clauses. */ + boolean isTwoClausePureDisjunctionWithTerms() { + return clauses.size() == 2 + && isPureDisjunction() + && clauses.get(0).getQuery() instanceof TermQuery + && clauses.get(1).getQuery() instanceof TermQuery; + } + + /** + * Rewrite a single two clause disjunction query with terms to two term queries and a conjunction + * query using the inclusion–exclusion principle. + */ + Query[] rewriteTwoClauseDisjunctionWithTermsForCount(IndexSearcher indexSearcher) + throws IOException { + BooleanQuery.Builder newQuery = new BooleanQuery.Builder(); + Query[] queries = new Query[3]; + for (int i = 0; i < clauses.size(); i++) { + TermQuery termQuery = (TermQuery) clauses.get(i).getQuery(); + if (termQuery.getTermStates() == null) { Review Comment: Nit: Can you add a comment that this is here to help avoid redo the same terms dictionary lookup multiple times? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org