[GitHub] [lucene] jtibshirani commented on a change in pull request #736: LUCENE-10458: BoundedDocSetIdIterator may supply error count in Weigth#count(LeafReaderContext) when missingValue enables

GitBox Fri, 18 Mar 2022 15:13:33 -0700


jtibshirani commented on a change in pull request #736:
URL: https://github.com/apache/lucene/pull/736#discussion_r830381767




##########
File path: 
lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
##########
@@ -278,7 +287,21 @@ private BoundedDocSetIdIterator getDocIdSetIterator(
     }
 
     int lastDocIdExclusive = high + 1;
-    return new BoundedDocSetIdIterator(firstDocIdInclusive, 
lastDocIdExclusive, delegate);
+    Object missingValue = sortField.getMissingValue();

Review comment:
       Pinging again on this: what do you think of moving `reader.hasDeletions` 
up to the check inside `count` instead?

##########
File path: 
lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
##########
@@ -459,56 +468,122 @@ public void 
testIndexSortOptimizationDeactivated(RandomIndexWriter writer) throw
     reader.close();
   }
 
-  public void testCount() throws IOException {
-    Directory dir = newDirectory();
-    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
-    Sort indexSort = new Sort(new SortedNumericSortField("field", 
SortField.Type.LONG));
-    iwc.setIndexSort(indexSort);
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Document doc = new Document();
-    doc.add(new SortedNumericDocValuesField("field", 10));
-    writer.addDocument(doc);
-    IndexReader reader = writer.getReader();
-    IndexSearcher searcher = newSearcher(reader);
+  public void testCompareCount() throws IOException {
+    final int iters = atLeast(10);
+    for (int iter = 0; iter < iters; ++iter) {
+      Directory dir = newDirectory();
+      IndexWriterConfig iwc = new IndexWriterConfig(new 
MockAnalyzer(random()));
+      SortField sortField = new SortedNumericSortField("field", 
SortField.Type.LONG);
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =
+            random().nextBoolean()
+                ? Long.MIN_VALUE
+                : (random().nextBoolean() ? Long.MAX_VALUE : 
random().nextLong());
+        sortField.setMissingValue(missingValue);
+      }
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
 
-    Query fallbackQuery = LongPoint.newRangeQuery("field", 1, 42);
-    Query query = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 
42, fallbackQuery);
-    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
-    for (LeafReaderContext context : searcher.getLeafContexts()) {
-      assertEquals(1, weight.count(context));
+      final int numDocs = atLeast(100);
+      for (int i = 0; i < numDocs; ++i) {
+        Document doc = new Document();
+        final int numValues = TestUtil.nextInt(random(), 0, 1);
+        for (int j = 0; j < numValues; ++j) {
+          final long value = TestUtil.nextLong(random(), -100, 10000);
+          doc =
+              random().nextBoolean()
+                  ? createMissingValueDocument()
+                  : createSNDVAndPointDocument("field", value);
+        }
+        writer.addDocument(doc);
+      }
+
+      if (random().nextBoolean()) {
+        writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
+      }
+
+      final IndexReader reader = writer.getReader();
+      final IndexSearcher searcher = newSearcher(reader);
+      writer.close();
+
+      for (int i = 0; i < 100; ++i) {
+        final long min =
+            random().nextBoolean() ? Long.MIN_VALUE : 
TestUtil.nextLong(random(), -100, 10000);
+        final long max =
+            random().nextBoolean() ? Long.MAX_VALUE : 
TestUtil.nextLong(random(), -100, 10000);
+        final Query q1 = LongPoint.newRangeQuery("field", min, max);
+
+        final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max);
+        final Query q2 =
+            new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, 
fallbackQuery);
+        final Weight weight1 = q1.createWeight(searcher, ScoreMode.COMPLETE, 
1.0f);
+        final Weight weight2 = q2.createWeight(searcher, ScoreMode.COMPLETE, 
1.0f);
+        assertSameCount(weight1, weight2, searcher);
+      }
+
+      reader.close();
+      dir.close();
     }
+  }
 
-    writer.close();
-    reader.close();
-    dir.close();
+  private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher 
searcher)
+      throws IOException {
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(weight1.count(context), weight2.count(context));
+    }
   }
 
-  public void testFallbackCount() throws IOException {
+  public void testCountBoundary() throws IOException {
     Directory dir = newDirectory();
     IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
-    Sort indexSort = new Sort(new SortedNumericSortField("field", 
SortField.Type.LONG));
+    SortField sortField = new SortedNumericSortField("field", 
SortField.Type.LONG);
+    // true: lower
+    // false: upper
+    boolean lowerOrUpper = random().nextBoolean();

Review comment:
       Small comment, maybe `useLower` would be clearer here, then you wouldn't 
need the comment explanation.

##########
File path: 
lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
##########
@@ -59,7 +61,14 @@ public void testSameHitsAsPointRangeQuery() throws 
IOException {
       IndexWriterConfig iwc = new IndexWriterConfig(new 
MockAnalyzer(random()));
       boolean reverse = random().nextBoolean();
       SortField sortField = new SortedNumericSortField("dv", 
SortField.Type.LONG, reverse);
-      sortField.setMissingValue(random().nextLong());
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =

Review comment:
       I find this check a little hard to follow and the probabilities end up 
being surprising -- 1/2 for MIN_VALUE, 1/4 for MAX_VALUE, 1/4 a random long. 
Maybe we could do half the time a random long, half the time randomly choose 
between MIN_VALUE and MAX_VALUE.

##########
File path: 
lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
##########
@@ -459,56 +468,122 @@ public void 
testIndexSortOptimizationDeactivated(RandomIndexWriter writer) throw
     reader.close();
   }
 
-  public void testCount() throws IOException {
-    Directory dir = newDirectory();
-    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
-    Sort indexSort = new Sort(new SortedNumericSortField("field", 
SortField.Type.LONG));
-    iwc.setIndexSort(indexSort);
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Document doc = new Document();
-    doc.add(new SortedNumericDocValuesField("field", 10));
-    writer.addDocument(doc);
-    IndexReader reader = writer.getReader();
-    IndexSearcher searcher = newSearcher(reader);
+  public void testCompareCount() throws IOException {
+    final int iters = atLeast(10);
+    for (int iter = 0; iter < iters; ++iter) {
+      Directory dir = newDirectory();
+      IndexWriterConfig iwc = new IndexWriterConfig(new 
MockAnalyzer(random()));
+      SortField sortField = new SortedNumericSortField("field", 
SortField.Type.LONG);
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =
+            random().nextBoolean()
+                ? Long.MIN_VALUE
+                : (random().nextBoolean() ? Long.MAX_VALUE : 
random().nextLong());
+        sortField.setMissingValue(missingValue);
+      }
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
 
-    Query fallbackQuery = LongPoint.newRangeQuery("field", 1, 42);
-    Query query = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 
42, fallbackQuery);
-    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
-    for (LeafReaderContext context : searcher.getLeafContexts()) {
-      assertEquals(1, weight.count(context));
+      final int numDocs = atLeast(100);
+      for (int i = 0; i < numDocs; ++i) {
+        Document doc = new Document();
+        final int numValues = TestUtil.nextInt(random(), 0, 1);
+        for (int j = 0; j < numValues; ++j) {
+          final long value = TestUtil.nextLong(random(), -100, 10000);
+          doc =
+              random().nextBoolean()
+                  ? createMissingValueDocument()
+                  : createSNDVAndPointDocument("field", value);
+        }
+        writer.addDocument(doc);
+      }
+
+      if (random().nextBoolean()) {
+        writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
+      }
+
+      final IndexReader reader = writer.getReader();
+      final IndexSearcher searcher = newSearcher(reader);
+      writer.close();
+
+      for (int i = 0; i < 100; ++i) {
+        final long min =
+            random().nextBoolean() ? Long.MIN_VALUE : 
TestUtil.nextLong(random(), -100, 10000);
+        final long max =
+            random().nextBoolean() ? Long.MAX_VALUE : 
TestUtil.nextLong(random(), -100, 10000);
+        final Query q1 = LongPoint.newRangeQuery("field", min, max);
+
+        final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max);
+        final Query q2 =
+            new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, 
fallbackQuery);
+        final Weight weight1 = q1.createWeight(searcher, ScoreMode.COMPLETE, 
1.0f);
+        final Weight weight2 = q2.createWeight(searcher, ScoreMode.COMPLETE, 
1.0f);
+        assertSameCount(weight1, weight2, searcher);
+      }
+
+      reader.close();
+      dir.close();
     }
+  }
 
-    writer.close();
-    reader.close();
-    dir.close();
+  private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher 
searcher)
+      throws IOException {
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(weight1.count(context), weight2.count(context));
+    }
   }
 
-  public void testFallbackCount() throws IOException {

Review comment:
       Could you explain why you removed this test? It seemed helpful to me.

##########
File path: lucene/CHANGES.txt
##########
@@ -56,14 +56,17 @@ API Changes
 
 New Features
 ---------------------
-(No changes)
+
+* LUCENE-10385: Implement Weight#count on 
IndexSortSortedNumericDocValuesRangeQuery
+  to speed up computing the number of hits when possible. (Lu Xugang, Cavanna, 
Adrien Grand)

Review comment:
       We could keep Luca's name here: Luca Cavanna :)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] jtibshirani commented on a change in pull request #736: LUCENE-10458: BoundedDocSetIdIterator may supply error count in Weigth#count(LeafReaderContext) when missingValue enables

Reply via email to