gf2121 commented on code in PR #13221: URL: https://github.com/apache/lucene/pull/13221#discussion_r1544575799
########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -193,170 +209,160 @@ public void setHitsThresholdReached() throws IOException { private void updateCompetitiveIterator() throws IOException { if (enableSkipping == false || hitsThresholdReached == false - || (queueFull == false && topValueSet == false)) return; + || (leafTopSet == false && queueFull == false)) return; // if some documents have missing points, check that missing values prohibits optimization - if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) { + boolean dense = pointValues.getDocCount() == maxDoc; + if (dense == false && isMissingValueCompetitive()) { return; // we can't filter out documents, as documents with missing values are competitive } - updateCounter++; - // Start sampling if we get called too much - if (updateCounter > 256 - && (updateCounter & (currentSkipInterval - 1)) != currentSkipInterval - 1) { + if (competitiveIterator instanceof CompetitiveIterator iter) { + encodeBottom(); + // CompetitiveIterator already built, try to reduce clause. + tryReduceDisjunctionClause(iter); return; } - if (reverse == false) { - if (queueFull) { // bottom is available only when queue is full - maxValueAsBytes = maxValueAsBytes == null ? new byte[bytesCount] : maxValueAsBytes; - encodeBottom(); - } - if (topValueSet) { - minValueAsBytes = minValueAsBytes == null ? new byte[bytesCount] : minValueAsBytes; - encodeTop(); - } - } else { - if (queueFull) { // bottom is available only when queue is full - minValueAsBytes = minValueAsBytes == null ? new byte[bytesCount] : minValueAsBytes; - encodeBottom(); - } - if (topValueSet) { - maxValueAsBytes = maxValueAsBytes == null ? new byte[bytesCount] : maxValueAsBytes; - encodeTop(); + + if (thresholdAsLong == -1) { + if (dense == false) { + competitiveIterator = getNumericDocValues(context, field); + leadCost = Math.min(leadCost, competitiveIterator.cost()); } + long threshold = Math.min(leadCost >> 3, maxDoc >> 5); + thresholdAsLong = intersectThresholdValue(threshold); } - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); - PointValues.IntersectVisitor visitor = - new PointValues.IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; - - @Override - public void grow(int count) { - adder = result.grow(count); - } + if ((reverse == false && bottomAsComparableLong() <= thresholdAsLong) + || (reverse && bottomAsComparableLong() >= thresholdAsLong)) { + encodeBottom(); + DisjunctionBuildVisitor visitor = new DisjunctionBuildVisitor(); + competitiveIterator = visitor.generateCompetitiveIterator(); + } + } - @Override - public void visit(int docID) { - if (docID <= maxDocVisited) { - return; // Already visited or skipped - } - adder.add(docID); - } + private void tryReduceDisjunctionClause(CompetitiveIterator iter) { + int originalSize = iter.disis.size(); + Predicate<DisiAndMostCompetitiveValue> isCompetitive = + d -> d.mostCompetitiveValue <= maxValueAsLong && d.mostCompetitiveValue >= minValueAsLong; - @Override - public void visit(int docID, byte[] packedValue) { - if (docID <= maxDocVisited) { - return; // already visited or skipped - } - if (maxValueAsBytes != null) { - int cmp = bytesComparator.compare(packedValue, 0, maxValueAsBytes, 0); - - if (cmp > 0) return; - } - if (minValueAsBytes != null) { - int cmp = bytesComparator.compare(packedValue, 0, minValueAsBytes, 0); + while (iter.disis.isEmpty() == false && isCompetitive.test(iter.disis.getFirst()) == false) { + iter.disis.removeFirst(); + } - if (cmp < 0) return; - } - adder.add(docID); // doc is competitive - } + if (originalSize != iter.disis.size()) { + iter.disjunction.clear(); + iter.disjunction.addAll(iter.disis); + } + } - @Override - public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - if (maxValueAsBytes != null) { - int cmp = bytesComparator.compare(minPackedValue, 0, maxValueAsBytes, 0); - // 1. cmp ==0 and pruning==Pruning.GREATER_THAN_OR_EQUAL_TO : if the sort is - // ascending then maxValueAsBytes is bottom's next less value, so it is competitive - // 2. cmp ==0 and pruning==Pruning.GREATER_THAN: maxValueAsBytes equals to - // bottom, but there are multiple comparators, so it could be competitive - if (cmp > 0) return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - if (minValueAsBytes != null) { - int cmp = bytesComparator.compare(maxPackedValue, 0, minValueAsBytes, 0); - if (cmp < 0) return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - if ((maxValueAsBytes != null - && bytesComparator.compare(maxPackedValue, 0, maxValueAsBytes, 0) > 0) - || (minValueAsBytes != null - && bytesComparator.compare(minPackedValue, 0, minValueAsBytes, 0) < 0)) { - return PointValues.Relation.CELL_CROSSES_QUERY; - } - return PointValues.Relation.CELL_INSIDE_QUERY; - } - }; - final long threshold = iteratorCost >>> 3; - - if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo(visitor, pointTree, threshold)) { - // the new range is not selective enough to be worth materializing, it doesn't reduce number - // of docs at least 8x - updateSkipInterval(false); - if (pointValues.getDocCount() < iteratorCost) { - // Use the set of doc with values to help drive iteration - competitiveIterator = getNumericDocValues(context, field); - iteratorCost = pointValues.getDocCount(); + /** + * Find out the value that threshold docs away from topValue/infinite. + * + * <p>TODO: we are assuming a binary tree + */ + private long intersectThresholdValue(long threshold) throws IOException { Review Comment: Great point! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org