jpountz commented on code in PR #13221: URL: https://github.com/apache/lucene/pull/13221#discussion_r1600034795
########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -207,102 +208,91 @@ private void updateCompetitiveIterator() throws IOException { || hitsThresholdReached == false || (leafTopSet == false && queueFull == false)) return; // if some documents have missing points, check that missing values prohibits optimization - if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) { + boolean dense = pointValues.getDocCount() == maxDoc; + if (dense == false && isMissingValueCompetitive()) { return; // we can't filter out documents, as documents with missing values are competitive } - updateCounter++; - // Start sampling if we get called too much - if (updateCounter > 256 - && (updateCounter & (currentSkipInterval - 1)) != currentSkipInterval - 1) { + if (competitiveIterator instanceof CompetitiveIterator iter) { + if (queueFull) { + encodeBottom(); + } + // CompetitiveIterator already built, try to reduce clause. + tryReduceDisjunctionClause(iter); return; } - if (queueFull) { - encodeBottom(); + if (thresholdAsLong == -1) { + if (dense == false) { + competitiveIterator = getNumericDocValues(context, field); + leadCost = Math.min(leadCost, competitiveIterator.cost()); + } + long threshold = Math.min(leadCost >> 3, maxDoc >> 5); + thresholdAsLong = intersectThresholdValue(threshold); Review Comment: What if `thresholdAsLong` was -1 here, would we then run the computation again later? Do we need to make `thresholdAsLong` an Optional, or at least a nullable `Long`? ########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -405,5 +395,278 @@ public int advance(int target) throws IOException { protected abstract long bottomAsComparableLong(); protected abstract long topAsComparableLong(); + + class DisjunctionBuildVisitor extends RangeVisitor { + + final Deque<DisiAndMostCompetitiveValue> disis = new ArrayDeque<>(); + // most competitive entry stored last. + final Consumer<DisiAndMostCompetitiveValue> adder = + reverse == false ? disis::addFirst : disis::addLast; + + final int minBlockLength = minBlockLength(); + + final LSBRadixSorter sorter = new LSBRadixSorter(); + int[] docs = IntsRef.EMPTY_INTS; + int index = 0; + int blockMaxDoc = -1; + boolean docsInOrder = true; + long blockMinValue = Long.MAX_VALUE; + long blockMaxValue = Long.MIN_VALUE; + + private DisjunctionBuildVisitor() { + super(minValueAsLong, maxValueAsLong, maxDocVisited); + } + + @Override + public void grow(int count) { + docs = ArrayUtil.grow(docs, index + count + 1); + } + + @Override + protected void consumeDoc(int doc) { + docs[index++] = doc; + if (doc >= blockMaxDoc) { + blockMaxDoc = doc; + } else { + docsInOrder = false; + } + } + + void intersectLeaves(PointValues.PointTree pointTree) throws IOException { + PointValues.Relation r = + compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue()); + switch (r) { + case CELL_INSIDE_QUERY, CELL_CROSSES_QUERY -> { + if (pointTree.moveToChild()) { + do { + intersectLeaves(pointTree); + } while (pointTree.moveToSibling()); + pointTree.moveToParent(); + } else { + if (r == PointValues.Relation.CELL_CROSSES_QUERY) { + pointTree.visitDocValues(this); + } else { + pointTree.visitDocIDs(this); + } + updateMinMax( + sortableBytesToLong(pointTree.getMinPackedValue()), + sortableBytesToLong(pointTree.getMaxPackedValue())); + } + } + case CELL_OUTSIDE_QUERY -> {} + default -> throw new IllegalStateException("unreachable code"); + } + } + + void updateMinMax(long leafMinValue, long leafMaxValue) throws IOException { + this.blockMinValue = Math.min(blockMinValue, leafMinValue); + this.blockMaxValue = Math.max(blockMaxValue, leafMaxValue); + if (index >= minBlockLength) { + update(); + this.blockMinValue = Long.MAX_VALUE; + this.blockMaxValue = Long.MIN_VALUE; + } + } + + void update() throws IOException { + if (blockMinValue > blockMaxValue) { + return; + } + long mostCompetitiveValue = + reverse == false + ? Math.max(blockMinValue, minValueAsLong) + : Math.min(blockMaxValue, maxValueAsLong); + + if (docsInOrder == false) { + sorter.sort(PackedInts.bitsRequired(blockMaxDoc), docs, index); + } + docs[index] = DocIdSetIterator.NO_MORE_DOCS; + DocIdSetIterator iter = new IntArrayDocIdSet(docs, index).iterator(); + adder.accept(new DisiAndMostCompetitiveValue(iter, mostCompetitiveValue)); + docs = IntsRef.EMPTY_INTS; + index = 0; + blockMaxDoc = -1; + docsInOrder = true; + } + + DocIdSetIterator generateCompetitiveIterator() throws IOException { + intersectLeaves(pointValues.getPointTree()); + update(); + + if (disis.isEmpty()) { + return DocIdSetIterator.empty(); + } + assert assertMostCompetitiveValuesSorted(disis); + + PriorityQueue<DisiAndMostCompetitiveValue> disjunction = + new PriorityQueue<>(disis.size()) { + @Override + protected boolean lessThan( + DisiAndMostCompetitiveValue a, DisiAndMostCompetitiveValue b) { + return a.disi.docID() < b.disi.docID(); + } + }; + disjunction.addAll(disis); + + return new CompetitiveIterator(maxDoc, disis, disjunction); + } + + /** + * Used for assert. When reverse is false, smaller values are more competitive, so + * mostCompetitiveValues should be in desc order. + */ + private boolean assertMostCompetitiveValuesSorted(Deque<DisiAndMostCompetitiveValue> deque) { + long lastValue = reverse == false ? Long.MAX_VALUE : Long.MIN_VALUE; + for (DisiAndMostCompetitiveValue value : deque) { + if (reverse == false) { + assert value.mostCompetitiveValue <= lastValue + : deque.stream().map(d -> d.mostCompetitiveValue).toList().toString(); + } else { + assert value.mostCompetitiveValue >= lastValue + : deque.stream().map(d -> d.mostCompetitiveValue).toList().toString(); + } + lastValue = value.mostCompetitiveValue; + } + return true; + } + + private int minBlockLength() { + // bottom value can be much more competitive than thresholdAsLong, recompute the cost. + int cost = + Math.toIntExact( Review Comment: can we actually safely cast to an int? the field may be multi-valued and have more than Integer.MAX_VALUE values? FWIW I'd be ok with throwing an exception if there are more than MAX_DISJUNCTION_CLAUSE*Integer.MAX_VALUE values or something like that, so that `minBlockLength` can still be stored as an integer. ########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -405,5 +395,278 @@ public int advance(int target) throws IOException { protected abstract long bottomAsComparableLong(); protected abstract long topAsComparableLong(); + + class DisjunctionBuildVisitor extends RangeVisitor { + + final Deque<DisiAndMostCompetitiveValue> disis = new ArrayDeque<>(); + // most competitive entry stored last. + final Consumer<DisiAndMostCompetitiveValue> adder = + reverse == false ? disis::addFirst : disis::addLast; + + final int minBlockLength = minBlockLength(); + + final LSBRadixSorter sorter = new LSBRadixSorter(); + int[] docs = IntsRef.EMPTY_INTS; + int index = 0; + int blockMaxDoc = -1; + boolean docsInOrder = true; + long blockMinValue = Long.MAX_VALUE; + long blockMaxValue = Long.MIN_VALUE; + + private DisjunctionBuildVisitor() { + super(minValueAsLong, maxValueAsLong, maxDocVisited); + } + + @Override + public void grow(int count) { + docs = ArrayUtil.grow(docs, index + count + 1); + } + + @Override + protected void consumeDoc(int doc) { + docs[index++] = doc; + if (doc >= blockMaxDoc) { + blockMaxDoc = doc; + } else { + docsInOrder = false; + } + } + + void intersectLeaves(PointValues.PointTree pointTree) throws IOException { + PointValues.Relation r = + compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue()); + switch (r) { + case CELL_INSIDE_QUERY, CELL_CROSSES_QUERY -> { + if (pointTree.moveToChild()) { + do { + intersectLeaves(pointTree); + } while (pointTree.moveToSibling()); + pointTree.moveToParent(); + } else { + if (r == PointValues.Relation.CELL_CROSSES_QUERY) { + pointTree.visitDocValues(this); + } else { + pointTree.visitDocIDs(this); + } + updateMinMax( + sortableBytesToLong(pointTree.getMinPackedValue()), + sortableBytesToLong(pointTree.getMaxPackedValue())); + } + } + case CELL_OUTSIDE_QUERY -> {} + default -> throw new IllegalStateException("unreachable code"); + } + } + + void updateMinMax(long leafMinValue, long leafMaxValue) throws IOException { + this.blockMinValue = Math.min(blockMinValue, leafMinValue); + this.blockMaxValue = Math.max(blockMaxValue, leafMaxValue); + if (index >= minBlockLength) { + update(); + this.blockMinValue = Long.MAX_VALUE; + this.blockMaxValue = Long.MIN_VALUE; + } + } + + void update() throws IOException { + if (blockMinValue > blockMaxValue) { + return; + } + long mostCompetitiveValue = + reverse == false + ? Math.max(blockMinValue, minValueAsLong) Review Comment: I'm not sure why we need to take the max with `minValueAsLong`? ########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -207,102 +208,91 @@ private void updateCompetitiveIterator() throws IOException { || hitsThresholdReached == false || (leafTopSet == false && queueFull == false)) return; // if some documents have missing points, check that missing values prohibits optimization - if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) { + boolean dense = pointValues.getDocCount() == maxDoc; + if (dense == false && isMissingValueCompetitive()) { return; // we can't filter out documents, as documents with missing values are competitive } - updateCounter++; - // Start sampling if we get called too much - if (updateCounter > 256 - && (updateCounter & (currentSkipInterval - 1)) != currentSkipInterval - 1) { + if (competitiveIterator instanceof CompetitiveIterator iter) { + if (queueFull) { + encodeBottom(); + } + // CompetitiveIterator already built, try to reduce clause. + tryReduceDisjunctionClause(iter); return; } - if (queueFull) { - encodeBottom(); + if (thresholdAsLong == -1) { + if (dense == false) { + competitiveIterator = getNumericDocValues(context, field); + leadCost = Math.min(leadCost, competitiveIterator.cost()); + } + long threshold = Math.min(leadCost >> 3, maxDoc >> 5); + thresholdAsLong = intersectThresholdValue(threshold); } - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); - PointValues.IntersectVisitor visitor = - new PointValues.IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; - - @Override - public void grow(int count) { - adder = result.grow(count); - } - - @Override - public void visit(int docID) { - if (docID <= maxDocVisited) { - return; // Already visited or skipped - } - adder.add(docID); - } - - @Override - public void visit(int docID, byte[] packedValue) { - if (docID <= maxDocVisited) { - return; // already visited or skipped - } - long l = sortableBytesToLong(packedValue); - if (l >= minValueAsLong && l <= maxValueAsLong) { - adder.add(docID); // doc is competitive - } - } + if ((reverse == false && bottomAsComparableLong() <= thresholdAsLong) + || (reverse && bottomAsComparableLong() >= thresholdAsLong)) { + if (queueFull) { + encodeBottom(); + } + DisjunctionBuildVisitor visitor = new DisjunctionBuildVisitor(); + competitiveIterator = visitor.generateCompetitiveIterator(); + } + } - @Override - public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - long min = sortableBytesToLong(minPackedValue); - long max = sortableBytesToLong(maxPackedValue); - - if (min > maxValueAsLong || max < minValueAsLong) { - // 1. cmp ==0 and pruning==Pruning.GREATER_THAN_OR_EQUAL_TO : if the sort is - // ascending then maxValueAsLong is bottom's next less value, so it is competitive - // 2. cmp ==0 and pruning==Pruning.GREATER_THAN: maxValueAsLong equals to - // bottom, but there are multiple comparators, so it could be competitive - return PointValues.Relation.CELL_OUTSIDE_QUERY; - } + private void tryReduceDisjunctionClause(CompetitiveIterator iter) { + int originalSize = iter.disis.size(); + Predicate<DisiAndMostCompetitiveValue> isCompetitive = + d -> d.mostCompetitiveValue <= maxValueAsLong && d.mostCompetitiveValue >= minValueAsLong; Review Comment: it's a bit awkward to define this predicate which is then only used once, could you inline it in the below loop? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org