original-brownbear commented on code in PR #13860:
URL: https://github.com/apache/lucene/pull/13860#discussion_r1986157505
##########
lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java:
##########
@@ -360,102 +350,95 @@ public static LeafSlice[] slices(
boolean allowSegmentPartitions) {
// Make a copy so we can sort:
- List<LeafReaderContext> sortedLeaves = new ArrayList<>(leaves);
+ LeafReaderContext[] sortedLeaves = leaves.toArray(new
LeafReaderContext[0]);
// Sort by maxDoc, descending:
- sortedLeaves.sort(Collections.reverseOrder(Comparator.comparingInt(l ->
l.reader().maxDoc())));
+ Arrays.sort(
+ sortedLeaves, (l1, l2) -> Integer.compare(l2.reader().maxDoc(),
l1.reader().maxDoc()));
if (allowSegmentPartitions) {
- final List<List<LeafReaderContextPartition>> groupedLeafPartitions = new
ArrayList<>();
- int currentSliceNumDocs = 0;
- List<LeafReaderContextPartition> group = null;
- for (LeafReaderContext ctx : sortedLeaves) {
- if (ctx.reader().maxDoc() > maxDocsPerSlice) {
- assert group == null;
- // if the segment does not fit in a single slice, we split it into
maximum 5 partitions of
- // equal size
- int numSlices = Math.min(5, Math.ceilDiv(ctx.reader().maxDoc(),
maxDocsPerSlice));
- int numDocs = ctx.reader().maxDoc() / numSlices;
- int maxDocId = numDocs;
- int minDocId = 0;
- for (int i = 0; i < numSlices - 1; i++) {
- groupedLeafPartitions.add(
- Collections.singletonList(
- LeafReaderContextPartition.createFromAndTo(ctx, minDocId,
maxDocId)));
- minDocId = maxDocId;
- maxDocId += numDocs;
- }
- // the last slice gets all the remaining docs
- groupedLeafPartitions.add(
- Collections.singletonList(
- LeafReaderContextPartition.createFromAndTo(
- ctx, minDocId, ctx.reader().maxDoc())));
- } else {
- if (group == null) {
- group = new ArrayList<>();
- groupedLeafPartitions.add(group);
- }
- group.add(LeafReaderContextPartition.createForEntireSegment(ctx));
-
- currentSliceNumDocs += ctx.reader().maxDoc();
- // We only split a segment when it does not fit entirely in a slice.
We don't partition
- // the
- // segment that makes the current slice (which holds multiple
segments) go over
- // maxDocsPerSlice. This means that a slice either contains multiple
entire segments, or a
- // single partition of a segment.
- if (group.size() >= maxSegmentsPerSlice || currentSliceNumDocs >
maxDocsPerSlice) {
- group = null;
- currentSliceNumDocs = 0;
- }
- }
- }
+ return slicesWithPartitionedSegments(maxDocsPerSlice,
maxSegmentsPerSlice, sortedLeaves);
+ }
- LeafSlice[] slices = new LeafSlice[groupedLeafPartitions.size()];
- int upto = 0;
- for (List<LeafReaderContextPartition> currentGroup :
groupedLeafPartitions) {
- slices[upto] = new LeafSlice(currentGroup);
- ++upto;
+ final List<LeafSlice> groupedLeaves = new ArrayList<>();
+ long docSum = 0;
+ List<LeafReaderContextPartition> group = null;
+ for (LeafReaderContext ctx : sortedLeaves) {
+ final int maxDoc = ctx.reader().maxDoc();
+ var partition = LeafReaderContextPartition.createForEntireSegment(ctx);
+ if (maxDoc > maxDocsPerSlice) {
+ assert group == null;
+ groupedLeaves.add(new LeafSlice(Collections.singletonList(partition)));
+ } else {
+ if (group == null) {
+ group = new ArrayList<>();
+ }
+ group.add(partition);
+ docSum += maxDoc;
+ if (docSum > maxDocsPerSlice || group.size() >= maxSegmentsPerSlice) {
+ groupedLeaves.add(new LeafSlice(group));
+ group = null;
+ docSum = 0;
+ }
}
- return slices;
}
+ if (group != null) {
+ groupedLeaves.add(new LeafSlice(group));
+ }
+ return groupedLeaves.toArray(LeafSlice.EMPTY_ARRAY);
+ }
- final List<List<LeafReaderContext>> groupedLeaves = new ArrayList<>();
- long docSum = 0;
- List<LeafReaderContext> group = null;
+ private static LeafSlice[] slicesWithPartitionedSegments(
+ int maxDocsPerSlice, int maxSegmentsPerSlice, LeafReaderContext[]
sortedLeaves) {
+ final List<List<LeafReaderContextPartition>> groupedLeafPartitions = new
ArrayList<>();
+ int currentSliceNumDocs = 0;
+ List<LeafReaderContextPartition> group = null;
for (LeafReaderContext ctx : sortedLeaves) {
if (ctx.reader().maxDoc() > maxDocsPerSlice) {
assert group == null;
- groupedLeaves.add(Collections.singletonList(ctx));
+ // if the segment does not fit in a single slice, we split it into
maximum 5 partitions of
+ // equal size
+ int numSlices = Math.min(5, Math.ceilDiv(ctx.reader().maxDoc(),
maxDocsPerSlice));
+ int numDocs = ctx.reader().maxDoc() / numSlices;
+ int maxDocId = numDocs;
+ int minDocId = 0;
+ for (int i = 0; i < numSlices - 1; i++) {
+ groupedLeafPartitions.add(
+ Collections.singletonList(
+ LeafReaderContextPartition.createFromAndTo(ctx, minDocId,
maxDocId)));
+ minDocId = maxDocId;
+ maxDocId += numDocs;
+ }
+ // the last slice gets all the remaining docs
+ groupedLeafPartitions.add(
+ Collections.singletonList(
+ LeafReaderContextPartition.createFromAndTo(ctx, minDocId,
ctx.reader().maxDoc())));
} else {
if (group == null) {
group = new ArrayList<>();
- group.add(ctx);
-
- groupedLeaves.add(group);
- } else {
- group.add(ctx);
+ groupedLeafPartitions.add(group);
}
-
- docSum += ctx.reader().maxDoc();
- if (group.size() >= maxSegmentsPerSlice || docSum > maxDocsPerSlice) {
+ group.add(LeafReaderContextPartition.createForEntireSegment(ctx));
+
+ currentSliceNumDocs += ctx.reader().maxDoc();
+ // We only split a segment when it does not fit entirely in a slice.
We don't partition
+ // the
+ // segment that makes the current slice (which holds multiple
segments) go over
+ // maxDocsPerSlice. This means that a slice either contains multiple
entire segments, or a
+ // single partition of a segment.
+ if (group.size() >= maxSegmentsPerSlice || currentSliceNumDocs >
maxDocsPerSlice) {
group = null;
- docSum = 0;
+ currentSliceNumDocs = 0;
}
}
}
- LeafSlice[] slices = new LeafSlice[groupedLeaves.size()];
+ LeafSlice[] slices = new LeafSlice[groupedLeafPartitions.size()];
int upto = 0;
- for (List<LeafReaderContext> currentLeaf : groupedLeaves) {
- slices[upto] =
- new LeafSlice(
- new ArrayList<>(
- currentLeaf.stream()
- .map(LeafReaderContextPartition::createForEntireSegment)
- .toList()));
+ for (List<LeafReaderContextPartition> currentGroup :
groupedLeafPartitions) {
+ slices[upto] = new LeafSlice(currentGroup);
Review Comment:
Done in https://github.com/apache/lucene/pull/14336 — sorry for the delay on this
one :) But after benchmarking a little more as of late, this change is more valuable
than ever for us. Especially for large segment-count use cases, this logic tends
to make up a non-trivial part of the total execution time of the query phase
(on the order of 10%) without this fix. With all the fixes in here it is close to
an order of magnitude cheaper in ES use cases — seemingly due to
instruction-cache/code-size issues, as it turned out :)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]