gsmiller commented on code in PR #15803:
URL: https://github.com/apache/lucene/pull/15803#discussion_r2954631774


##########
lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java:
##########
@@ -89,4 +93,82 @@ public static int subIndex(int n, List<LeafReaderContext> 
leaves) {
     }
     return hi;
   }
+
+  /**
+   * Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, 
sorts them, and
+   * partitions across leaves.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, 
List<LeafReaderContext> leaves) {
+    int[] docIds = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docIds[i] = hits[i].doc;
+    }
+    Arrays.sort(docIds);
+    return partitionByLeaf(docIds, leaves);
+  }
+
+  /**
+   * Partitions sorted global doc IDs by leaf.
+   *
+   * @param sortedDocIds global doc IDs, must be sorted ascending
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(int[] sortedDocIds, 
List<LeafReaderContext> leaves) {
+    assert isSorted(sortedDocIds) : "docIds must be sorted";
+    int numLeaves = leaves.size();
+    int[][] result = new int[numLeaves][];
+    if (sortedDocIds.length == 0) {
+      for (int i = 0; i < numLeaves; i++) {
+        result[i] = EMPTY_INT_ARRAY;
+      }
+      return result;
+    }
+    int leafStart = 0;
+    int leafIdx = 0;
+    LeafReaderContext leaf = leaves.get(0);
+    int leafEnd = leaf.docBase + leaf.reader().maxDoc();
+    for (int i = 0; i < sortedDocIds.length; i++) {
+      int docId = sortedDocIds[i];
+      while (docId >= leafEnd) {
+        int count = i - leafStart;
+        if (count == 0) {
+          result[leafIdx] = EMPTY_INT_ARRAY;
+        } else {
+          result[leafIdx] = new int[count];
+          System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
+        }
+        leafStart = i;
+        leafIdx++;
+        leaf = leaves.get(leafIdx);
+        leafEnd = leaf.docBase + leaf.reader().maxDoc();
+      }
+    }
+    // Handle last leaf
+    int count = sortedDocIds.length - leafStart;
+    if (count == 0) {
+      result[leafIdx] = EMPTY_INT_ARRAY;
+    } else {
+      result[leafIdx] = new int[count];
+      System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
+    }
+    // Fill remaining empty leaves
+    for (int i = leafIdx + 1; i < numLeaves; i++) {
+      result[i] = EMPTY_INT_ARRAY;
+    }
+    return result;
+  }
+
+  private static boolean isSorted(int[] a) {
+    for (int i = 1; i < a.length; i++) {
+      if (a[i] < a[i - 1]) {
+        throw new AssertionError("docId " + a[i] + " appears after " + a[i - 
1]);

Review Comment:
   I don't think we should explicitly throw inside this helper method. Let's 
simply return `false` here and let the calling `assert` handle throwing the 
exception?



##########
lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java:
##########
@@ -89,4 +93,82 @@ public static int subIndex(int n, List<LeafReaderContext> 
leaves) {
     }
     return hi;
   }
+
+  /**
+   * Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, 
sorts them, and
+   * partitions across leaves.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, 
List<LeafReaderContext> leaves) {
+    int[] docIds = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docIds[i] = hits[i].doc;
+    }
+    Arrays.sort(docIds);
+    return partitionByLeaf(docIds, leaves);
+  }
+
+  /**
+   * Partitions sorted global doc IDs by leaf.
+   *
+   * @param sortedDocIds global doc IDs, must be sorted ascending
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(int[] sortedDocIds, 
List<LeafReaderContext> leaves) {
+    assert isSorted(sortedDocIds) : "docIds must be sorted";
+    int numLeaves = leaves.size();
+    int[][] result = new int[numLeaves][];
+    if (sortedDocIds.length == 0) {
+      for (int i = 0; i < numLeaves; i++) {
+        result[i] = EMPTY_INT_ARRAY;
+      }
+      return result;
+    }
+    int leafStart = 0;
+    int leafIdx = 0;
+    LeafReaderContext leaf = leaves.get(0);

Review Comment:
   My IDE pointed out we can use `#getFirst` here instead. I like the 
suggestion; feels slightly more idiomatic. I don't care that much though :)



##########
lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java:
##########
@@ -89,4 +93,82 @@ public static int subIndex(int n, List<LeafReaderContext> 
leaves) {
     }
     return hi;
   }
+
+  /**
+   * Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, 
sorts them, and
+   * partitions across leaves.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, 
List<LeafReaderContext> leaves) {
+    int[] docIds = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docIds[i] = hits[i].doc;
+    }
+    Arrays.sort(docIds);
+    return partitionByLeaf(docIds, leaves);
+  }
+
+  /**
+   * Partitions sorted global doc IDs by leaf.
+   *
+   * @param sortedDocIds global doc IDs, must be sorted ascending
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(int[] sortedDocIds, 
List<LeafReaderContext> leaves) {
+    assert isSorted(sortedDocIds) : "docIds must be sorted";
+    int numLeaves = leaves.size();
+    int[][] result = new int[numLeaves][];
+    if (sortedDocIds.length == 0) {
+      for (int i = 0; i < numLeaves; i++) {

Review Comment:
   Let's replace this loop with `Arrays.fill(result, EMPTY_INT_ARRAY);`?



##########
lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java:
##########
@@ -89,4 +93,82 @@ public static int subIndex(int n, List<LeafReaderContext> 
leaves) {
     }
     return hi;
   }
+
+  /**
+   * Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, 
sorts them, and
+   * partitions across leaves.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, 
List<LeafReaderContext> leaves) {
+    int[] docIds = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docIds[i] = hits[i].doc;
+    }
+    Arrays.sort(docIds);
+    return partitionByLeaf(docIds, leaves);
+  }
+
+  /**
+   * Partitions sorted global doc IDs by leaf.
+   *
+   * @param sortedDocIds global doc IDs, must be sorted ascending
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(int[] sortedDocIds, 
List<LeafReaderContext> leaves) {
+    assert isSorted(sortedDocIds) : "docIds must be sorted";
+    int numLeaves = leaves.size();
+    int[][] result = new int[numLeaves][];
+    if (sortedDocIds.length == 0) {
+      for (int i = 0; i < numLeaves; i++) {
+        result[i] = EMPTY_INT_ARRAY;
+      }
+      return result;
+    }
+    int leafStart = 0;
+    int leafIdx = 0;
+    LeafReaderContext leaf = leaves.get(0);
+    int leafEnd = leaf.docBase + leaf.reader().maxDoc();
+    for (int i = 0; i < sortedDocIds.length; i++) {
+      int docId = sortedDocIds[i];
+      while (docId >= leafEnd) {
+        int count = i - leafStart;
+        if (count == 0) {
+          result[leafIdx] = EMPTY_INT_ARRAY;
+        } else {
+          result[leafIdx] = new int[count];
+          System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
+        }
+        leafStart = i;
+        leafIdx++;
+        leaf = leaves.get(leafIdx);
+        leafEnd = leaf.docBase + leaf.reader().maxDoc();
+      }
+    }
+    // Handle last leaf
+    int count = sortedDocIds.length - leafStart;
+    if (count == 0) {
+      result[leafIdx] = EMPTY_INT_ARRAY;
+    } else {
+      result[leafIdx] = new int[count];
+      System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
+    }
+    // Fill remaining empty leaves
+    for (int i = leafIdx + 1; i < numLeaves; i++) {

Review Comment:
   I think we can leverage `Arrays#fill` here too?



##########
lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java:
##########
@@ -89,4 +93,82 @@ public static int subIndex(int n, List<LeafReaderContext> 
leaves) {
     }
     return hi;
   }
+
+  /**
+   * Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, 
sorts them, and
+   * partitions across leaves.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, 
List<LeafReaderContext> leaves) {
+    int[] docIds = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docIds[i] = hits[i].doc;
+    }
+    Arrays.sort(docIds);
+    return partitionByLeaf(docIds, leaves);
+  }
+
+  /**
+   * Partitions sorted global doc IDs by leaf.
+   *
+   * @param sortedDocIds global doc IDs, must be sorted ascending
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(int[] sortedDocIds, 
List<LeafReaderContext> leaves) {
+    assert isSorted(sortedDocIds) : "docIds must be sorted";
+    int numLeaves = leaves.size();
+    int[][] result = new int[numLeaves][];
+    if (sortedDocIds.length == 0) {
+      for (int i = 0; i < numLeaves; i++) {
+        result[i] = EMPTY_INT_ARRAY;
+      }
+      return result;
+    }
+    int leafStart = 0;
+    int leafIdx = 0;
+    LeafReaderContext leaf = leaves.get(0);
+    int leafEnd = leaf.docBase + leaf.reader().maxDoc();
+    for (int i = 0; i < sortedDocIds.length; i++) {
+      int docId = sortedDocIds[i];
+      while (docId >= leafEnd) {
+        int count = i - leafStart;
+        if (count == 0) {
+          result[leafIdx] = EMPTY_INT_ARRAY;
+        } else {
+          result[leafIdx] = new int[count];
+          System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
+        }
+        leafStart = i;
+        leafIdx++;
+        leaf = leaves.get(leafIdx);
+        leafEnd = leaf.docBase + leaf.reader().maxDoc();
+      }
+    }
+    // Handle last leaf
+    int count = sortedDocIds.length - leafStart;
+    if (count == 0) {

Review Comment:
   It's not possible for `count` to be zero at this point. Let's remove the 
empty condition branch here and put an `assert count > 0` in?



##########
lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java:
##########
@@ -89,4 +93,82 @@ public static int subIndex(int n, List<LeafReaderContext> 
leaves) {
     }
     return hi;
   }
+
+  /**
+   * Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, 
sorts them, and
+   * partitions across leaves.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, 
List<LeafReaderContext> leaves) {
+    int[] docIds = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docIds[i] = hits[i].doc;
+    }
+    Arrays.sort(docIds);
+    return partitionByLeaf(docIds, leaves);
+  }
+
+  /**
+   * Partitions sorted global doc IDs by leaf.
+   *
+   * @param sortedDocIds global doc IDs, must be sorted ascending
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, containing global doc IDs for that 
leaf (empty if no hits)
+   */
+  public static int[][] partitionByLeaf(int[] sortedDocIds, 
List<LeafReaderContext> leaves) {
+    assert isSorted(sortedDocIds) : "docIds must be sorted";
+    int numLeaves = leaves.size();
+    int[][] result = new int[numLeaves][];
+    if (sortedDocIds.length == 0) {
+      for (int i = 0; i < numLeaves; i++) {
+        result[i] = EMPTY_INT_ARRAY;
+      }
+      return result;
+    }
+    int leafStart = 0;
+    int leafIdx = 0;
+    LeafReaderContext leaf = leaves.get(0);
+    int leafEnd = leaf.docBase + leaf.reader().maxDoc();
+    for (int i = 0; i < sortedDocIds.length; i++) {
+      int docId = sortedDocIds[i];
+      while (docId >= leafEnd) {
+        int count = i - leafStart;
+        if (count == 0) {
+          result[leafIdx] = EMPTY_INT_ARRAY;
+        } else {
+          result[leafIdx] = new int[count];
+          System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
+        }
+        leafStart = i;
+        leafIdx++;
+        leaf = leaves.get(leafIdx);
+        leafEnd = leaf.docBase + leaf.reader().maxDoc();
+      }
+    }
+    // Handle last leaf

Review Comment:
   We're not exactly handling the last leaf here right? This could be a 
confusing comment. I think what we're really doing here is handling the 
remaining docIDs that are sort of "in-flight" right? It's possible there are 
still trailing empty leaves (which you handle below). I'd tweak this comment to 
something like "Handle remaining docIDs"



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to