jpountz commented on code in PR #12712:
URL: https://github.com/apache/lucene/pull/12712#discussion_r1370176150


##########
lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java:
##########
@@ -991,4 +939,233 @@ static int readMonotonicInts(DataInput in, int[] ints) throws IOException {
     }
     return len;
   }
+
+  static class ForwardIndexSorter {
+
+    private static final int HISTOGRAM_SIZE = 256;
+    private static final int BUFFER_SIZE = 8192;
+    private static final int BUFFER_BYTES = BUFFER_SIZE * Long.BYTES;
+    private final Directory directory;
+    private final Bucket[] buckets = new Bucket[HISTOGRAM_SIZE];
+    private final double ramBudgetMB;
+
+    /** Fork of {@link org.apache.lucene.util.LSBRadixSorter} to sort longs. */
+    private static class MemorySorter {
+      private final int[] histogram = new int[HISTOGRAM_SIZE];
+
+      private static void buildHistogram(long[] array, int len, int[] histogram, int shift) {
+        for (int i = 0; i < len; ++i) {
+          final int b = (int) ((array[i] >>> shift) & 0xFFL);
+          histogram[b] += 1;
+        }
+      }
+
+      private static void sumHistogram(int[] histogram) {
+        int accum = 0;
+        for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
+          final int count = histogram[i];
+          histogram[i] = accum;
+          accum += count;
+        }
+      }
+
+      private static void reorder(long[] array, int len, int[] histogram, int shift, long[] dest) {
+        for (int i = 0; i < len; ++i) {
+          final long v = array[i];
+          final int b = (int) ((v >>> shift) & 0xFF);
+          dest[histogram[b]++] = v;
+        }
+      }
+
+      private static boolean sort(long[] array, int len, int[] histogram, int shift, long[] dest) {
+        Arrays.fill(histogram, 0);
+        buildHistogram(array, len, histogram, shift);
+        if (histogram[0] == len) {
+          return false;
+        }
+        sumHistogram(histogram);
+        reorder(array, len, histogram, shift, dest);
+        return true;
+      }
+
+      public void sortAndConsume(int numBits, long[] array, int len, LongConsumer consumer)
+          throws IOException {
+        long[] buffer = new long[len];
+        for (int shift = 0; shift < numBits; shift += 8) {
+          if (sort(array, len, histogram, shift, buffer)) {
+            // swap arrays
+            long[] tmp = array;
+            array = buffer;
+            buffer = tmp;
+          }
+        }
+        for (int i = 0; i < len; i++) {
+          consumer.accept(array[i]);
+        }
+        consumer.onFinish();
+      }
+    }
+
+    private static class Bucket {
+      int bufferUsed;
+      int blockNum;
+      long lastFp;
+      final ByteBuffersDataOutput fps = new ByteBuffersDataOutput();
+      final long[] buffer = new long[BUFFER_SIZE];
+      int finalBlockSize;
+
+      void addEntry(long l, IndexOutput output) throws IOException {

Review Comment:
   Nit: It would read better to me if the `IndexOutput` were an argument of `reset` rather than of `addEntry`.
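
A rough sketch of what that might look like (illustrative only, not code from the PR): `reset` would take the `IndexOutput` once per pass, and `addEntry`/`flush` would use the stored reference.

```java
// Hypothetical rework of Bucket along the lines of the suggestion above:
// the IndexOutput is handed to reset(...) once per pass instead of being
// threaded through every addEntry(...) call.
private static class Bucket {
  final ByteBuffersDataOutput fps = new ByteBuffersDataOutput();
  final long[] buffer = new long[BUFFER_SIZE];
  IndexOutput output; // set by reset(...), used by addEntry/flush
  int bufferUsed;
  int blockNum;
  long lastFp;
  int finalBlockSize;

  void reset(IndexOutput output) {
    this.output = output;
    finalBlockSize = 0;
    bufferUsed = 0;
    blockNum = 0;
    lastFp = 0;
    fps.reset();
  }

  void addEntry(long l) throws IOException {
    buffer[bufferUsed++] = l;
    if (bufferUsed == BUFFER_SIZE) {
      flush(false);
    }
  }

  void flush(boolean isFinal) throws IOException {
    if (isFinal) {
      finalBlockSize = bufferUsed;
    }
    long fp = output.getFilePointer();
    fps.writeVLong(encode(fp - lastFp));
    for (int i = 0; i < bufferUsed; i++) {
      output.writeLong(buffer[i]);
    }
    lastFp = fp;
    blockNum++;
    bufferUsed = 0;
  }
}
```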



##########
lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java:
##########
@@ -991,4 +939,233 @@ static int readMonotonicInts(DataInput in, int[] ints) throws IOException {
     }
     return len;
   }
+
+  static class ForwardIndexSorter {
+
+    private static final int HISTOGRAM_SIZE = 256;
+    private static final int BUFFER_SIZE = 8192;
+    private static final int BUFFER_BYTES = BUFFER_SIZE * Long.BYTES;
+    private final Directory directory;
+    private final Bucket[] buckets = new Bucket[HISTOGRAM_SIZE];
+    private final double ramBudgetMB;
+
+    /** Fork of {@link org.apache.lucene.util.LSBRadixSorter} to sort longs. */
+    private static class MemorySorter {
+      private final int[] histogram = new int[HISTOGRAM_SIZE];
+
+      private static void buildHistogram(long[] array, int len, int[] histogram, int shift) {
+        for (int i = 0; i < len; ++i) {
+          final int b = (int) ((array[i] >>> shift) & 0xFFL);
+          histogram[b] += 1;
+        }
+      }
+
+      private static void sumHistogram(int[] histogram) {
+        int accum = 0;
+        for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
+          final int count = histogram[i];
+          histogram[i] = accum;
+          accum += count;
+        }
+      }
+
+      private static void reorder(long[] array, int len, int[] histogram, int shift, long[] dest) {
+        for (int i = 0; i < len; ++i) {
+          final long v = array[i];
+          final int b = (int) ((v >>> shift) & 0xFF);
+          dest[histogram[b]++] = v;
+        }
+      }
+
+      private static boolean sort(long[] array, int len, int[] histogram, int shift, long[] dest) {
+        Arrays.fill(histogram, 0);
+        buildHistogram(array, len, histogram, shift);
+        if (histogram[0] == len) {
+          return false;
+        }
+        sumHistogram(histogram);
+        reorder(array, len, histogram, shift, dest);
+        return true;
+      }
+
+      public void sortAndConsume(int numBits, long[] array, int len, LongConsumer consumer)
+          throws IOException {
+        long[] buffer = new long[len];
+        for (int shift = 0; shift < numBits; shift += 8) {
+          if (sort(array, len, histogram, shift, buffer)) {
+            // swap arrays
+            long[] tmp = array;
+            array = buffer;
+            buffer = tmp;
+          }
+        }
+        for (int i = 0; i < len; i++) {
+          consumer.accept(array[i]);
+        }
+        consumer.onFinish();
+      }
+    }
+
+    private static class Bucket {
+      int bufferUsed;
+      int blockNum;
+      long lastFp;
+      final ByteBuffersDataOutput fps = new ByteBuffersDataOutput();
+      final long[] buffer = new long[BUFFER_SIZE];
+      int finalBlockSize;
+
+      void addEntry(long l, IndexOutput output) throws IOException {
+        buffer[bufferUsed++] = l;
+        if (bufferUsed == BUFFER_SIZE) {
+          flush(output, false);
+        }
+      }
+
+      void flush(IndexOutput output, boolean isFinal) throws IOException {
+        if (isFinal) {
+          finalBlockSize = bufferUsed;
+        }
+        long fp = output.getFilePointer();
+        fps.writeVLong(encode(fp - lastFp));
+        lastFp = fp;
+        for (int i = 0; i < bufferUsed; i++) {
+          output.writeLong(buffer[i]);
+        }
+        blockNum++;
+        bufferUsed = 0;
+      }
+
+      void reset() {
+        finalBlockSize = 0;
+        bufferUsed = 0;
+        blockNum = 0;
+        lastFp = 0;
+        fps.reset();
+      }
+    }
+
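+    /**
+     * File-pointer deltas are always multiples of 8 (only longs are written), so the low bit is
+     * free to use as a flag: when the delta is also a whole number of blocks, it is stored as
+     * delta / BUFFER_BYTES with the low bit set, which needs fewer vlong bytes.
+     */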
+    private static long encode(long fpDelta) {
+      assert (fpDelta & 0x07) == 0 : "fpDelta should be multiple of 8";
+      if (fpDelta % BUFFER_BYTES == 0) {
+        return ((fpDelta / BUFFER_BYTES) << 1) | 1;
+      } else {
+        return fpDelta;
+      }
+    }
+
+    private static long decode(long fpDelta) {
+      if ((fpDelta & 1) == 1) {
+        return (fpDelta >>> 1) * BUFFER_BYTES;
+      } else {
+        return fpDelta;
+      }
+    }
+
+    ForwardIndexSorter(Directory directory, double ramBudgetMB) {
+      this.directory = directory;
+      this.ramBudgetMB = ramBudgetMB;
+    }
+
+    void consume(String fileName, LongConsumer consumer) throws IOException {
+      try (IndexInput in = directory.openInput(fileName, IOContext.READONCE)) {
+        final long end = in.length() - CodecUtil.footerLength();
+        while (in.getFilePointer() < end) {
+          consumer.accept(in.readLong());
+        }
+      }
+      consumer.onFinish();
+    }
+
+    void consume(String fileName, long indexFP, LongConsumer consumer) throws IOException {
+      try (IndexInput index = directory.openInput(fileName, IOContext.READONCE);
+          IndexInput value = directory.openInput(fileName, IOContext.READONCE)) {
+        index.seek(indexFP);
+        for (int i = 0; i < buckets.length; i++) {
+          int blockNum = index.readVInt();
+          int finalBlockSize = index.readVInt();
+          long fp = decode(index.readVLong());
+          for (int block = 0; block < blockNum - 1; block++) {
+            value.seek(fp);
+            for (int j = 0; j < BUFFER_SIZE; j++) {
+              consumer.accept(value.readLong());
+            }
+            fp += decode(index.readVLong());
+          }
+          value.seek(fp);
+          for (int j = 0; j < finalBlockSize; j++) {
+            consumer.accept(value.readLong());
+          }
+        }
+        consumer.onFinish();
+      }
+    }
+
+    LongConsumer consumer(int shift, IndexOutput output) {
+      return new LongConsumer() {
+        @Override
+        public void accept(long value) throws IOException {
+          int b = (int) ((value >>> shift) & 0xFF);
+          Bucket bucket = buckets[b];
+          bucket.addEntry(value, output);
+        }
+
+        @Override
+        public void onFinish() throws IOException {
+          for (Bucket bucket : buckets) {
+            bucket.flush(output, true);
+          }
+        }
+      };
+    }
+
+    void sortAndConsume(String fileName, int maxDoc, LongConsumer consumer) throws IOException {
+      int bitsRequired = PackedInts.bitsRequired(maxDoc);
+
+      long total = (directory.fileLength(fileName) - CodecUtil.footerLength());
+      // Use a memory sorter if ram budget is enough
+      if (total * 2 < ramBudgetMB * 1024 * 1024) {
+        assert total % Long.BYTES == 0;
+        long[] entries = new long[(int) (total / Long.BYTES)];
+        try (IndexInput in = directory.openInput(fileName, IOContext.READONCE)) {
+          in.readLongs(entries, 0, entries.length);
+          new MemorySorter().sortAndConsume(bitsRequired, entries, entries.length, consumer);
+        }
+        return;
+      }
+
+      // sort offline
+      for (int i = 0; i < HISTOGRAM_SIZE; i++) {
+        buckets[i] = new Bucket();
+      }
+      String sourceFileName = fileName;
+      long indexFP = -1;
+      for (int shift = 0; shift < bitsRequired; shift += 8) {

Review Comment:
   Add a comment noting that it's OK to compare only doc IDs here, since this is an LSD radix sort, which is stable, and term IDs are already sorted?
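
For instance, a comment along these lines above the offline-sort loop would capture that invariant (wording is only a suggestion):

```java
// Sort one byte of the doc ID per pass (LSD radix sort). Comparing only the
// doc ID bits is enough: the sort is stable and the entries were written in
// term ID order, so entries with equal doc IDs keep their term-sorted order.
for (int shift = 0; shift < bitsRequired; shift += 8) {
  // ... existing loop body unchanged ...
}
```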


