Re: [PR] Encode dense blocks of postings as bit sets. [lucene]

via GitHub Tue, 14 Jan 2025 05:16:46 -0800


msokolov commented on code in PR #14133:
URL: https://github.com/apache/lucene/pull/14133#discussion_r1914773748



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java:
##########
@@ -572,7 +597,36 @@ public int freq() throws IOException {
     }
 
     private void refillFullBlock() throws IOException {
-      forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+      int bitsPerValue = docIn.readByte();
+      if (bitsPerValue > 0) {
+        forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, 
docBuffer);
+        encoding = DeltaEncoding.PACKED;
+      } else if (bitsPerValue == 0) {
+        // dense block: 128 one bits
+        docBitSet.set(0, BLOCK_SIZE);
+        docBitSetBase = prevDocID + 1;
+        docCumulativeWordPopCounts[0] = Long.SIZE;
+        docCumulativeWordPopCounts[1] = 2 * Long.SIZE;
+        encoding = DeltaEncoding.UNARY;
+      } else {
+        assert level0LastDocID != NO_MORE_DOCS;
+        // block is encoded as a bit set
+        docBitSetBase = prevDocID + 1;
+        int numLongs = -bitsPerValue;
+        docIn.readLongs(docBitSet.getBits(), 0, numLongs);
+        // Note: this for loop auto-vectorizes
+        for (int i = 0; i < numLongs - 1; ++i) {
+          docCumulativeWordPopCounts[i] = 
Long.bitCount(docBitSet.getBits()[i]);
+        }
+        for (int i = 1; i < numLongs - 1; ++i) {

Review Comment:
   sneaky!



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java:
##########
@@ -572,7 +597,36 @@ public int freq() throws IOException {
     }
 
     private void refillFullBlock() throws IOException {
-      forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+      int bitsPerValue = docIn.readByte();
+      if (bitsPerValue > 0) {
+        forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, 
docBuffer);
+        encoding = DeltaEncoding.PACKED;
+      } else if (bitsPerValue == 0) {
+        // dense block: 128 one bits
+        docBitSet.set(0, BLOCK_SIZE);
+        docBitSetBase = prevDocID + 1;
+        docCumulativeWordPopCounts[0] = Long.SIZE;
+        docCumulativeWordPopCounts[1] = 2 * Long.SIZE;
+        encoding = DeltaEncoding.UNARY;
+      } else {
+        assert level0LastDocID != NO_MORE_DOCS;
+        // block is encoded as a bit set
+        docBitSetBase = prevDocID + 1;
+        int numLongs = -bitsPerValue;
+        docIn.readLongs(docBitSet.getBits(), 0, numLongs);
+        // Note: this for loop auto-vectorizes
+        for (int i = 0; i < numLongs - 1; ++i) {
+          docCumulativeWordPopCounts[i] = 
Long.bitCount(docBitSet.getBits()[i]);
+        }
+        for (int i = 1; i < numLongs - 1; ++i) {
+          docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1];
+        }
+        docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE;
+        assert docCumulativeWordPopCounts[numLongs - 2]

Review Comment:
   what happens if we have fewer than `BLOCK_SIZE` postings to encode? Do these 
go in a different encoding?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java:
##########
@@ -857,7 +913,19 @@ public int nextDoc() throws IOException {
         moveToNextLevel0Block();
       }
 
-      return this.doc = docBuffer[docBufferUpto++];
+      switch (encoding) {
+        case PACKED:
+          doc = docBuffer[docBufferUpto];
+          break;
+        case UNARY:
+          int next = docBitSet.nextSetBit(doc - docBitSetBase + 1);

Review Comment:
   I wonder if there would be any benefit in maintaining `next` as a member 
variable that would always be `doc - docBitSetBase`. I guess it would save a 
single addition here, so maybe not worth it



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java:
##########
@@ -405,7 +422,34 @@ private void flushDocBlock(boolean finishTerm) throws 
IOException {
         }
       }
       long numSkipBytes = level0Output.size();
-      forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output);
+      // Now we need to decide whether to encode block deltas as packed 
integers (FOR) or unary
+      // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set 
approach makes
+      // #advance() sometimes faster and #intoBitSet() much faster. Since the 
trade-off is not
+      // obvious, we make the decision purely based on storage efficiency, 
using the approach that
+      // requires fewer bits to encode the block.
+      int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer);
+      int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum());
+      int numBitSetLongs = FixedBitSet.bits2words(sum);
+      if (sum == BLOCK_SIZE) {
+        level0Output.writeByte((byte) 0);
+      } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || bitsPerValue * 
BLOCK_SIZE < sum) {
+        level0Output.writeByte((byte) bitsPerValue);
+        forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output);
+      } else {
+        // Storing doc deltas is more efficient using unary coding (ie. 
storing doc IDs as a bit
+        // set)
+        spareBitSet.clear(0, numBitSetLongs << 6);
+        int s = -1;
+        for (int i : docDeltaBuffer) {
+          s += i;
+          spareBitSet.set(s);
+        }
+        level0Output.writeByte((byte) -numBitSetLongs);

Review Comment:
   I guess this is guaranteed to fit in a byte by limits on `BLOCK_SIZE`?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java:
##########
@@ -572,7 +597,36 @@ public int freq() throws IOException {
     }
 
     private void refillFullBlock() throws IOException {
-      forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+      int bitsPerValue = docIn.readByte();
+      if (bitsPerValue > 0) {
+        forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, 
docBuffer);
+        encoding = DeltaEncoding.PACKED;
+      } else if (bitsPerValue == 0) {
+        // dense block: 128 one bits

Review Comment:
   confusing -- since we set the bitset to all zeros?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java:
##########
@@ -198,44 +181,32 @@ private static void innerPrefixSum16(int[] arr) {
 
   private final int[] tmp = new int[BLOCK_SIZE];
 
+  /** Return the number of bits per value required to store the given array. */
+  int bitsRequired(int[] ints) {
+    int or = 0;
+    for (int l : ints) {
+      or |= l;
+    }
+    assert or != 0;

Review Comment:
   Perhaps we could indicate this assumption in the javadoc -- it's not clear 
to me, at a glance, why this should be true. I guess this is only called with 
*all* the postings for a term or something?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Encode dense blocks of postings as bit sets. [lucene]

Reply via email to