jpountz commented on a change in pull request #1234: Add compression for Binary 
doc value fields
URL: https://github.com/apache/lucene-solr/pull/1234#discussion_r376529195
 
 

 ##########
 File path: 
lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java
 ##########
 @@ -742,6 +755,131 @@ public BytesRef binaryValue() throws IOException {
         };
       }
     }
+  }  
+  
+  // Decompresses blocks of binary values to retrieve content
+  class BinaryDecoder {
+    
+    private final LongValues addresses;
+    private final IndexInput compressedData;
+    // Cache of last uncompressed block 
+    private long lastBlockId = -1;
+    private int []uncompressedDocEnds = new 
int[Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK];
+    private int uncompressedBlockLength = 0;        
+    private int numDocsInBlock = 0;
+    private final byte[] uncompressedBlock;
+    private final BytesRef uncompressedBytesRef;
+    
+    public BinaryDecoder(LongValues addresses, IndexInput compressedData, int 
biggestUncompressedBlockSize) {
+      super();
+      this.addresses = addresses;
+      this.compressedData = compressedData;
+      // pre-allocate a byte array large enough for the biggest uncompressed 
block needed.
+      this.uncompressedBlock = new byte[biggestUncompressedBlockSize];
+      uncompressedBytesRef = new BytesRef(uncompressedBlock);
+      
+    }
+
+    BytesRef decode(int docNumber) throws IOException {
+      int blockId = docNumber >> Lucene80DocValuesFormat.BINARY_BLOCK_SHIFT; 
+      int docInBlockId = docNumber % 
Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK;
+      assert docInBlockId < 
Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK;
+      
+      
+      // already read and uncompressed?
+      if (blockId != lastBlockId) {
+        lastBlockId = blockId;
+        long blockStartOffset = addresses.get(blockId);
+        compressedData.seek(blockStartOffset);
+        
+        numDocsInBlock = compressedData.readVInt();
+        assert numDocsInBlock <= 
Lucene80DocValuesFormat.BINARY_DOCS_PER_COMPRESSED_BLOCK;
+        uncompressedDocEnds = new int[numDocsInBlock];
+        uncompressedBlockLength = 0;        
+
+        int onlyLength = -1;
+        for (int i = 0; i < numDocsInBlock; i++) {
+          if (i == 0) {
+            // The first length value is special. It is shifted and has a bit 
to denote if
+            // all other values are the same length
+            int lengthPlusSameInd = compressedData.readVInt();
+            int sameIndicator = lengthPlusSameInd & 1;
+            int firstValLength = lengthPlusSameInd >>1;
 
 Review comment:
   Since you are stealing a bit, we should do an unsigned shift (`>>>`) instead.
   
   This would never be a problem in practice, but imagine that the length was a 
31-bit integer. Shifting it left by one bit at index time would make this 
number negative. So here we need an unsigned shift rather than a signed shift 
that preserves the sign.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to