[GitHub] jpountz commented on a change in pull request #556: LUCENE-8673: Use radix partitioning when merging dimensional points

GitBox Wed, 06 Feb 2019 06:22:42 -0800

jpountz commented on a change in pull request #556: LUCENE-8673: Use radix 
partitioning when merging dimensional points
URL: https://github.com/apache/lucene-solr/pull/556#discussion_r254264333


 ##########
 File path: 
lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
 ##########
 @@ -1239,90 +1017,68 @@ assert valuesInOrderAndBounds(count, sortedDim, 
minPackedValue, maxPackedValue,
 
   /** The array (sized numDims) of PathSlice describe the cell we have 
currently recursed to. */
   private void build(int nodeID, int leafNodeOffset,
-                     PathSlice[] slices,
-                     LongBitSet ordBitSet,
+                     PointWriter data,
                      IndexOutput out,
+                     BKDRadixSelector radixSelector,
                      byte[] minPackedValue, byte[] maxPackedValue,
                      byte[] splitPackedValues,
-                     long[] leafBlockFPs,
-                     List<Closeable> toCloseHeroically) throws IOException {
-
-    for(PathSlice slice : slices) {
-      assert slice.count == slices[0].count;
-    }
-
-    if (numDataDims == 1 && slices[0].writer instanceof OfflinePointWriter && 
slices[0].count <= maxPointsSortInHeap) {
-      // Special case for 1D, to cutover to heap once we recurse deeply enough:
-      slices[0] = switchToHeap(slices[0], toCloseHeroically);
-    }
+                     long[] leafBlockFPs) throws IOException {
 
     if (nodeID >= leafNodeOffset) {
 
       // Leaf node: write block
       // We can write the block in any order so by default we write it sorted 
by the dimension that has the
       // least number of unique bytes at commonPrefixLengths[dim], which makes 
compression more efficient
-      int sortedDim = 0;
-      int sortedDimCardinality = Integer.MAX_VALUE;
-
-      for (int dim=0;dim<numDataDims;dim++) {
-        if (slices[dim].writer instanceof HeapPointWriter == false) {
-          // Adversarial cases can cause this, e.g. very lopsided data, all 
equal points, such that we started
-          // offline, but then kept splitting only in one dimension, and so 
never had to rewrite into heap writer
-          slices[dim] = switchToHeap(slices[dim], toCloseHeroically);
-        }
 
-        PathSlice source = slices[dim];
+      if (data instanceof HeapPointWriter == false) {
+        // Adversarial cases can cause this, e.g. very lopsided data, all 
equal points, such that we started
+        // offline, but then kept splitting only in one dimension, and so 
never had to rewrite into heap writer
+        data = switchToHeap(data);
+      }
 
-        HeapPointWriter heapSource = (HeapPointWriter) source.writer;
+      // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we 
better be in heap at this point:
+      HeapPointWriter heapSource = (HeapPointWriter) data;
 
-        // Find common prefix by comparing first and last values, already 
sorted in this dimension:
-        heapSource.readPackedValue(Math.toIntExact(source.start), scratch1);
-        heapSource.readPackedValue(Math.toIntExact(source.start + source.count 
- 1), scratch2);
+      //we store common prefix on scratch1
+      computeCommonPrefixLength(heapSource, scratch1);
 
-        int offset = dim * bytesPerDim;
-        commonPrefixLengths[dim] = bytesPerDim;
-        for(int j=0;j<bytesPerDim;j++) {
-          if (scratch1[offset+j] != scratch2[offset+j]) {
-            commonPrefixLengths[dim] = j;
-            break;
-          }
+      int sortedDim = 0;
+      int sortedDimCardinality = Integer.MAX_VALUE;
+      FixedBitSet[] usedBytes = new FixedBitSet[numDataDims];
+      for (int dim = 0; dim < numDataDims; ++dim) {
+        if (commonPrefixLengths[dim] < bytesPerDim) {
+          usedBytes[dim] = new FixedBitSet(256);
         }
-
+      }
+      //Find the dimension to compress
+      for (int dim = 0; dim < numDataDims; dim++) {
         int prefix = commonPrefixLengths[dim];
         if (prefix < bytesPerDim) {
-          int cardinality = 1;
-          byte previous = scratch1[offset + prefix];
-          for (long i = 1; i < source.count; ++i) {
-            heapSource.readPackedValue(Math.toIntExact(source.start + i), 
scratch2);
-            byte b = scratch2[offset + prefix];
-            assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
-            if (b != previous) {
-              cardinality++;
-              previous = b;
-            }
+          int offset = dim * bytesPerDim;
+          for (int i = 0; i < heapSource.count(); ++i) {
+            heapSource.getPackedValueSlice(i, scratchBytesRef1);
+            int bucket = scratchBytesRef1.bytes[scratchBytesRef1.offset + 
offset + prefix] & 0xff;
+            usedBytes[dim].set(bucket);
           }
-          assert cardinality <= 256;
+          int cardinality =usedBytes[dim].cardinality();
           if (cardinality < sortedDimCardinality) {
             sortedDim = dim;
             sortedDimCardinality = cardinality;
           }
         }
       }
 
-      PathSlice source = slices[sortedDim];
-
-      // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we 
better be in heap at this point:
-      HeapPointWriter heapSource = (HeapPointWriter) source.writer;
+      sortHeapPointWriter(heapSource, sortedDim);
 
       // Save the block file pointer:
       leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
       //System.out.println("  write leaf block @ fp=" + out.getFilePointer());
 
       // Write docIDs first, as their own chunk, so that at intersect time we 
can add all docIDs w/o
       // loading the values:
-      int count = Math.toIntExact(source.count);
+      int count = Math.toIntExact(heapSource.count());
       assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + 
leafNodeOffset;
-      writeLeafBlockDocs(out, heapSource.docIDs, 
Math.toIntExact(source.start), count);
+      writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(0), count);
 
 Review comment:
   s/Math.toIntExact(0)/0/

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org
For additional commands, e-mail: dev-h...@lucene.apache.org

[GitHub] jpountz commented on a change in pull request #556: LUCENE-8673: Use radix partitioning when merging dimensional points

Reply via email to