iverase commented on a change in pull request #556: LUCENE-8673: Use radix 
partitioning when merging dimensional points
URL: https://github.com/apache/lucene-solr/pull/556#discussion_r254294954
 
 

 ##########
 File path: 
lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java
 ##########
 @@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FutureArrays;
+import org.apache.lucene.util.RadixSelector;
+
+
+/**
+ *
+ * Offline Radix selector for BKD tree.
+ *
+ *  @lucene.internal
+ * */
+public final class BKDRadixSelector {
+  //size of the histogram
+  private static final int HISTOGRAM_SIZE = 256;
+  //size of the online buffer: 8 KB
+  private static final int MAX_SIZE_OFFLINE_BUFFER = 1024 * 8;
+  // we store one histogram per recursion level
+  private final long[][] histogram;
+  //bytes per dimension
+  private final int bytesPerDim;
+  // number of bytes to be sorted: bytesPerDim + Integer.BYTES
+  private final int bytesSorted;
+  //data dimensions size
+  private final int packedBytesLength;
+  //flag to when we are moving to sort on heap
+  private final int maxPointsSortInHeap;
+  //reusable buffer
+  private final byte[] offlineBuffer;
+  //holder for partition points
+  private final int[] partitionBucket;
+  // scratch object to move bytes around
+  private final BytesRef bytesRef1 = new BytesRef();
+  // scratch object to move bytes around
+  private final BytesRef bytesRef2 = new BytesRef();
+  //Directory to create new Offline writer
+  private final Directory tempDir;
+  // prefix for temp files
+  private final String tempFileNamePrefix;
+
+  /**
+   * Sole constructor.
+   */
+  public BKDRadixSelector(int numDim, int bytesPerDim, int 
maxPointsSortInHeap, Directory tempDir, String tempFileNamePrefix) {
+    this.bytesPerDim = bytesPerDim;
+    this.packedBytesLength = numDim * bytesPerDim;
+    this.bytesSorted = bytesPerDim + Integer.BYTES;
+    this.maxPointsSortInHeap = 2 * maxPointsSortInHeap;
+    int numberOfPointsOffline  = MAX_SIZE_OFFLINE_BUFFER / (packedBytesLength 
+ Integer.BYTES);
+    this.offlineBuffer = new byte[numberOfPointsOffline * (packedBytesLength + 
Integer.BYTES)];
+    this.partitionBucket = new int[bytesSorted];
+    this.histogram = new long[bytesSorted][HISTOGRAM_SIZE];
+    this.bytesRef1.length = numDim * bytesPerDim;
+    this.tempDir = tempDir;
+    this.tempFileNamePrefix = tempFileNamePrefix;
+  }
+
+  /**
+   *
+   * Method to partition the input data. It returns the value of the dimension 
where
+   * the split happens. The method destroys the original writer.
+   *
+   */
+  public byte[] select(PointWriter points, PointWriter left, PointWriter 
right, long from, long to, long partitionPoint, int dim) throws IOException {
 
 Review comment:
   At the moment no, but one of the improvements it can be done is to re-use 
the idea of path slices on the previous versions and keep just the first 
`HeapPointWriter`. Then you do not need to copy bytes between heap objects and 
just adjust the start end end points for the different levels.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org
For additional commands, e-mail: dev-h...@lucene.apache.org

Reply via email to