jpountz commented on a change in pull request #556: LUCENE-8673: Use radix partitioning when merging dimensional points URL: https://github.com/apache/lucene-solr/pull/556#discussion_r254264333
########## File path: lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java ########## @@ -1239,90 +1017,68 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, /** The array (sized numDims) of PathSlice describe the cell we have currently recursed to. */ private void build(int nodeID, int leafNodeOffset, - PathSlice[] slices, - LongBitSet ordBitSet, + PointWriter data, IndexOutput out, + BKDRadixSelector radixSelector, byte[] minPackedValue, byte[] maxPackedValue, byte[] splitPackedValues, - long[] leafBlockFPs, - List<Closeable> toCloseHeroically) throws IOException { - - for(PathSlice slice : slices) { - assert slice.count == slices[0].count; - } - - if (numDataDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) { - // Special case for 1D, to cutover to heap once we recurse deeply enough: - slices[0] = switchToHeap(slices[0], toCloseHeroically); - } + long[] leafBlockFPs) throws IOException { if (nodeID >= leafNodeOffset) { // Leaf node: write block // We can write the block in any order so by default we write it sorted by the dimension that has the // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient - int sortedDim = 0; - int sortedDimCardinality = Integer.MAX_VALUE; - - for (int dim=0;dim<numDataDims;dim++) { - if (slices[dim].writer instanceof HeapPointWriter == false) { - // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started - // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer - slices[dim] = switchToHeap(slices[dim], toCloseHeroically); - } - PathSlice source = slices[dim]; + if (data instanceof HeapPointWriter == false) { + // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started + // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer + data = switchToHeap(data); + } - HeapPointWriter heapSource = (HeapPointWriter) source.writer; + // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point: + HeapPointWriter heapSource = (HeapPointWriter) data; - // Find common prefix by comparing first and last values, already sorted in this dimension: - heapSource.readPackedValue(Math.toIntExact(source.start), scratch1); - heapSource.readPackedValue(Math.toIntExact(source.start + source.count - 1), scratch2); + //we store common prefix on scratch1 + computeCommonPrefixLength(heapSource, scratch1); - int offset = dim * bytesPerDim; - commonPrefixLengths[dim] = bytesPerDim; - for(int j=0;j<bytesPerDim;j++) { - if (scratch1[offset+j] != scratch2[offset+j]) { - commonPrefixLengths[dim] = j; - break; - } + int sortedDim = 0; + int sortedDimCardinality = Integer.MAX_VALUE; + FixedBitSet[] usedBytes = new FixedBitSet[numDataDims]; + for (int dim = 0; dim < numDataDims; ++dim) { + if (commonPrefixLengths[dim] < bytesPerDim) { + usedBytes[dim] = new FixedBitSet(256); } - + } + //Find the dimension to compress + for (int dim = 0; dim < numDataDims; dim++) { int prefix = commonPrefixLengths[dim]; if (prefix < bytesPerDim) { - int cardinality = 1; - byte previous = scratch1[offset + prefix]; - for (long i = 1; i < source.count; ++i) { - heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2); - byte b = scratch2[offset + prefix]; - assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b); - if (b != previous) { - cardinality++; - previous = b; - } + int offset = dim * bytesPerDim; + for (int i = 0; i < heapSource.count(); ++i) { + heapSource.getPackedValueSlice(i, scratchBytesRef1); + int bucket = scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + prefix] & 0xff; + usedBytes[dim].set(bucket); } - assert cardinality <= 256; + int cardinality =usedBytes[dim].cardinality(); if (cardinality < sortedDimCardinality) { sortedDim = dim; sortedDimCardinality = cardinality; } } } - PathSlice source = slices[sortedDim]; - - // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point: - HeapPointWriter heapSource = (HeapPointWriter) source.writer; + sortHeapPointWriter(heapSource, sortedDim); // Save the block file pointer: leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer(); //System.out.println(" write leaf block @ fp=" + out.getFilePointer()); // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o // loading the values: - int count = Math.toIntExact(source.count); + int count = Math.toIntExact(heapSource.count()); assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset; - writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count); + writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(0), count); Review comment: s/Math.toIntExact(0)/0/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org