jpountz commented on a change in pull request #525: LUCENE-8585: Index-time jump-tables for DocValues URL: https://github.com/apache/lucene-solr/pull/525#discussion_r248773989
########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java ########## @@ -0,0 +1,663 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene80; + + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.EmptyDocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.store.GrowableByteArrayDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.packed.DirectMonotonicWriter; +import org.apache.lucene.util.packed.DirectWriter; + +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE; + +/** writer for {@link Lucene80DocValuesFormat} */ +final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable { + + IndexOutput data, meta; + final int maxDoc; + + /** expert: Creates a new writer */ + public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + maxDoc = state.segmentInfo.maxDoc(); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeInt(-1); // write EOF marker + CodecUtil.writeFooter(meta); // write checksum + } + if (data != null) { + CodecUtil.writeFooter(data); // write checksum + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + meta = data = null; + } + } + + @Override + public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + meta.writeInt(field.number); + meta.writeByte(Lucene80DocValuesFormat.NUMERIC); + + writeValues(field, new EmptyDocValuesProducer() { + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + return DocValues.singleton(valuesProducer.getNumeric(field)); + } + }); + } + + private static class MinMaxTracker { + long min, max, numValues, spaceInBits; + + MinMaxTracker() { + reset(); + spaceInBits = 0; + } + + private void reset() { + min = Long.MAX_VALUE; + max = Long.MIN_VALUE; + numValues = 0; + } + + /** Accumulate a new value. */ + void update(long v) { + min = Math.min(min, v); + max = Math.max(max, v); + ++numValues; + } + + /** Update the required space. */ + void finish() { + if (max > min) { + spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * numValues; + } + } + + /** Update space usage and get ready for accumulating values for the next block. */ + void nextBlock() { + finish(); + reset(); + } + } + + private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); + int numDocsWithValue = 0; + MinMaxTracker minMax = new MinMaxTracker(); + MinMaxTracker blockMinMax = new MinMaxTracker(); + long gcd = 0; + Set<Long> uniqueValues = new HashSet<>(); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0, count = values.docValueCount(); i < count; ++i) { + long v = values.nextValue(); + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (minMax.numValues != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minMax.min); + } + } + + minMax.update(v); + blockMinMax.update(v); + if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) { + blockMinMax.nextBlock(); + } + + if (uniqueValues != null + && uniqueValues.add(v) + && uniqueValues.size() > 256) { + uniqueValues = null; + } + } + + numDocsWithValue++; + } + + minMax.finish(); + blockMinMax.finish(); + + final long numValues = minMax.numValues; + long min = minMax.min; + final long max = minMax.max; + assert blockMinMax.spaceInBits <= minMax.spaceInBits; + + if (numDocsWithValue == 0) { // meta[-2, 0]: No documents with values + meta.writeLong(-2); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + } else if (numDocsWithValue == maxDoc) { // meta[-1, 0]: All documents has values + meta.writeLong(-1); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + } else { // meta[data.offset, data.length]: IndexedDISI structure for documents with values + long offset = data.getFilePointer(); + meta.writeLong(offset);// docsWithFieldOffset + values = valuesProducer.getSortedNumeric(field); + final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data); + meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength + meta.writeShort(jumpTableEntryCount); + } + + meta.writeLong(numValues); + final int numBitsPerValue; + boolean doBlocks = false; + Map<Long, Integer> encode = null; + if (min >= max) { // meta[-1]: All values are 0 + numBitsPerValue = 0; + meta.writeInt(-1); // tablesize + } else { + if (uniqueValues != null + && uniqueValues.size() > 1 + && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) { + numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1); + final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]); + Arrays.sort(sortedUniqueValues); + meta.writeInt(sortedUniqueValues.length); // tablesize + for (Long v : sortedUniqueValues) { + meta.writeLong(v); // table[] entry + } + encode = new HashMap<>(); + for (int i = 0; i < sortedUniqueValues.length; ++i) { + encode.put(sortedUniqueValues[i], i); + } + min = 0; + gcd = 1; + } else { + uniqueValues = null; + // we do blocks if that appears to save 10+% storage + doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9; + if (doBlocks) { + numBitsPerValue = 0xFF; + meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); // tablesize + } else { + numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd); + if (gcd == 1 && min > 0 + && DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) { + min = 0; + } + meta.writeInt(-1); // tablesize + } + } + } + + meta.writeByte((byte) numBitsPerValue); + meta.writeLong(min); + meta.writeLong(gcd); + long startOffset = data.getFilePointer(); + meta.writeLong(startOffset); // valueOffset + long jumpTableOffset = -1; + if (doBlocks) { + jumpTableOffset = writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd); + } else if (numBitsPerValue != 0) { + writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode); + } + meta.writeLong(data.getFilePointer() - startOffset); // valuesLength + meta.writeLong(jumpTableOffset); + return new long[] {numDocsWithValue, numValues}; + } + + private void writeValuesSingleBlock(SortedNumericDocValues values, long numValues, int numBitsPerValue, + long min, long gcd, Map<Long, Integer> encode) throws IOException { + DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0, count = values.docValueCount(); i < count; ++i) { + long v = values.nextValue(); + if (encode == null) { + writer.add((v - min) / gcd); + } else { + writer.add(encode.get(v)); + } + } + } + writer.finish(); + } + + // Returns the offset to the jump-table for vBPV + private long writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException { + long[] offsets = new long[ArrayUtil.oversize(100, Long.BYTES)]; // 100 blocks = 1.6M values Review comment: can you reduce the initial size so that testing exercises this line of code? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
