[GitHub] jpountz commented on a change in pull request #525: LUCENE-8585: Index-time jump-tables for DocValues

GitBox Thu, 17 Jan 2019 09:59:31 -0800

jpountz commented on a change in pull request #525: LUCENE-8585: Index-time 
jump-tables for DocValues
URL: https://github.com/apache/lucene-solr/pull/525#discussion_r248773989


 ##########
 File path: 
lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
 ##########
 @@ -0,0 +1,663 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene80;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.EmptyDocValuesProducer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.MathUtil;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.packed.DirectMonotonicWriter;
+import org.apache.lucene.util.packed.DirectWriter;
+
+import static 
org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+import static 
org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT;
+import static 
org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
+
+/** writer for {@link Lucene80DocValuesFormat} */
+final class Lucene80DocValuesConsumer extends DocValuesConsumer implements 
Closeable {
+
+  IndexOutput data, meta;
+  final int maxDoc;
+
+  /** expert: Creates a new writer */
+  public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, 
String dataExtension, String metaCodec, String metaExtension) throws 
IOException {
+    boolean success = false;
+    try {
+      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, 
state.segmentSuffix, dataExtension);
+      data = state.directory.createOutput(dataName, state.context);
+      CodecUtil.writeIndexHeader(data, dataCodec, 
Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), 
state.segmentSuffix);
+      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, 
state.segmentSuffix, metaExtension);
+      meta = state.directory.createOutput(metaName, state.context);
+      CodecUtil.writeIndexHeader(meta, metaCodec, 
Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), 
state.segmentSuffix);
+      maxDoc = state.segmentInfo.maxDoc();
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    boolean success = false;
+    try {
+      if (meta != null) {
+        meta.writeInt(-1); // write EOF marker
+        CodecUtil.writeFooter(meta); // write checksum
+      }
+      if (data != null) {
+        CodecUtil.writeFooter(data); // write checksum
+      }
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(data, meta);
+      } else {
+        IOUtils.closeWhileHandlingException(data, meta);
+      }
+      meta = data = null;
+    }
+  }
+
+  @Override
+  public void addNumericField(FieldInfo field, DocValuesProducer 
valuesProducer) throws IOException {
+    meta.writeInt(field.number);
+    meta.writeByte(Lucene80DocValuesFormat.NUMERIC);
+
+    writeValues(field, new EmptyDocValuesProducer() {
+      @Override
+      public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws 
IOException {
+        return DocValues.singleton(valuesProducer.getNumeric(field));
+      }
+    });
+  }
+
+  private static class MinMaxTracker {
+    long min, max, numValues, spaceInBits;
+
+    MinMaxTracker() {
+      reset();
+      spaceInBits = 0;
+    }
+
+    private void reset() {
+      min = Long.MAX_VALUE;
+      max = Long.MIN_VALUE;
+      numValues = 0;
+    }
+
+    /** Accumulate a new value. */
+    void update(long v) {
+      min = Math.min(min, v);
+      max = Math.max(max, v);
+      ++numValues;
+    }
+
+    /** Update the required space. */
+    void finish() {
+      if (max > min) {
+        spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * 
numValues;
+      }
+    }
+
+    /** Update space usage and get ready for accumulating values for the next 
block. */
+    void nextBlock() {
+      finish();
+      reset();
+    }
+  }
+
+  private long[] writeValues(FieldInfo field, DocValuesProducer 
valuesProducer) throws IOException {
+    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+    int numDocsWithValue = 0;
+    MinMaxTracker minMax = new MinMaxTracker();
+    MinMaxTracker blockMinMax = new MinMaxTracker();
+    long gcd = 0;
+    Set<Long> uniqueValues = new HashSet<>();
+    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc 
= values.nextDoc()) {
+      for (int i = 0, count = values.docValueCount(); i < count; ++i) {
+        long v = values.nextValue();
+
+        if (gcd != 1) {
+          if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
+            // in that case v - minValue might overflow and make the GCD 
computation return
+            // wrong results. Since these extreme values are unlikely, we just 
discard
+            // GCD computation for them
+            gcd = 1;
+          } else if (minMax.numValues != 0) { // minValue needs to be set first
+            gcd = MathUtil.gcd(gcd, v - minMax.min);
+          }
+        }
+
+        minMax.update(v);
+        blockMinMax.update(v);
+        if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) {
+          blockMinMax.nextBlock();
+        }
+
+        if (uniqueValues != null
+            && uniqueValues.add(v)
+            && uniqueValues.size() > 256) {
+          uniqueValues = null;
+        }
+      }
+
+      numDocsWithValue++;
+    }
+
+    minMax.finish();
+    blockMinMax.finish();
+
+    final long numValues = minMax.numValues;
+    long min = minMax.min;
+    final long max = minMax.max;
+    assert blockMinMax.spaceInBits <= minMax.spaceInBits;
+
+    if (numDocsWithValue == 0) {              // meta[-2, 0]: No documents 
with values
+      meta.writeLong(-2); // docsWithFieldOffset
+      meta.writeLong(0L); // docsWithFieldLength
+      meta.writeShort((short) -1); // jumpTableEntryCount
+    } else if (numDocsWithValue == maxDoc) {  // meta[-1, 0]: All documents 
have values
+      meta.writeLong(-1); // docsWithFieldOffset
+      meta.writeLong(0L); // docsWithFieldLength
+      meta.writeShort((short) -1); // jumpTableEntryCount
+    } else {                                  // meta[data.offset, 
data.length]: IndexedDISI structure for documents with values
+      long offset = data.getFilePointer();
+      meta.writeLong(offset);// docsWithFieldOffset
+      values = valuesProducer.getSortedNumeric(field);
+      final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data);
+      meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+      meta.writeShort(jumpTableEntryCount);
+    }
+
+    meta.writeLong(numValues);
+    final int numBitsPerValue;
+    boolean doBlocks = false;
+    Map<Long, Integer> encode = null;
+    if (min >= max) {                         // meta[-1]: All values are 0
+      numBitsPerValue = 0;
+      meta.writeInt(-1); // tablesize
+    } else {
+      if (uniqueValues != null
+          && uniqueValues.size() > 1
+          && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < 
DirectWriter.unsignedBitsRequired((max - min) / gcd)) {
+        numBitsPerValue = 
DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1);
+        final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]);
+        Arrays.sort(sortedUniqueValues);
+        meta.writeInt(sortedUniqueValues.length); // tablesize
+        for (Long v : sortedUniqueValues) {
+          meta.writeLong(v); // table[] entry
+        }
+        encode = new HashMap<>();
+        for (int i = 0; i < sortedUniqueValues.length; ++i) {
+          encode.put(sortedUniqueValues[i], i);
+        }
+        min = 0;
+        gcd = 1;
+      } else {
+        uniqueValues = null;
+        // we do blocks if that appears to save 10+% storage
+        doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits 
/ minMax.spaceInBits <= 0.9;
+        if (doBlocks) {
+          numBitsPerValue = 0xFF;
+          meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); // tablesize
+        } else {
+          numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / 
gcd);
+          if (gcd == 1 && min > 0
+              && DirectWriter.unsignedBitsRequired(max) == 
DirectWriter.unsignedBitsRequired(max - min)) {
+            min = 0;
+          }
+          meta.writeInt(-1); // tablesize
+        }
+      }
+    }
+
+    meta.writeByte((byte) numBitsPerValue);
+    meta.writeLong(min);
+    meta.writeLong(gcd);
+    long startOffset = data.getFilePointer();
+    meta.writeLong(startOffset); // valueOffset
+    long jumpTableOffset = -1;
+    if (doBlocks) {
+      jumpTableOffset = 
writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
+    } else if (numBitsPerValue != 0) {
+      writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), 
numValues, numBitsPerValue, min, gcd, encode);
+    }
+    meta.writeLong(data.getFilePointer() - startOffset); // valuesLength
+    meta.writeLong(jumpTableOffset);
+    return new long[] {numDocsWithValue, numValues};
+  }
+
+  private void writeValuesSingleBlock(SortedNumericDocValues values, long 
numValues, int numBitsPerValue,
+      long min, long gcd, Map<Long, Integer> encode) throws IOException {
+    DirectWriter writer = DirectWriter.getInstance(data, numValues, 
numBitsPerValue);
+    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc 
= values.nextDoc()) {
+      for (int i = 0, count = values.docValueCount(); i < count; ++i) {
+        long v = values.nextValue();
+        if (encode == null) {
+          writer.add((v - min) / gcd);
+        } else {
+          writer.add(encode.get(v));
+        }
+      }
+    }
+    writer.finish();
+  }
+
+  // Returns the offset to the jump-table for vBPV
+  private long writeValuesMultipleBlocks(SortedNumericDocValues values, long 
gcd) throws IOException {
+    long[] offsets = new long[ArrayUtil.oversize(100, Long.BYTES)]; // 100 
blocks = 1.6M values
 
 Review comment:
   can you reduce the initial size so that testing exercises this line of code?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org

[GitHub] jpountz commented on a change in pull request #525: LUCENE-8585: Index-time jump-tables for DocValues

Reply via email to