Github user jpountz commented on a diff in the pull request:
https://github.com/apache/lucene-solr/pull/525#discussion_r243642604
--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java ---
@@ -0,0 +1,663 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene80;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.EmptyDocValuesProducer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.MathUtil;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.packed.DirectMonotonicWriter;
+import org.apache.lucene.util.packed.DirectWriter;
+
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT;
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
+
+/** writer for {@link Lucene80DocValuesFormat} */
+final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {
+
+ IndexOutput data, meta;
+ final int maxDoc;
+
+ /** expert: Creates a new writer */
+  public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ boolean success = false;
+ try {
+      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+      data = state.directory.createOutput(dataName, state.context);
+      CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
+      meta = state.directory.createOutput(metaName, state.context);
+      CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ maxDoc = state.segmentInfo.maxDoc();
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ boolean success = false;
+ try {
+ if (meta != null) {
+ meta.writeInt(-1); // write EOF marker
+ CodecUtil.writeFooter(meta); // write checksum
+ }
+ if (data != null) {
+ CodecUtil.writeFooter(data); // write checksum
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(data, meta);
+ } else {
+ IOUtils.closeWhileHandlingException(data, meta);
+ }
+ meta = data = null;
+ }
+ }
+
+ @Override
+  public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+ meta.writeInt(field.number);
+ meta.writeByte(Lucene80DocValuesFormat.NUMERIC);
+
+ writeValues(field, new EmptyDocValuesProducer() {
+ @Override
+      public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
+ return DocValues.singleton(valuesProducer.getNumeric(field));
+ }
+ });
+ }
+
+ private static class MinMaxTracker {
+ long min, max, numValues, spaceInBits;
+
+ MinMaxTracker() {
+ reset();
+ spaceInBits = 0;
+ }
+
+ private void reset() {
+ min = Long.MAX_VALUE;
+ max = Long.MIN_VALUE;
+ numValues = 0;
+ }
+
+ /** Accumulate a new value. */
+ void update(long v) {
+ min = Math.min(min, v);
+ max = Math.max(max, v);
+ ++numValues;
+ }
+
+ /** Update the required space. */
+ void finish() {
+ if (max > min) {
+        spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * numValues;
+ }
+ }
+
+    /** Update space usage and get ready for accumulating values for the next block. */
+ void nextBlock() {
+ finish();
+ reset();
+ }
+ }
+
+  private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+ SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+ int numDocsWithValue = 0;
+ MinMaxTracker minMax = new MinMaxTracker();
+ MinMaxTracker blockMinMax = new MinMaxTracker();
+ long gcd = 0;
+ Set<Long> uniqueValues = new HashSet<>();
+    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+ for (int i = 0, count = values.docValueCount(); i < count; ++i) {
+ long v = values.nextValue();
+
+ if (gcd != 1) {
+ if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
+            // in that case v - minValue might overflow and make the GCD computation return
+            // wrong results. Since these extreme values are unlikely, we just discard
+            // GCD computation for them
+            gcd = 1;
+          } else if (minMax.numValues != 0) { // minValue needs to be set first
+ gcd = MathUtil.gcd(gcd, v - minMax.min);
+ }
+ }
+
+ minMax.update(v);
+ blockMinMax.update(v);
+ if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) {
+ blockMinMax.nextBlock();
+ }
+
+ if (uniqueValues != null
+ && uniqueValues.add(v)
+ && uniqueValues.size() > 256) {
+ uniqueValues = null;
+ }
+ }
+
+ numDocsWithValue++;
+ }
+
+ minMax.finish();
+ blockMinMax.finish();
+
+ final long numValues = minMax.numValues;
+ long min = minMax.min;
+ final long max = minMax.max;
+ assert blockMinMax.spaceInBits <= minMax.spaceInBits;
+
+    if (numDocsWithValue == 0) {                 // meta[-2, 0]: No documents with values
+ meta.writeLong(-2); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+    } else if (numDocsWithValue == maxDoc) {     // meta[-1, 0]: All documents have values
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+    } else {                                     // meta[data.offset, data.length]: IndexedDISI structure for documents with values
+ long offset = data.getFilePointer();
+ meta.writeLong(offset);// docsWithFieldOffset
+ values = valuesProducer.getSortedNumeric(field);
+      final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data);
+      meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableEntryCount);
+ }
+
+ meta.writeLong(numValues);
+ final int numBitsPerValue;
+ boolean doBlocks = false;
+ Map<Long, Integer> encode = null;
+ if (min >= max) { // meta[-1]: All values are 0
+ numBitsPerValue = 0;
+ meta.writeInt(-1); // tablesize
+ } else {
+ if (uniqueValues != null
+ && uniqueValues.size() > 1
+          && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) {
+        numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1);
+        final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]);
+ Arrays.sort(sortedUniqueValues);
+ meta.writeInt(sortedUniqueValues.length); // tablesize
+ for (Long v : sortedUniqueValues) {
+ meta.writeLong(v); // table[] entry
+ }
+ encode = new HashMap<>();
+ for (int i = 0; i < sortedUniqueValues.length; ++i) {
+ encode.put(sortedUniqueValues[i], i);
+ }
+ min = 0;
+ gcd = 1;
+ } else {
+ uniqueValues = null;
+ // we do blocks if that appears to save 10+% storage
+        doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9;
+ if (doBlocks) {
+ numBitsPerValue = 0xFF;
+ meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); // tablesize
+ } else {
+          numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
+          if (gcd == 1 && min > 0
+              && DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) {
+ min = 0;
+ }
+ meta.writeInt(-1); // tablesize
+ }
+ }
+ }
+
+ meta.writeByte((byte) numBitsPerValue);
+ meta.writeLong(min);
+ meta.writeLong(gcd);
+ long startOffset = data.getFilePointer();
+ meta.writeLong(startOffset); // valueOffset
+ long jumpTableOffset = -1;
+ if (doBlocks) {
+      jumpTableOffset = writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
+ } else if (numBitsPerValue != 0) {
+      writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode);
+ }
+ meta.writeLong(data.getFilePointer() - startOffset); // valuesLength
+ meta.writeLong(jumpTableOffset);
+ return new long[] {numDocsWithValue, numValues};
+ }
+
+  private void writeValuesSingleBlock(SortedNumericDocValues values, long numValues, int numBitsPerValue,
+ long min, long gcd, Map<Long, Integer> encode) throws IOException {
+    DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue);
+    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+ for (int i = 0, count = values.docValueCount(); i < count; ++i) {
+ long v = values.nextValue();
+ if (encode == null) {
+ writer.add((v - min) / gcd);
+ } else {
+ writer.add(encode.get(v));
+ }
+ }
+ }
+ writer.finish();
+ }
+
+ // Returns the offset to the jump-table for vBPV
+  private long writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException {
+    long[] offsets = new long[ArrayUtil.oversize(100, Long.BYTES)]; // 100 blocks = 1.6M values
--- End diff ---
should we start with ArrayUtil.oversize(1, Long.BYTES) to be more likely to
exercise the growth logic in tests?
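
A minimal sketch of what that might look like (hypothetical: the diff above cuts off before the growth code, so the ArrayUtil.grow call below is only my assumption about how the offsets array gets resized; ArrayUtil.oversize and ArrayUtil.grow themselves are existing Lucene utilities):

    // hypothetical sketch: start the offsets array at minimal capacity so that
    // tests are more likely to hit the growth path
    long[] offsets = new long[ArrayUtil.oversize(1, Long.BYTES)];
    int block = 0;
    // ... for every completed block of NUMERIC_BLOCK_SIZE values ...
    if (block == offsets.length) {
      // grow via ArrayUtil so resizing stays amortized
      offsets = ArrayUtil.grow(offsets, block + 1);
    }
    offsets[block++] = data.getFilePointer(); // record where this block's data starts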
---