Github user jpountz commented on a diff in the pull request:
https://github.com/apache/lucene-solr/pull/525#discussion_r243642604
--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java ---
@@ -0,0 +1,663 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene80;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.EmptyDocValuesProducer;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.MathUtil;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.packed.DirectMonotonicWriter;
+import org.apache.lucene.util.packed.DirectWriter;
+
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT;
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
+
+/** writer for {@link Lucene80DocValuesFormat} */
+final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {
+
+ IndexOutput data, meta;
+ final int maxDoc;
+
+ /** expert: Creates a new writer */
+  public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ boolean success = false;
+ try {
+      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
+      data = state.directory.createOutput(dataName, state.context);
+      CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
+      meta = state.directory.createOutput(metaName, state.context);
+      CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ maxDoc = state.segmentInfo.maxDoc();
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ boolean success = false;
+ try {
+ if (meta != null) {
+ meta.writeInt(-1); // write EOF marker
+ CodecUtil.writeFooter(meta); // write checksum
+ }
+ if (data != null) {
+ CodecUtil.writeFooter(data); // write checksum
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(data, meta);
+ } else {
+ IOUtils.closeWhileHandlingException(data, meta);
+ }
+ meta = data = null;
+ }
+ }
+
+ @Override
+  public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+ meta.writeInt(field.number);
+ meta.writeByte(Lucene80DocValuesFormat.NUMERIC);
+
+ writeValues(field, new EmptyDocValuesProducer() {
+ @Override
+      public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
+ return DocValues.singleton(valuesProducer.getNumeric(field));
+ }
+ });
+ }
+
+ private static class MinMaxTracker {
+ long min, max, numValues, spaceInBits;
+
+ MinMaxTracker() {
+ reset();
+ spaceInBits = 0;
+ }
+
+ private void reset() {
+ min = Long.MAX_VALUE;
+ max = Long.MIN_VALUE;
+ numValues = 0;
+ }
+
+ /** Accumulate a new value. */
+ void update(long v) {
+ min = Math.min(min, v);
+ max = Math.max(max, v);
+ ++numValues;
+ }
+
+ /** Update the required space. */
+ void finish() {
+ if (max > min) {
+        spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * numValues;
+ }
+ }
+
+    /** Update space usage and get ready for accumulating values for the next block. */
+ void nextBlock() {
+ finish();
+ reset();
+ }
+ }
+
+  private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+ SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+ int numDocsWithValue = 0;
+ MinMaxTracker minMax = new MinMaxTracker();
+ MinMaxTracker blockMinMax = new MinMaxTracker();
+ long gcd = 0;
+ Set<Long> uniqueValues = new HashSet<>();
+    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+ for (int i = 0, count = values.docValueCount(); i < count; ++i) {
+ long v = values.nextValue();
+
+ if (gcd != 1) {
+ if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
+            // in that case v - minValue might overflow and make the GCD computation return
+            // wrong results. Since these extreme values are unlikely, we just discard
+            // GCD computation for them
+            gcd = 1;
+          } else if (minMax.numValues != 0) { // minValue needs to be set first
+ gcd = MathUtil.gcd(gcd, v - minMax.min);
+ }
+ }
+
+ minMax.update(v);
+ blockMinMax.update(v);
+ if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) {
+ blockMinMax.nextBlock();
+ }
+
+ if (uniqueValues != null
+ && uniqueValues.add(v)
+ && uniqueValues.size() > 256) {
+ uniqueValues = null;
+ }
+ }
+
+ numDocsWithValue++;
+ }
+
+ minMax.finish();
+ blockMinMax.finish();
+
+ final long numValues = minMax.numValues;
+ long min = minMax.min;
+ final long max = minMax.max;
+ assert blockMinMax.spaceInBits <= minMax.spaceInBits;
+
+    if (numDocsWithValue == 0) {                 // meta[-2, 0]: No documents with values
+ meta.writeLong(-2); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+    } else if (numDocsWithValue == maxDoc) {     // meta[-1, 0]: All documents have values
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+    } else {                                     // meta[data.offset, data.length]: IndexedDISI structure for documents with values
+ long offset = data.getFilePointer();
+ meta.writeLong(offset);// docsWithFieldOffset
+ values = valuesProducer.getSortedNumeric(field);
+      final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data);
+      meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableEntryCount);
+ }
+
+ meta.writeLong(numValues);
+ final int numBitsPerValue;
+ boolean doBlocks = false;
+ Map<Long, Integer> encode = null;
+ if (min >= max) { // meta[-1]: All values are 0
+ numBitsPerValue = 0;
+ meta.writeInt(-1); // tablesize
+ } else {
+ if (uniqueValues != null
+ && uniqueValues.size() > 1
+          && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) {
+        numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1);
+        final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]);
+ Arrays.sort(sortedUniqueValues);
+ meta.writeInt(sortedUniqueValues.length); // tablesize
+ for (Long v : sortedUniqueValues) {
+ meta.writeLong(v); // table[] entry
+ }
+ encode = new HashMap<>();
+ for (int i = 0; i < sortedUniqueValues.length; ++i) {
+ encode.put(sortedUniqueValues[i], i);
+ }
+ min = 0;
+ gcd = 1;
+ } else {
+ uniqueValues = null;
+ // we do blocks if that appears to save 10+% storage
+        doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9;
+ if (doBlocks) {
+ numBitsPerValue = 0xFF;
+ meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); // tablesize
+ } else {
+          numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
+          if (gcd == 1 && min > 0
+              && DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) {
+ min = 0;
+ }
+ meta.writeInt(-1); // tablesize
+ }
+ }
+ }
+
+ meta.writeByte((byte) numBitsPerValue);
+ meta.writeLong(min);
+ meta.writeLong(gcd);
+ long startOffset = data.getFilePointer();
+ meta.writeLong(startOffset); // valueOffset
+ long jumpTableOffset = -1;
+ if (doBlocks) {
+      jumpTableOffset = writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
+ } else if (numBitsPerValue != 0) {
+      writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode);
+ }
+ meta.writeLong(data.getFilePointer() - startOffset); // valuesLength
+ meta.writeLong(jumpTableOffset);
+ return new long[] {numDocsWithValue, numValues};
+ }
+
+  private void writeValuesSingleBlock(SortedNumericDocValues values, long numValues, int numBitsPerValue,
+ long min, long gcd, Map<Long, Integer> encode) throws IOException {
+    DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue);
+    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
+ for (int i = 0, count = values.docValueCount(); i < count; ++i) {
+ long v = values.nextValue();
+ if (encode == null) {
+ writer.add((v - min) / gcd);
+ } else {
+ writer.add(encode.get(v));
+ }
+ }
+ }
+ writer.finish();
+ }
+
+ // Returns the offset to the jump-table for vBPV
+  private long writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException {
+    long[] offsets = new long[ArrayUtil.oversize(100, Long.BYTES)]; // 100 blocks = 1.6M values
--- End diff ---
should we start with ArrayUtil.oversize(1, Long.BYTES) to be more likely to
exercise the growth logic in tests?
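
A minimal sketch of what that might look like (hypothetical: the diff above cuts off before the growth code, so the ArrayUtil.grow call below is only my assumption about how the offsets array gets resized; ArrayUtil.oversize and ArrayUtil.grow themselves are existing Lucene utilities):

    // hypothetical sketch: start the offsets array at minimal capacity so that
    // tests are more likely to hit the growth path
    long[] offsets = new long[ArrayUtil.oversize(1, Long.BYTES)];
    int block = 0;
    // ... for every completed block of NUMERIC_BLOCK_SIZE values ...
    if (block == offsets.length) {
      // grow via ArrayUtil so resizing stays amortized
      offsets = ArrayUtil.grow(offsets, block + 1);
    }
    offsets[block++] = data.getFilePointer(); // record where this block's data starts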
---