Github user shaie commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/513#discussion_r238938229 --- Diff: lucene/core/src/java/org/apache/lucene/index/FieldUpdatesBuffer.java --- @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefArray; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.Counter; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * This class efficiently buffers numeric and binary field updates and stores + * terms, values and metadata in a memory efficient way without creating large amounts + * of objects. Update terms are stored without de-duplicating the update term. + * In general we try to optimize for several use-cases. For instance we try to use constant + * space for update terms field since the common case always updates on the same field. Also for docUpTo + * we try to optimize for the case when updates should be applied to all docs ie. docUpTo=Integer.MAX_VALUE. 
+ * In other cases each update will likely have a different docUpTo. + * Along the same lines this impl optimizes the case when all updates have a value. Lastly, the soft_deletes case + * where all values for a specific field is shared this also stores numeric values only once if all updates share + * the same value. + */ +final class FieldUpdatesBuffer { + private final Counter bytesUsed; + private int numUpdates = 1; + // we use a very simple approach and store the update term values without de-duplication + // which is also not a common case to keep updating the same value more than once... + // we might pay a higher price in terms of memory in certain cases but will gain + // on CPU for those. We also save on not needing to sort in order to apply the terms in order + // since by definition we store them in order. + private final BytesRefArray termValues; + private final BytesRefArray byteValues; // this will be null if we are buffering numerics + private int[] docsUpTo = null; + private long[] numericValues; // this will be null if we are buffering binaries + private boolean[] hasValues; + private String[] fields; + private final String firstField; + private final boolean firstHasValue; + private long firstNumericValue; + private final int firstDocUpTo; + private final boolean isNumeric; + + private FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate initialValue, int docUpTo, boolean isNumeric) { + this.bytesUsed = bytesUsed; + termValues = new BytesRefArray(bytesUsed); + termValues.append(initialValue.term.bytes); + firstField = initialValue.term.field; + firstDocUpTo = docUpTo; + firstHasValue = initialValue.hasValue; + if (firstHasValue == false) { + hasValues = new boolean[] {false}; + bytesUsed.addAndGet(1); + } + this.isNumeric = isNumeric; + byteValues = isNumeric ? 
null : new BytesRefArray(bytesUsed); + } + + FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate.NumericDocValuesUpdate initialValue, int docUpTo) { + this(bytesUsed, initialValue, docUpTo, true); + if (initialValue.hasValue) { + firstNumericValue = initialValue.getValue(); + } + } + + FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate.BinaryDocValuesUpdate initialValue, int docUpTo) { + this(bytesUsed, initialValue, docUpTo, false); + if (initialValue.hasValue()) { + byteValues.append(initialValue.getValue()); + } + } + + void add(String field, int docUpTo, int ord, boolean hasValue) { + if (this.firstField.equals(field) == false || fields != null) { + if (fields == null) { + int newSize = ArrayUtil.oversize(ord + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + fields = new String[newSize]; + bytesUsed.addAndGet(newSize * RamUsageEstimator.NUM_BYTES_OBJECT_REF); + Arrays.fill(fields, 0, ord, firstField); + } else if (fields.length <= ord) { + int newSize = ArrayUtil.oversize(ord + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + String[] array = new String[newSize]; + System.arraycopy(fields, 0, array, 0, fields.length); + bytesUsed.addAndGet((newSize - fields.length) * RamUsageEstimator.NUM_BYTES_OBJECT_REF); + fields = array; + } + fields[ord] = field; + } + + if (this.firstDocUpTo != docUpTo || docsUpTo != null) { + if (docsUpTo == null) { + int newSize = ArrayUtil.oversize(ord + 1, Integer.BYTES); + bytesUsed.addAndGet(newSize * Integer.BYTES); + docsUpTo = new int[newSize]; + Arrays.fill(docsUpTo, 0, ord, firstDocUpTo); + } else if (docsUpTo.length <= ord) { + int newSize = ArrayUtil.oversize(ord + 1, Integer.BYTES); + int[] array = new int[newSize]; + System.arraycopy(docsUpTo, 0, array, 0, docsUpTo.length); + bytesUsed.addAndGet((newSize-docsUpTo.length) * Integer.BYTES); + docsUpTo = array; + } + docsUpTo[ord] = docUpTo; + } + + if (hasValue == false || hasValues != null) { + if (hasValues == null) { + int newSize = ArrayUtil.oversize(ord + 1, 1); + 
bytesUsed.addAndGet(newSize); + hasValues = new boolean[newSize]; + Arrays.fill(hasValues, 0, ord, true); + } else if (hasValues.length <= ord) { + int newSize = ArrayUtil.oversize(ord + 1, 1); + boolean[] array = new boolean[newSize]; + System.arraycopy(hasValues, 0, array, 0, hasValues.length); + bytesUsed.addAndGet(newSize-hasValues.length); + hasValues = array; + } + hasValues[ord] = hasValue; + } + } + + void addUpdate(Term term, long value, int docUpTo) { + assert isNumeric; + termValues.append(term.bytes); + String field = term.field; + final int ord = numUpdates++; + + add(field, docUpTo, ord, true); + if (this.firstNumericValue != value || numericValues != null) { + if (numericValues == null) { + int newSize = ArrayUtil.oversize(ord + 1, Long.BYTES); + bytesUsed.addAndGet(newSize * Long.BYTES); + numericValues = new long[newSize]; + Arrays.fill(numericValues, 0, ord, firstNumericValue); + } else if (numericValues.length <= ord) { + int newSize = ArrayUtil.oversize(ord + 1, Long.BYTES); + long[] array = new long[newSize]; + System.arraycopy(numericValues, 0, array, 0, numericValues.length); + bytesUsed.addAndGet((newSize-numericValues.length) * Long.BYTES); + numericValues = array; + } + numericValues[ord] = value; + } + } + + void addNoValue(Term term, int docUpTo) { + termValues.append(term.bytes); + String field = term.field; --- End diff -- Maybe inline this, like you do in the other `addUpdate`?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org For additional commands, e-mail: dev-help@lucene.apache.org