Github user s1monw commented on a diff in the pull request: https://github.com/apache/lucene-solr/pull/513#discussion_r239136654 --- Diff: lucene/core/src/java/org/apache/lucene/index/FieldUpdatesBuffer.java --- @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefArray; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.Counter; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * This class efficiently buffers numeric and binary field updates and stores + * terms, values and metadata in a memory efficient way without creating large amounts + * of objects. Update terms are stored without de-duplicating the update term. + * In general we try to optimize for several use-cases. For instance we try to use constant + * space for update terms field since the common case always updates on the same field. Also for docUpTo + * we try to optimize for the case when updates should be applied to all docs ie. docUpTo=Integer.MAX_VALUE. + * In other cases each update will likely have a different docUpTo. + * Along the same lines this impl optimizes the case when all updates have a value. Lastly, if all updates share the + * same value for a numeric field we only store the value once. + */ +final class FieldUpdatesBuffer { + private static final long SELF_SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldUpdatesBuffer.class); + private static final long STRING_SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(String.class); + private final Counter bytesUsed; + private int numUpdates = 1; + // we use a very simple approach and store the update term values without de-duplication + // which is also not a common case to keep updating the same value more than once... + // we might pay a higher price in terms of memory in certain cases but will gain + // on CPU for those. We also save on not needing to sort in order to apply the terms in order + // since by definition we store them in order. + private final BytesRefArray termValues; + private final BytesRefArray byteValues; // this will be null if we are buffering numerics + private int[] docsUpTo; + private long[] numericValues; // this will be null if we are buffering binaries + private FixedBitSet hasValues; + private String[] fields; + private final boolean isNumeric; + + private FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate initialValue, int docUpTo, boolean isNumeric) { + this.bytesUsed = bytesUsed; + this.bytesUsed.addAndGet(SELF_SHALLOW_SIZE); + termValues = new BytesRefArray(bytesUsed); + termValues.append(initialValue.term.bytes); + fields = new String[] {initialValue.term.field}; + bytesUsed.addAndGet(sizeOfString(initialValue.term.field)); + docsUpTo = new int[] {docUpTo}; + if (initialValue.hasValue == false) { + hasValues = new FixedBitSet(1); + bytesUsed.addAndGet(hasValues.ramBytesUsed()); + } + this.isNumeric = isNumeric; + byteValues = isNumeric ? null : new BytesRefArray(bytesUsed); + } + + private static long sizeOfString(String string) { + return STRING_SHALLOW_SIZE + (string.length() * Character.BYTES); + } + + FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate.NumericDocValuesUpdate initialValue, int docUpTo) { + this(bytesUsed, initialValue, docUpTo, true); + if (initialValue.hasValue()) { + numericValues = new long[] {initialValue.getValue()}; + } else { + numericValues = new long[] {0}; + } + bytesUsed.addAndGet(Long.BYTES); + } + + FieldUpdatesBuffer(Counter bytesUsed, DocValuesUpdate.BinaryDocValuesUpdate initialValue, int docUpTo) { + this(bytesUsed, initialValue, docUpTo, false); + if (initialValue.hasValue()) { + byteValues.append(initialValue.getValue()); + } + } + + void add(String field, int docUpTo, int ord, boolean hasValue) { + if (fields[0].equals(field) == false || fields.length != 1 ) { + if (fields.length <= ord) { + String[] array = ArrayUtil.grow(fields, ord+1); + if (fields.length == 1) { + Arrays.fill(array, 1, ord, fields[0]); + } + bytesUsed.addAndGet((array.length - fields.length) * RamUsageEstimator.NUM_BYTES_OBJECT_REF); + fields = array; + } + if (field != fields[0]) { // that's an easy win of not accounting if there is an outlier + bytesUsed.addAndGet(sizeOfString(field)); + } + fields[ord] = field; + } + + if (docsUpTo[0] != docUpTo || docsUpTo.length != 1) { + if (docsUpTo.length <= ord) { + int[] array = ArrayUtil.grow(docsUpTo, ord+1); + if (docsUpTo.length == 1) { + Arrays.fill(array, 1, ord, docsUpTo[0]); + } + bytesUsed.addAndGet((array.length-docsUpTo.length) * Integer.BYTES); + docsUpTo = array; + } + docsUpTo[ord] = docUpTo; + } + + if (hasValue == false || hasValues != null) { + if (hasValues == null) { + hasValues = new FixedBitSet(ord+1); + hasValues.set(0, ord); + bytesUsed.addAndGet(hasValues.ramBytesUsed()); + } else { + FixedBitSet fixedBitSet = FixedBitSet.ensureCapacity(hasValues, ArrayUtil.oversize(ord + 1, 1)); --- End diff -- ++ good catch
--- --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org