This is an automated email from the ASF dual-hosted git repository. schor pushed a commit to branch robin-hood-hash in repository https://gitbox.apache.org/repos/asf/uima-uimaj.git
commit 9a426895dfaa761ff74dd1de3d43ed47c79a834d Author: Marshall Schor <m...@schor.com> AuthorDate: Wed Dec 18 09:04:09 2019 -0500 no jira, experiment, robin hood hashing. Initial results: worse performance for IntHashSet, might be OK for Obj HashSet --- .../uima/internal/util/Common_hash_support_rh.java | 561 +++++++++++++++ .../apache/uima/internal/util/IntHashSetRh.java | 794 +++++++++++++++++++++ .../apache/uima/internal/util/ObjHashSetRh.java | 520 ++++++++++++++ .../uima/internal/util/IntHashSetPerfTestRh.java | 219 ++++++ .../uima/internal/util/IntHashSetTestRh.java | 250 +++++++ .../uima/internal/util/ObjHashSetTestRh.java | 185 +++++ 6 files changed, 2529 insertions(+) diff --git a/uimaj-core/src/main/java/org/apache/uima/internal/util/Common_hash_support_rh.java b/uimaj-core/src/main/java/org/apache/uima/internal/util/Common_hash_support_rh.java new file mode 100644 index 0000000..01c0edd --- /dev/null +++ b/uimaj-core/src/main/java/org/apache/uima/internal/util/Common_hash_support_rh.java @@ -0,0 +1,561 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.uima.internal.util; + +import java.util.Arrays; +import java.util.function.IntPredicate; + +/** + * A common superclass for hash maps and hash sets + * Uses robin hood with backward shift for deletion + * http://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion/ + * + * uses linear probing (no delta expansion) + * + * extra table (1 byte per slot) holds lower 7 bits of hash + * and therefore also serves to indicate distance from initial probe = + * probe spot % 128 - lower 7 bits of hash (if negative, add 128, unless wrap around?) + * + * 8th bit , if 1 , is flag showing empty; no tombstones, because using backward-shift technique for removes + * + * find: stop early, if # probe-spot > distance from initial probe. (mod wrap-around) + * + */ +public abstract class Common_hash_support_rh { + + // set to true to collect statistics for tuning + // you have to also put a call to showHistogram() at the end of the run + protected static final boolean TUNE = false; + + protected static final int MIN_SIZE = 10; // 10 / .66 =15.15 + protected static final int MIN_CAPACITY = 16; + protected static final int MIN_CAPACITY_SHRINK = 64; // don't shrink below this - thrashing + + private static final byte LHB_EMPTY = (byte) 0x80; + private static final int LHB_HASH_MASK = 0x7f; + + protected final float loadFactor; + + protected final int initialCapacity; + + protected int histogram []; + protected int maxProbe = 0; + + protected int sizeWhichTriggersExpansion; + private int size = 0; // number of elements in the table + + protected boolean secondTimeShrinkable = false; + + protected abstract boolean is_valid_key(int pos); + protected abstract int keys_length(); + protected abstract void newKeysAndValues(int capacity); + protected abstract void clearKeysAndValues(); + + protected int bitMask; // = key size -1 + /** + * 7 bits of lower hash map, serves as distance from initial bucket probe + * initialize to "empty" = 0x80 + * If not "empty", the value is + * - often the same as the lower 7 bits of the index + * - if not that, it's because there's a collision, and some slot with a higher index will have the lower 7 bits of the index + * - so, the distance from the initial probe is the probe addr with the matching value - initial probe addr. + * - which, is the same as the + * ((probe addr with the matching value) & LBH_HASH_MASK) + * - matching value + */ + private byte[] lower_hash_bits; + protected int initial_probe; + protected byte initial_probe_lhb; + private int smaller_of_mask; + + private static Common_hash_support_rh tune_instance; + + protected abstract void shift(int prev, int pos); + protected abstract void setEmpty(int pos); + + protected abstract void expand_table(); // double table size + + /** + * @param initialSizeBeforeExpanding the number of elements the table should hold before expanding + */ + public Common_hash_support_rh(int initialSizeBeforeExpanding) { + this(initialSizeBeforeExpanding, 0.66F); + } + + public Common_hash_support_rh(int initialSizeBeforeExpanding, float factor) { + this.loadFactor = factor; + this.initialCapacity = tableSpace(initialSizeBeforeExpanding, factor); + lower_hash_bits = init_lhb(this.initialCapacity); + bitMask = this.initialCapacity - 1; + setSmallerOfMask(); + if (TUNE) { + tune_instance = this; + histogram = new int[200]; + Arrays.fill(histogram, 0); + maxProbe = 0; + } + } + + public Common_hash_support_rh(Common_hash_support_rh orig) { + this(orig.initialCapacity); + this.sizeWhichTriggersExpansion = orig.sizeWhichTriggersExpansion; + this.size = orig.size; + this.secondTimeShrinkable = orig.secondTimeShrinkable; + + // copy doesn't do tuning measurements + this.histogram = null; + } + + public void clear() { + // see if size is less than the 1/2 size that triggers expansion + if (size < (sizeWhichTriggersExpansion >>> 1)) { + // if 2nd time then shrink by 50% + // this is done to avoid thrashing around the threshold + if (secondTimeShrinkable) { + secondTimeShrinkable = false; + final int newCapacity = Math.max(initialCapacity, keys_length() >>> 1); + if (newCapacity < keys_length()) { + newTable(newCapacity); // shrink table by 50% + size = 0; + resetHistogram(); + if (PositiveIntSet.IS_TRACE_MODE_SWITCH) { + System.out.println("TRAcE_MODE Common_hash clear 2nd time shrinkable, newCapacity=" + newCapacity + ", keys_length: " + keys_length()); + } + return; + } else { // don't shrink below minimum + clearExisting(); + if (PositiveIntSet.IS_TRACE_MODE_SWITCH) { + System.out.println("TRAcE_MODE Common_hash clear 2nd time shrinkable but nothing done, below minimum: newCapacity=" + newCapacity + ", keys_length: " + keys_length()); + } + return; + } + } else { + if (PositiveIntSet.IS_TRACE_MODE_SWITCH) System.out.println("TRACE_MODE Common_hash clear setting 2nd time shrinkable"); + secondTimeShrinkable = true; + } + } else { + secondTimeShrinkable = false; // reset this to require 2 triggers in a row + } + clearExisting(); + } + + private void clearExisting() { + clearKeysAndValues(); + Arrays.fill(lower_hash_bits, LHB_EMPTY); + size = 0; + resetHistogram(); + } + + /** It gets a ref to the current value of table, and then searches that array. + * + * speedups: avoid looking in lower_hash_bits array for 90% of the time the hit is in the first 2 positions? + * + * @param hash the hash code of the key + * @param is_eq true if the key at the int position is == to the key, or is 0 + * @return the probeAddr in keys array. + * If not found, the probe value is to empty cell or to some other value + * Use isEmpty(probeAddr) to distinguish. + */ + protected int findPosition( + int hash, + IntPredicate is_eq_or_not_present + ) { + + int probeAddr = initial_probe = hash & bitMask; + final byte initialProbeLhb = initial_probe_lhb = (byte) (hash & LHB_HASH_MASK); + + if (is_eq_or_not_present.test(probeAddr) ) { + if (TUNE) { + updateHistogram(0); + } + return probeAddr; + } + int extraProbes = 0; + + for (;;) { + + byte plhb = lower_hash_bits[probeAddr]; + // not found if hit an empty slot + if (plhb == LHB_EMPTY || + // or the distance to original probe for this slot is < number of probes + // (because otherwise, the other item would have replaced if present) + extraProbesForEntry(plhb, probeAddr) < extraProbes + ) { + // not found + if (TUNE) { + updateHistogram(extraProbes); + } + return probeAddr; + } + + if (plhb == initialProbeLhb) { + if (is_eq_or_not_present.test(probeAddr)) { + // found + if (TUNE) { + updateHistogram(extraProbes); + } + return probeAddr; + } + // else was collision, not equal + } + + extraProbes++; + + probeAddr = incrPos(probeAddr); + } + + } + + /** + * Find the position for a new (guaranteed not in the table) item + * + * @param hash the hash code of the key + * @return the probeAddr in keys array. + * If not found, the probe value is to empty cell or to some other value + * Use isEmpty(probeAddr) to distinguish. + */ + protected int findPosition_new(int hash) { + + int extraProbes = 0; + int probeAddr = initial_probe = hash & bitMask; + initial_probe_lhb = (byte) (hash & LHB_HASH_MASK); + + + for (;;) { + + byte plhb = lower_hash_bits[probeAddr]; + // not found if hit an empty slot + if (plhb == LHB_EMPTY || + // or the distance to original probe for this slot is < number of probes + // (because otherwise, the other item would have replaced if present) + extraProbesForEntry(plhb, probeAddr) < extraProbes + ) { + // not found + if (TUNE) { + updateHistogram(extraProbes); + } + return probeAddr; + } + + extraProbes++; + + probeAddr = incrPos(probeAddr); + } + } + + protected boolean isEmpty(int probeAddr) { + return lower_hash_bits[probeAddr] == LHB_EMPTY; + } + + protected byte setLhb(int pos, byte lbh) { + byte r = lower_hash_bits[pos]; + assert lbh >= 0; + if (extraProbesForEntry(lbh, pos) >= 20) { + System.out.println("debug"); + } + assert extraProbesForEntry(lbh, pos) < 20; + lower_hash_bits[pos] = lbh; + return r; + } + + protected void remove_common(int position) { + int prev = position; + int i = incrPos(position); + size --; + for (;;) { + byte lhb = lower_hash_bits[i]; + if (lhb == LHB_EMPTY || extraProbesForEntry(lhb, i) == 0 ) { + lower_hash_bits[prev] = LHB_EMPTY; + setEmpty(prev); + return; + } + //debug + if (extraProbesForEntry(lhb, prev) > 20) { + System.out.println("debug"); + } + lower_hash_bits[prev] = lhb; + shift(prev, i); + + prev = i; + i = incrPos(i); + } + } + +// /** +// * This method calls the subclass's copy_to_new_table method, +// * passing an inner lambda containing common code for copying old to new. +// * +// * That inner lambda, when invoked by the copy_to_new_table method, +// * is passed another lambda of one argument (the old index) +// * which is called to copy each element. +// * @param new_capacity +// * @param old_capacity +// */ +// private void copyOld2New(int new_capacity, int old_capacity) { +// copy_to_new_table( +// new_capacity, +// old_capacity, +// +// // this code assigned to "commonCopy" arg +// (IntConsumer copyToNew, IntPredicate is_valid_old_key) -> { +// newTable(new_capacity); +// for (int i = 0; i < old_capacity; i++) { +// if (is_valid_old_key.test(i)) { +// copyToNew.accept(i); +// } +// } +// }); +// } + + /** + * advance pos until it points to a non 0 or is 1 past end + * If pos is negative, start at 0. + * Don't move if pos already has valid key + * @param pos - + * @return updated pos + */ + protected int moveToNextFilled(int pos) { + final int max = keys_length(); + if (pos < 0) { + pos = 0; + } + while (true) { + if (pos >= max) { + return pos; + } + if (is_valid_key(pos)) { + return pos; + } + pos ++; + } + } + + /** + * decrement pos until it points to a non 0 or is -1 + * If pos is beyond end start at end. + * Don't move if pos already has valid key + * @param pos - + * @return updated pos + */ + protected int moveToPreviousFilled(int pos) { + final int max = keys_length(); + if (pos > max) { + pos = max - 1; + } + + while (true) { + if (pos < 0) { + return pos; + } + if (is_valid_key(pos)) { + return pos; + } + pos --; + } + } + + // called by clear, increase table size, decrease table size + protected void newTable(int capacity) { + capacity = Math.max(MIN_SIZE, Misc.nextHigherPowerOf2(capacity)); + newKeysAndValues(capacity); + lower_hash_bits = init_lhb(capacity); + setSmallerOfMask(); + bitMask = capacity - 1; + sizeWhichTriggersExpansion = (int)(capacity * loadFactor); + } + + protected void incrementSize() { + size++; + if (size >= sizeWhichTriggersExpansion) { + maybeIncreaseTableCapacity(); + } + } + + private void maybeIncreaseTableCapacity() { + + if (TUNE) { + int old_capacity = keys_length(); + int new_capacity = 2 * old_capacity; + System.out.println("Capacity increasing from " + old_capacity + " to " + new_capacity + ", size=" + size); + } + expand_table(); + } + +// /** +// * only called if actually found and removed an entry +// */ +// protected void commonRemove() { +// size --; +//// debugValidate(); +// } + + public int size() { + return size; + } + + protected void resetHistogram() { +// if (TUNE) { +// histogram = new int[200]; +// Arrays.fill(histogram, 0); +// maxProbe = 0; +// } + } + + private void updateHistogram(int extraProbes) { + histogram[extraProbes] = 1 + histogram[extraProbes]; + if (maxProbe < extraProbes) { + maxProbe = extraProbes; + } + } + + public void showHistogram() { + if (TUNE) { + int sumI = 0; + for (int i : histogram) { + sumI += i; + } + + System.out.format( + "Histogram of number of probes, loadfactor = %.1f, maxProbe=%,d nbr of find operations at last table size=%,d%n", + loadFactor, maxProbe, sumI); + for (int i = 0; i <= maxProbe; i++) { + if (i == maxProbe && histogram[i] == 0) { + System.out.println("huh?"); + } + System.out.println(i + ": " + histogram[i]); + } + + System.out.println("bytes / entry = " + (float) (keys_length()) * 8 / size()); + System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n", + size(), + (int) ((keys_length() >>> 1) * loadFactor), + (int) (keys_length() * loadFactor)); + } + } + + static { + if (TUNE) { + Runtime.getRuntime().addShutdownHook(new Thread(null, () -> { + tune_instance.showHistogram(); + })); + } + } + + // test case use + int getCapacity() { + return keys_length(); + } + + protected abstract class CommonKeyIterator implements IntListIterator { + + protected int curPosition; + + protected final int firstPosition; + + protected CommonKeyIterator() { + this.curPosition = moveToNextFilled(0); + firstPosition = curPosition; + } + + @Override + public boolean hasNext() { + return curPosition < keys_length() && curPosition >= 0; + } + + @Override + public boolean hasPrevious() { + if (curPosition > keys_length() || curPosition <= 0) { + return false; + } + + int test = moveToPreviousFilled(curPosition - 1); + return test >= 0; + } + + @Override + /** + * @see org.apache.uima.internal.util.IntListIterator#moveToStart() + */ + public void moveToStart() { + curPosition = moveToNextFilled(0); + } + + @Override + /** + * @see org.apache.uima.internal.util.IntListIterator#moveToEnd() + */ + public void moveToEnd() { + curPosition = moveToPreviousFilled(bitMask); + } + + } + + /** + * + * @param numberOfElements - + * @param factor - + * @return capacity of the main table (either 2 byte or 4 byte entries) + */ + public static int tableSpace(int numberOfElements, Float factor) { + if (numberOfElements < 0) { + throw new IllegalArgumentException("must be > 0"); + } + final int capacity = Math.round(numberOfElements / factor); + return Math.max(16, Misc.nextHigherPowerOf2(capacity)); + } + + protected void debugValidate() { + // count non-0, non-removed, compare to size + int sum = 0; + for (int i = 0; i < keys_length(); i ++) { + if (is_valid_key(i)) { + sum ++; + if (sum > size) { + System.out.println("debug"); + } + } + } + } + + protected int incrPos(int pos) { + return bitMask & (pos + 1); + } + + private byte[] init_lhb(int size) { + byte[] lhb = new byte[size]; + Arrays.fill(lhb, LHB_EMPTY); + return lhb; + } + + /** + * Compute extra probes for an entry. + * Handle cases where the table size is bigger or smaller than the + * size of the lower-hash-bits + * @param entry the lower-hash-bits, represents the original probe's last 7 bits + * @param pos the current position + * @return the extra distance of this entry from the original probe + */ + private int extraProbesForEntry(byte entry, int pos) { + return smaller_of_mask & (pos - entry); + } + + private void setSmallerOfMask () { + smaller_of_mask = Math.min(lower_hash_bits.length, 128) - 1; + } +} diff --git a/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSetRh.java b/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSetRh.java new file mode 100644 index 0000000..571be51 --- /dev/null +++ b/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSetRh.java @@ -0,0 +1,794 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.uima.internal.util; + +import java.util.Arrays; +import java.util.NoSuchElementException; + +import org.apache.uima.util.impl.Constants; + +/** + * A set of non-zero ints. + * Can be negative. + * + * 0 reserved internally to indicate "not in the map"; + * you will get an exception if you try to store 0 as a value. + * + * 0 will be returned if the value is missing from the map. + * + * allowed range is Integer.MIN_VALUE to Integer.MAX_VALUE + * 0 is the value for an empty cell + * Integer.MIN_VALUE is the value for a deleted (removed) value + * + * based on Int2IntHashMap + * This impl is for use in a single thread case only + * + * Supports shrinking (reallocating the big table) + * + * Supports representing ints as "short" 2byte values if possible, + * together with an offset amount. + * Because of the offset, the adjusted key could be == to the offset, + * so we subtract 1 from it to preserve 0 value as being the null / empty. + * For short values, the range is: + * Short.MIN_VALUE+2 to Short.MAX_VALUE after Offset, + * with the "0" value moved down by 1 and + * the Short.MIN_VALUE used for deleted (removed) items + * + * Automatically switches to full int representation if needed + */ +public class IntHashSetRh extends Common_hash_support_rh implements PositiveIntSet { + + public static final int SIZE_NEEDING_4_BYTES = 256 * 256 - 2; + + // this load factor gives, for array doubling strategy: + // between 1.5 * 4 bytes (1 word for key) = 6 and + // 3 * 4 12 bytes per entry + // This compares with 160 bytes/entry for the IntArrayRBT impl + + // offset only used with keys2. values stored are key - offset + // intent is to have them fit into short data type. + // If the offset is 100,000, then the keys range from 100000 - 32766 to 100000 + 32767, includes "0" + // -32767 == Short.MIN_VALUE + 1, + // Numbers 0 and below are stored as n - 1, so "0" itself isn't actually stored + // because it's used to represent an empty slot + // - value after key adjustment of 0 reserved; if value is 0 - means no entry + // store values at or below the offset are stored as 1 less to avoid mapping to "0". + + private static final short EMPTY2 = 0; + private int offset; + + private int [] keys4; + private short[] keys2; + + private boolean isMake4; + + // these are true values (before any offset adjustment) + private int mostPositive = Integer.MIN_VALUE; + private int mostNegative = Integer.MAX_VALUE; + + public IntHashSetRh() { + this(10, 0); + } + + public IntHashSetRh(int initialCapacity) { + this(initialCapacity, 0); + } + + /** + * + * @param initialSizeBeforeExpanding - you can add this many before expansion + * @param offset - for values in the short range, the amount to subtract + * before storing. + * If == MIN_VALUE, then force 4 byte ints + */ + public IntHashSetRh(int initialSizeBeforeExpanding, int offset) { + super(initialSizeBeforeExpanding); + isMake4 = offset == Integer.MIN_VALUE; + this.offset = isMake4 ? 0 : offset; + newTable(this.initialCapacity); + resetTable(); + if (IS_TRACE_MODE_SWITCH) { + System.out.println("TRACE_MODE new IntHashSet, sizeBeforeExpanding = " + initialSizeBeforeExpanding + ", offset= " + offset); + } + } + +// /** +// * The number of 32 bit words that are reserved when +// * creating a table to hold the specified number of elements +// * +// * The number is a power of 2. +// * +// * The number is at least 16. +// * +// * The number is such that you could add this many elements without +// * triggering the capacity expansion. +// * +// * @param numberOfElements - +// * @return - +// */ +// public int tableSpace(int numberOfElements) { +// return tableSpace(numberOfElements, loadFactor); +// } +// + + public static int tableSpace(int numberOfElements) { + return Common_hash_support.tableSpace(numberOfElements, 0.66f); + } + + + /** + * Method called by handleHashSet in PositiveIntSet + * to indicate if adding this many items would cause an expansion + * @return true if would not expand + */ + public boolean wontExpand() { + return wontExpand(1); + } + + /** + * Method called by handleHashSet in PositiveIntSet + * to indicate if adding this many items would cause an expansion + * @param n the number of items added + * @return true if would not expand + */ + public boolean wontExpand(int n) { + return (size() + n) < sizeWhichTriggersExpansion; + } + + public int getSpaceUsedInWords() { + return (keys4 != null) ? keys4.length : (keys2.length >> 1); + } + + public static int getSpaceOverheadInWords() { + return 11; + } + + /** + * Only call this if using short values with offset + * @param adjKey + * @return raw key + */ + private int getRawFromAdjKey(int adjKey) { + assert (adjKey != Short.MIN_VALUE); + return adjKey + offset + ((adjKey < 0) ? 1 : 0); + } + + + private void resetTable() { + mostPositive = Integer.MIN_VALUE; + mostNegative = Integer.MAX_VALUE; +// resetHistogram(); +// size = 0; +// nbrRemoved = 0; + } + + @Override + public void clear() { + super.clear(); + resetTable(); + } + + + // only called when keys are shorts + private boolean isAdjKeyOutOfRange(int adjKey) { + return (adjKey > Short.MAX_VALUE || + // the minimum adjKey value stored in a short is + // Short.MIN_VALUE + 1 + adjKey <= Short.MIN_VALUE); + } + + @Override + public boolean contains(int rawKey) { + int pos = findPosition(rawKey); + if (isEmpty(pos)) { + return false; + } + return get(pos) == rawKey; + } + + /** + * This method is part of the PositiveSet API, and + * is defined to return an int that could be used with + * iterators to position them. + * + * For this case, it is not used, because + * the iterators don't support positioning this way + * because they are not sorted. + */ + @Override + public int find(int rawKey) { + throw new UnsupportedOperationException(); + } + + + /** + * @param rawKey the key value to find + * @return the position in the table if present, otherwise the position of the slot where the + * key value would be added, unless the new value is at a position which would require + * the key2 form to be switched to the key4 form, in which case, + * -1 is returned (means not found, and requires conversion to 4 byte keys) + */ + private int findPosition(int rawKey) { + + if (rawKey == 0) { + throw new IllegalArgumentException("0 is an invalid key"); + } + + if (keys4 == null) { + // special handling for 2 byte keys with offsets + // check for keys never stored in short table + // adjKey of Short.MIN_VALUE which is the removed flag + final int adjKey = getAdjKey(rawKey); + if (isAdjKeyOutOfRange(adjKey)) { + return -1; + } + + + return findPositionAdjKey(adjKey); + + } else { + + + return findPosition4(rawKey); + } + + } + + private int findPosition4(int rawKey) { + + return super.findPosition( + + // key hash + Misc.hashInt(rawKey), + + // is_eq || not present + i -> keys4[i] == rawKey || keys4[i] == 0 + + ); + } + + private int findPositionAdjKey(int adjKey) { + // special handling for 2 byte keys with offsets + // check for keys never stored in short table + // adjKey of Short.MIN_VALUE which is the removed flag + + return super.findPosition( + + // key hash + Misc.hashInt(adjKey), + + // is_eq + i -> keys2[i] == 0 || keys2[i] == adjKey + ); + } + + + /** + * return the adjusted key. + * never called for 4 byte form + * for 2 byte key mode, subtract the offset, and adjust by -1 if 0 or less + * Note: returned value can be less than Short.MIN_VALUE + * @param rawKey + * @return adjusted key, a range from negative to positive, but never 0 + */ + private int getAdjKey(int rawKey) { +// if (rawKey == 0 || (rawKey == Integer.MIN_VALUE)) { +// throw new ArrayIndexOutOfBoundsException(rawKey); +// } +// if (keys4 != null) { +// return rawKey; +// } + int adjKey = rawKey - offset; + return adjKey - ( (adjKey <= 0) ? 1 : 0); + } + + private void switchTo4byte(int capacity) { + // convert to 4 byte because values can't be offset and fit in a short + final short[] oldKeys = keys2; + isMake4 = true; + newTable(capacity); // make a 4 table. same size + for (short adjKey : oldKeys) { + if (adjKey != EMPTY2 ) { + find4AndAddIfMissing(getRawFromAdjKey(adjKey)); + } + } + } + + /** + * + * @param rawKey - + * @return true if this set did not already contain the specified element + */ + @Override + public boolean add(int rawKey) { + if (rawKey == 0) { + throw new IllegalArgumentException("argument must be non-zero"); + } + + if (size() == 0) { + mostPositive = mostNegative = rawKey; + } else { + if (rawKey > mostPositive) { + mostPositive = rawKey; + } + if (rawKey < mostNegative) { + mostNegative = rawKey; + } + } + + if (keys4 != null) { + return maybeAdd4(rawKey); + // short keys + } else { + int adjKey = getAdjKey(rawKey); + if (isAdjKeyOutOfRange(adjKey)) { + switchTo4byte(getCapacity()); + return maybeAdd4(rawKey); + + // key in range + } else { + return maybeAdd2(adjKey); + } + } + } + + private boolean maybeAdd2(int adjKey) { + boolean wasAdded = find2AndAddIfMissing(adjKey); + if (wasAdded) { + incrementSize(); + } + return wasAdded; + } + + private boolean maybeAdd4(int rawKey) { + boolean wasAdded = find4AndAddIfMissing(rawKey); + if (wasAdded) { + incrementSize(); + } + return wasAdded; + } + + private boolean find4AndAddIfMissing(int rawKey) { + + int i = findPosition4(rawKey); + if (keys4[i] == rawKey) { + return false; // already in table + } + + int c = keys4[i]; + int saved = c; + + byte prev_lhb = setLhb(i, initial_probe_lhb); + keys4[i] = rawKey; + + if (saved == 0) { + return true; + } + // robin hood - if stole slot, add that slot back + rh_add4(incrPos(i), saved, prev_lhb); + return true; + } + + private void rh_add4(int pos, int v, byte lhb) { + while (true) { + int c = keys4[pos]; + int saved = c; // if empty, is 0 + + byte prev_lhb = setLhb(pos, lhb); + keys4[pos] = v; + + if (saved == 0) { + break; + } + // robin hood - if stole slot, add that slot back + v = saved; + lhb = prev_lhb; + pos = incrPos(pos); + } + } + + private boolean find2AndAddIfMissing(int adjKey) { + int i = findPositionAdjKey(adjKey); + + short c = keys2[i]; + if (c == (short)adjKey) { + return false; // already in table + } + + short saved = c; // if empty, is 0 + + byte prev_lhb = setLhb(i, initial_probe_lhb); + keys2[i] = (short) adjKey; + + if (saved == 0) { + return true; + } + // robin hood - if stole slot, add that slot back + rh_add2(incrPos(i), saved, prev_lhb); + return true; + } + + private void rh_add2(int pos, short v, byte lhb) { + while (true) { + short c = keys2[pos]; + short saved = c; // if empty, is 0 + + byte prev_lhb = setLhb(pos, lhb); + keys2[pos] = v; + + if (saved == 0) { + break; + } + // robin hood - if stole slot, add that slot back + v = saved; + lhb = prev_lhb; + pos = incrPos(pos); + } + + } + + @Override + protected void shift(int prev, int next) { + if (keys4 == null) { + keys2[prev] = keys2[next]; + } else { + keys4[prev] = keys4[next]; + } + } + + @Override + protected void setEmpty(int pos) { + if (keys4 == null) { + keys2[pos] = 0; + } else { + keys4[pos] = 0; + } + + } + /** + * mostPositive and mostNegative are not updated + * for removes. So these values may be inaccurate, + * but mostPositive is always >= actual most positive, + * and mostNegative is always <= actual most negative. + * No conversion from int to short + * + * Does slot shifting on remove + * + * @param rawKey the value to remove + * @return true if the key was present + */ + @Override + public boolean remove(int rawKey) { +// debugValidate(); + final int pos = findPosition(rawKey); + if (isEmpty(pos)) { + return false; + } + + if (get(pos) != rawKey) { + return false; + } + remove_common(pos); + +// //debug +// if (size() <= 0) +// System.out.println("debug"); +// assert size() > 0; + + if (rawKey == mostPositive) { + mostPositive --; // a weak adjustment + } + if (rawKey == mostNegative) { + mostNegative ++; // a weak adjustment + } + + return true; + } + + /** + * + * @return a value that is >= the actual most positive value in the table. + * it will be == unless a remove operation has removed a most positive value + */ + public int getMostPositive() { + return mostPositive; + } + + /** + * + * @return a value that is <= the actual least positive value in the table. + * It will be == unless remove operations has removed a least positive value. + */ + public int getMostNegative() { + return mostNegative; + } + + public void showHistogram() { + if (TUNE) { + int sumI = 0; + for (int i : histogram) { + sumI += i; + } + + System.out.format( + "Histogram of number of probes, loadfactor = %.1f, maxProbe=%,d nbr of find operations at last table size=%,d%n", + loadFactor, maxProbe, sumI); + for (int i = 0; i <= maxProbe; i++) { + if (i == maxProbe && histogram[i] == 0) { + System.out.println("huh?"); + } + System.out.println(i + ": " + histogram[i]); + } + + if (keys4 == null) { + System.out.println("bytes / entry = " + (float) (keys2.length) * 2 / size()); + System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n", + size(), + (int) ((keys2.length >>> 1) * loadFactor), + (int) (keys2.length * loadFactor)); + } else { + System.out.println("bytes / entry = " + (float) (keys4.length) * 4 / size()); + System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n", + size(), + (int) ((keys4.length >>> 1) * loadFactor), + (int) (keys4.length * loadFactor)); + } + } + } + + /** + * For iterator use, position is a magic number returned by the internal find + * For short keys, the value stored for adjKey == 0 is -1, adjKey == -1 is -2, etc. + */ + @Override + public int get(int pos) { + final int adjKey; + if (keys4 == null) { + adjKey = keys2[pos]; + if (adjKey == 0 ) { + return 0; // null, not present + } + return getRawFromAdjKey(adjKey); + } else { + adjKey = keys4[pos]; + if (adjKey == 0 ) { + return 0; // null, not present + } + return adjKey; + } + } + + + private class IntHashSetIterator implements IntListIterator { + + private int curPosition; + + private IntHashSetIterator() { + this.curPosition = 0; + } + + public final boolean hasNext() { + curPosition = moveToNextFilled(curPosition); + return curPosition < getCapacity(); + } + + public final int nextNvc() { + curPosition = moveToNextFilled(curPosition); + int r = get(curPosition); + curPosition = moveToNextFilled(curPosition + 1); + return r; + } + + /** + * @see org.apache.uima.internal.util.IntListIterator#hasPrevious() + */ + public boolean hasPrevious() { + int prev = moveToPreviousFilled(curPosition - 1); + return (prev >= 0); + } + + @Override + public int previous() { + curPosition = moveToPreviousFilled(curPosition - 1); + if (curPosition < 0) { + throw new NoSuchElementException(); + } + return get(curPosition); + } + + @Override + public int previousNvc() { + curPosition = moveToPreviousFilled(curPosition - 1); + return get(curPosition); + } + + /** + * @see org.apache.uima.internal.util.IntListIterator#moveToEnd() + */ + public void moveToEnd() { + curPosition = getCapacity() - 1; + } + + /** + * @see org.apache.uima.internal.util.IntListIterator#moveToStart() + */ + public void moveToStart() { + curPosition = 0; + } + } + + + @Override + public IntListIterator iterator() { + return new IntHashSetIterator(); + } + + @Override + public int moveToFirst() { + return (size() == 0) ? -1 : moveToNextFilled(0); + } + + @Override + public int moveToLast() { + return (size() == 0) ? -1 : moveToPreviousFilled(getCapacity() -1); + } + + @Override + public int moveToNext(int position) { + if (position < 0) { + return position; + } + final int n = moveToNextFilled(position + 1); + return (n >= getCapacity()) ? -1 : n; + } + + @Override + public int moveToPrevious(int position) { + if (position >= getCapacity()) { + return -1; + } + return moveToPreviousFilled(position - 1); + } + + @Override + public boolean isValid(int position) { + return (position >= 0) && (position < getCapacity()); + } + + @Override + public void bulkAddTo(IntVector v) { + if (null == keys4) { + for (int k : keys2) { + if (k != 0) { + v.add(getRawFromAdjKey(k)); + } + } + } else { + for (int k : keys4) { + if (k != 0) { + v.add(k); + } + } + } + } + + @Override + public int[] toIntArray() { + final int s = size(); + if (s == 0) { + return Constants.EMPTY_INT_ARRAY; + } + final int[] r = new int[size()]; + int pos = moveToFirst(); + for (int i = 0; i < r.length; i ++) { + r[i] = get(pos); + pos = moveToNextFilled(pos + 1); + } + return r; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return String + .format( + "IntHashSet [loadFactor=%s, initialCapacity=%s, sizeWhichTriggersExpansion=%s, size=%s, offset=%s%n keys4=%s%n keys2=%s%n secondTimeShrinkable=%s, mostPositive=%s, mostNegative=%s]", + loadFactor, initialCapacity, sizeWhichTriggersExpansion, size(), offset, + Arrays.toString(keys4), Arrays.toString(keys2), secondTimeShrinkable, mostPositive, + mostNegative); + } + + // for testing + boolean isShortHashSet() { + return keys2 != null; + } + + int getOffset() { + return offset; + } + + @Override + protected boolean is_valid_key(int pos) { + if (keys4 == null) { + return keys2[pos] != 0; + } + return keys4[pos] != 0; + } + + @Override + protected int keys_length() { + return (keys4 == null) ? keys2.length : keys4.length; + } + + @Override + protected void newKeysAndValues(int capacity) { + if (isMake4) { + keys4 = new int[capacity]; + keys2 = null; + } else { + keys2 = new short[capacity]; + keys4 = null; + } + } + + @Override + protected void clearKeysAndValues() { + if (keys4 == null) { + Arrays.fill(keys2, (short)0); + } else { + Arrays.fill(keys4, 0); + } + resetTable(); + } + + @Override + protected void expand_table() { + int old_capacity = keys_length(); + int new_capacity = old_capacity * 2; + + if (keys4 == null) { + if (new_capacity >= 256 * 256) { + // switch to 4 + if (TUNE) {System.out.println("Switching to 4 byte keys");} + switchTo4byte(new_capacity); + } else { + final short[] oldKeys = keys2; + newTable(new_capacity); + for (int v :oldKeys) { + if (v != 0) { + find2AndAddIfMissing(v); + } + } + } + } else { + // keys4 + final int[] oldKeys = keys4; + newTable(new_capacity); + for (int v :oldKeys) { + if (v != 0) { + find4AndAddIfMissing(v); + } + } + } + } + +} diff --git a/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSetRh.java b/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSetRh.java new file mode 100644 index 0000000..d70c7d3 --- /dev/null +++ b/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSetRh.java @@ -0,0 +1,520 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.uima.internal.util; + +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.Set; + +import org.apache.uima.cas.FeatureStructure; + +/** + * A set of Objects of type T + + * This impl is for use in a single thread case only, when table is being updated. + * Multiple reader threads are OK if there's no writing. + * + * Supports shrinking (reallocating the big table) + * + * Uses robin hood method, with + * incrementing collision resolution, and shift backwards on remove, and 1 byte lower-hash-bits aux table + * for locality of ref. + */ +public class ObjHashSetRh<T> extends Common_hash_support_rh implements Set<T> { + +// public static final float DEFAULT_LOAD_FACTOR = 0.66F; + + private final Class<T> clazz; // for reifying the T type, resizing table, toArray operations + +// private final float loadFactor = DEFAULT_LOAD_FACTOR; + +// private final int initialCapacity; +// +// private int histogram []; +// private int maxProbe = 0; +// +// private int sizeWhichTriggersExpansion; +// private int size; // number of elements in the table +// private int nbrRemoved; // number of removed elements (coded as removed) + + // the actual Object table, operated as a hashtable + private T [] keys; +// final private T [] emptyKeyArray; + +// private boolean secondTimeShrinkable = false; + +// private int modificationCount = 0; // not currently used + + public ObjHashSetRh(Class<T> clazz) { + this(12, clazz); // default initial size + } + + /** + * @param initialCapacity - you can add this many before expansion + * @param clazz - a superclass of the stored items + * @param removedMarker - a unique value never stored in the table, used to mark removed items + */ + public ObjHashSetRh(int initialCapacity, Class<T> clazz) { + super(initialCapacity); + this.clazz = clazz; + newTable(initialCapacity); + } + + /** + * Copy constructor + * @param ohs - + */ + public ObjHashSetRh(ObjHashSetRh<T> ohs) { + super(ohs); + this.clazz = ohs.clazz; + this.keys = Arrays.copyOf(ohs.keys, ohs.keys.length); + } + + + public ObjHashSetRh(ObjHashSetRh<T> ohs, boolean readOnly) { + this(ohs); + if (!readOnly) Misc.internalError(); + } + + + /** + * returns a position in the key/value table + * if the key is not found, then the position will be to the + * place where the key would be put upon adding (without reusing REMOVED, and the + * current internal value of keys[position] would be 0. + * + * if the key is found, then keys[position] == key + * @param obj the object to find + * @return the probeAddr in keys array - might reference a slot holding null, or the key value if found + */ + private int findPosition(final T key) { + if (key == null) { + throw new IllegalArgumentException("null is an invalid key"); + } + + return findPosition( + + // key hash + Misc.hashInt(key.hashCode()), + + // is_eq + i -> keys[i].equals(key)); + } + + + @Override + public boolean contains(Object obj) { // arg must be Object to fit Collection API + return (clazz.isAssignableFrom(obj.getClass())) ? (find((T) obj) != -1) : false; + } + + /** + * @param obj the object to find in the table (if it is there) + * @return the position of obj in the table, or -1 if not in the table + */ + public int find(T obj) { + if (obj == null || size() == 0) { + return -1; + } + + final int pos = findPosition(obj); + return obj.equals(keys[pos]) ? pos : -1; // keys[pos] can be null + } + + /** + * + * @param obj - the object to add + * @return true if this set did not already contain the specified element + */ + @Override + public boolean add(T obj) { + final int i = findPosition(obj); + T c = keys[i]; + if (obj.equals(c)) { // keys[i] may be null + return false; // false if already present + } + + // i is ref to empty or non-empty (but not equal) slot which should be stolen + add_new(obj, c, i); + incrementSize(); + return true; + } + + private void add_new(T obj, T saved, int i) { + keys[i] = obj; + byte prev_lhb = setLhb(i, initial_probe_lhb); + + if (saved != null) { + rh_add(incrPos(i), saved, prev_lhb); + } + } + + private void rh_add(int pos,T v, byte lhb) { + while (true) { + T c = keys[pos]; + T saved = c; // if empty, is 0 + + byte prev_lhb = setLhb(pos, lhb); + keys[pos] = v; + + if (saved == null) { + break; + } + // robin hood - if stole slot, add that slot back + v = saved; + lhb = prev_lhb; + pos = incrPos(pos); + } + } + + @Override + protected void shift(int prev, int next) { + keys[prev] = keys[next]; + } + + @Override + protected void setEmpty(int pos) { + keys[pos] = null; + } + + + /** + * Using robin hood shifting for removed item + * @param rawKey the value to remove + * @return true if the key was present + */ + @Override + public boolean remove(Object rawKey) { + if (rawKey == null) { + return false; + } + + final int pos = findPosition((T) rawKey); // null or equal or collision obj + T c = keys[pos]; + if (c == null || ! c.equals(rawKey)) { + return false; + } + + remove_common(pos); + return true; + } + + private void removeAtPosition(int pos) { + remove_common(pos); + } + + +// /** +// * @see Set#size() +// */ +// @Override +// public int size() { +// return size; +// } + +// public void showHistogram() { +// if (TUNE) { +// int sumI = 0; +// for (int i : histogram) { +// sumI += i; +// } +// +// System.out.format( +// "Histogram of number of probes, loadfactor = %.1f, maxProbe=%,d nbr of find operations at last table size=%,d%n", +// loadFactor, maxProbe, sumI); +// for (int i = 0; i <= maxProbe; i++) { +// if (i == maxProbe && histogram[i] == 0) { +// System.out.println("huh?"); +// } +// System.out.println(i + ": " + histogram[i]); +// } +// +// System.out.println("bytes / entry = " + (float) (keys.length) * 4 / size()); +// System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n", +// size(), +// (int) ((keys.length >>> 1) * loadFactor), +// (int) (keys.length * loadFactor)); +// } +// } + + /** + */ + /** + * For iterator use + * @param index - a magic number returned by the internal find + * @return the T at that spot, or null if nothing there + */ + public T get(int index) { + T obj = keys[index]; + if (obj == null) { + return null; // null, not present + } + return obj; + } + +//// /** +//// * advance pos until it points to a non 0 or is 1 past end +//// * @param pos - +//// * @return updated pos +//// */ +//// public int moveToNextFilled(int pos) { +////// if (pos < 0) { +////// pos = 0; +////// } +//// +//// final int max = getCapacity(); +//// while (true) { +//// if (pos >= max) { +//// return pos; +//// } +//// T v = keys[pos]; +//// if (v != null && v != removedMarker) { +//// return pos; +//// } +//// pos ++; +//// } +//// } +// +// /** +// * decrement pos until it points to a non 0 or is -1 +// * @param pos - +// * @return updated pos +// */ +// public int moveToPreviousFilled(int pos) { +// final int max = getCapacity(); +// if (pos > max) { +// pos = max - 1; +// } +// +// while (true) { +// if (pos < 0) { +// return pos; +// } +// T v = keys[pos]; +// if (v != null && v != removedMarker) { +// return pos; +// } +// pos --; +// } +// } + + private class ObjHashSetIterator implements Iterator<T> { + + /** + * Keep this always pointing to a non-0 entry, or + * if not valid, outside the range + */ + protected int curPosition; + + private ObjHashSetIterator() { + this.curPosition = moveToNextFilled(0); + } + + @Override + public final boolean hasNext() { + return curPosition < getCapacity(); + } + + @Override + public final T next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + T r = get(curPosition); + curPosition = moveToNextFilled(curPosition + 1); + return r; + } + + @Override + public void remove() { + int pos = moveToPreviousFilled(curPosition - 1); + if (pos >= 0) { + removeAtPosition(pos); + } + } + } + + + @Override + public Iterator<T> iterator() { + return new ObjHashSetIterator(); + } + + private int moveToFirst() { + return (size() == 0) ? -1 : moveToNextFilled(0); + } + +// private int moveToLast() { +// return (size() == 0) ? -1 : moveToPreviousFilled(getCapacity() -1); +// } + +// private int moveToNext(int position) { +// if (position < 0) { +// return position; +// } +// final int n = moveToNextFilled(position + 1); +// return (n >= getCapacity()) ? -1 : n; +// } +// +// private int moveToPrevious(int position) { +// if (position >= getCapacity()) { +// return -1; +// } +// return moveToPreviousFilled(position - 1); +// } + +// public boolean isValid(int position) { +// return (position >= 0) && (position < getCapacity()); +// } + + /** + * if the fs is in the set, the iterator should return it. + * if not, return -1 (makes iterator invalid) + * @param fs position to this fs + * @return the index if present, otherwise -1; + */ + public int moveTo(FeatureStructure fs) { + if (clazz.isAssignableFrom(fs.getClass())) { + int pos = find((T)fs); + if (pos >= 0) { + return pos; + } + } + return -1; + } + + @Override + public <T2> T2[] toArray(T2[] a) { + final int s = size(); + if (s == 0) { + if (a.length >= 1) { + a[0] = null; // part of the contract of toArray, where the array a size is > + } + return a; + } + + final T2[] r = (a.length >= s)? a : (T2[]) Array.newInstance(a.getClass(), s); + int pos = moveToFirst(); + for (int i = 0; i < s; i ++) { + r[i] = (T2) get(pos); + pos = moveToNextFilled(pos + 1); + } + if (a.length > s) { + r[s] = null; // part of the contract of toArray, where the array a size is > + } + return r; + } + + @Override + public T[] toArray() { + return toArray((T[]) Array.newInstance(clazz, size())); + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return String + .format( + "%s [loadFactor=%s, initialCapacity=%s, sizeWhichTriggersExpansion=%s, size=%s, secondTimeShrinkable=%s%n keys=%s]", + this.getClass().getName(), loadFactor, initialCapacity, sizeWhichTriggersExpansion, size(), secondTimeShrinkable, + Arrays.toString(keys)); + } + + // Collection methods + @Override + public boolean isEmpty() { + return size() == 0; + } + + @Override + public boolean containsAll(Collection<?> c) { + c.stream().allMatch(item -> contains(item)); + return false; + } + + @Override + public boolean addAll(Collection<? extends T> c) { + boolean[] anyChanged = {false}; + c.stream().forEach(item -> anyChanged[0] |= add(item)); + return anyChanged[0]; + } + + @Override + public boolean removeAll(Collection<?> c) { + boolean[] anyChanged = {false}; + c.stream().forEach(item -> anyChanged[0] |= remove(item)); + return anyChanged[0]; + } + + @Override + public boolean retainAll(Collection<?> c) { + boolean anyChanged = false; + Iterator<T> it = iterator(); + while (it.hasNext()) { + T v = it.next(); + if (!c.contains(v)) { + anyChanged = true; + it.remove(); + } + } + return anyChanged; + } + + @Override + protected boolean is_valid_key(int pos) { + return keys[pos] != null; + } + + @Override + protected int keys_length() { + return keys.length; + } + + @Override + protected void newKeysAndValues(int capacity) { + keys = (T[]) Array.newInstance(clazz, capacity); + } + + @Override + protected void clearKeysAndValues() { + Arrays.fill(keys, null); + } + + @Override + protected void expand_table() { + int old_capacity = keys_length(); + int new_capacity = old_capacity * 2; + + final T[] oldKeys = keys; + newTable(new_capacity); + for (T v : oldKeys) { + if (v != null) { + int pos = findPosition_new( Misc.hashInt(v.hashCode())); + add_new(v, keys[pos], pos); + } + } + } + + +} diff --git a/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetPerfTestRh.java b/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetPerfTestRh.java new file mode 100644 index 0000000..5bb5286 --- /dev/null +++ b/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetPerfTestRh.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.uima.internal.util; + +import java.util.Random; + +import org.apache.uima.internal.util.rb_trees.IntArrayRBT; + +import junit.framework.TestCase; + +public class IntHashSetPerfTestRh extends TestCase { + /** + * Set to false to run the performance test + * + * Tests both IntHashSet and IntBitSet + */ + final boolean SKIP = false; + + static int cacheLoadSize; + + static long seed = 3737463135938899369L; +// new Random().nextLong(); + static { + System.out.println("Random seed for IntHashSetPerfTest: " + seed); + } + Random r = new Random(seed); + +// Set<Integer> keys = new HashSet<Integer>(1000); + + int dmv = 0; + + IntHashSetRh m2; + IntArrayRBT m1; + IntBitSet m3; + + final int[] keys10000 = new int[511111]; + int k10ki = 0; + + + public void testPerf() { + if (SKIP) return; + m1 = new IntArrayRBT(16); + m2 = new IntHashSetRh(16); + m3 = new IntBitSet(16); + + for (int i = 0; i < keys10000.length; i++) { + int k = r.nextInt(511110); + + keys10000[i] = k + 1; + } + + System.out.format("%n%n W A R M U P %n%n"); + cacheLoadSize = 0; + warmup(m1); + warmup(m2); + + System.out.format("%n%n Time 100 %n%n"); + timelp(100); + System.out.format("%n%n Time 1000 %n%n"); + timelp(1000); + System.out.format("%n%n Time 10000 %n%n"); + timelp(10000); + System.out.format("%n%n Time 100000 %n%n"); + timelp(100000); + cacheLoadSize = 0; // 1 * 256 * 1; + System.out.format("%n%n Time 100000 %n%n"); + timelp(100000); + + System.out.format("%n%n Time 500000 %n%n"); + timelp(500000); + + System.out.format("%n%n For Yourkit: Time 500000 %n%n"); + timelp(500000); + System.out.format("%n%n For Yourkit: Time 500000 %n%n"); + timelp(500000); + System.out.format("%n%n For Yourkit: Time 500000 %n%n"); + timelp(500000); + + + System.out.println(dmv); + } + private void time2(int n) { +// float f1 = time(m1, n); + float f2 = time(m2, n); + float f3 = time(m3, n); + System.out.format(" ratio " +// + "RBT/hash = %.3f RTB/bitset = %.3f " + + "hash/bitset = %.3f%n", +// f1/f2, f1/f3, + f2/f3); + } + + private void timelp(int n) { + time2(n); + time2(n); + time2(n); + } + + private void warmup(Object m) { + for (int i = 0; i < 500; i++) { + inner(m,true, 1000) ; // warm up + } + } + + private float time(Object m, int ss) { + long start = System.nanoTime(); + for (int i = 0; i < 500; i++) { + inner(m,false, ss); + } + float t = (System.nanoTime() - start) / 1000000.0F; + System.out.format("time for %,d: %s is %.3f milliseconds %n", ss, + m.getClass().getSimpleName(), + t); + return t; + } + + private int nextKey() { + int r = keys10000[k10ki++]; + if (k10ki >= keys10000.length) { + k10ki = 0; + } + return r; + } + + private void inner(Object m, boolean check, int ss) { + CS cs = new CS(m); + + for (int i = 0; i < ss; i++) { + + int k = keys10000[i]; +// System.out.print(" " + k); +// if (i %100 == 0) System.out.println(""); +// keys.add(k); + cs.add(k); + cacheLoad(i); + if (check) { + assertTrue(cs.contains(k)); + } + } + for (int i = 0; i < ss; i++) { + boolean v = cs.contains(keys10000[i]); + if (!v) { + throw new RuntimeException("never happen"); + } + dmv += 1; + cacheLoad(i); + } + cs.clear(); + + +// for (int k : keys) { +// assertEquals(10000 + k, (m instanceof IntHashSetRh) ? +// ((IntHashSetRh)m).get(k) : +// ((IntArrayRBT)m).getMostlyClose(k)); +// } + + } + + static class CS { + final Object set; + + CS(Object set) { + this.set = set; + } + + boolean contains(int i) { + return (set instanceof IntArrayRBT) ? ((IntArrayRBT)set).contains(i) : + (set instanceof IntHashSetRh) ? ((IntHashSetRh) set).contains(i) : + ((IntBitSet) set).contains(i); + } + + void add(int i) { + if (set instanceof IntArrayRBT) { + ((IntArrayRBT)set).add(i); + } else if (set instanceof IntHashSetRh) { + ((IntHashSetRh)set).add(i); + } else { + ((IntBitSet)set).add(i); + } + } + + void clear() { + if (set instanceof IntArrayRBT) { + ((IntArrayRBT)set).clear(); + } else if (set instanceof IntHashSetRh) { + ((IntHashSetRh)set).clear(); + } else { + ((IntBitSet)set).clear(); + } + } + + } + + void cacheLoad(int i) { + if (cacheLoadSize > 0) { + int[] cl = new int[cacheLoadSize]; + if (i != 100000) { + cl = null; + } + } + } +} diff --git a/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTestRh.java b/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTestRh.java new file mode 100644 index 0000000..09d3365 --- /dev/null +++ b/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTestRh.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.uima.internal.util; + +import java.util.Arrays; +import java.util.Random; + +import junit.framework.TestCase; + + +public class IntHashSetTestRh extends TestCase { + + IntHashSetRh ihs; + + Random random; + + public void setUp() { + ihs = new IntHashSetRh(); + } + + public void testBasic() { + + ihs.add(15); + ihs.add(188); + int[] sv = getSortedValues(ihs); + assertEquals(2, sv.length); + assertEquals(15, sv[0]); + assertEquals(188, sv[1]); + assertEquals(15, ihs.getMostNegative()); + assertEquals(188, ihs.getMostPositive()); + + // test most positive / negative + ihs.clear(); + ihs.add(189); + assertEquals(189, ihs.getMostNegative()); + assertEquals(189, ihs.getMostPositive()); + ihs.add(1000); + ihs.add(-1000); + assertEquals(1000, ihs.getMostPositive()); + assertEquals(-1000, ihs.getMostNegative()); + ihs.add(500); + ihs.add(-500); + assertEquals(1000, ihs.getMostPositive()); + assertEquals(-1000, ihs.getMostNegative()); + ihs.remove(1000); + assertEquals(999, ihs.getMostPositive()); + assertEquals(-1000, ihs.getMostNegative()); + ihs.add(1001); + assertEquals(1001, ihs.getMostPositive()); + sv = getSortedValues(ihs); + assertTrue(Arrays.equals(sv, new int[]{-1000, -500, 189, 500, 1001})); + } + + public void testSwitching224() { + final int OS = 100000; + ihs = new IntHashSetRh(16, OS); + ihs.add(OS - 1); + ihs.add(OS); + ihs.add(OS + 1); + int[] sv = getSortedValues(ihs); + assertTrue(Arrays.equals(sv, new int[]{99999, 100000, 100001 })); + ihs.add(OS - 32767); + sv = getSortedValues(ihs); + assertTrue(Arrays.equals(sv, new int[]{OS - 32767, 99999, 100000, 100001})); + ihs.add(OS - 32768); + sv = getSortedValues(ihs); + assertTrue(Arrays.equals(sv, new int[]{OS - 32768, OS - 32767, 99999, 100000, 100001})); + + } + + private int[] getSortedValues(IntHashSetRh s) { + IntListIterator it = s.iterator(); + int[] r = new int[s.size()]; + int i = 0; + while (it.hasNext()) { + r[i++] = it.nextNvc(); + } + Arrays.sort(r); + return r; + } + + public void testContains() { + ihs.add(1188); + ihs.add(1040); + assertTrue(ihs.contains(1188)); + assertTrue(ihs.contains(1040)); + assertFalse(ihs.contains(1)); + assertFalse(ihs.contains(99)); + } + +// public void testTableSpace() { +// assertEquals(32, IntHashSetRh.tableSpace(19, 0.6f)); // 19 / .6 = 31.xxx, round to 32 +// assertEquals(64, IntHashSetRh.tableSpace(21, 0.6f)); +// assertEquals(32, ihs.tableSpace(21)); +// } + + public void testWontExpand() { + ihs = new IntHashSetRh(21); + assertEquals(16, ihs.getSpaceUsedInWords()); + assertTrue(ihs.wontExpand(20)); + assertFalse(ihs.wontExpand(21)); + } + + public void testExpandNpe() { + ihs.add(15); + ihs.add(150000); // makes 4 byte table entries + + for (int i = 1; i < 256; i++) { // 0 is invalid key + ihs.add(i); // causes resize, check no NPE etc thrown. + } + } + + public void testAddIntoRemovedSlot() { + long seed = // -5748775656590176364L; + new Random().nextLong(); + System.out.println("Random seed for testAddIntoRemovedSlot in " + this.getClass().getName() + ": " + seed); + random = new Random(seed); + + for (int i = 1; i < 100; i++) { + ihs.add(i); + assertEquals(i, ihs.size()); + } + + assertEquals(99, ihs.size()); + + /** Test with 2 byte numbers */ + checkRemovedReuse(true); + + ihs = new IntHashSetRh(); + for (int i = 1; i < 99; i++) { + ihs.add(i); + } + ihs.add(100000); // force 4 byte + checkRemovedReuse(false); + } + + private void checkRemovedReuse(boolean is2) { + assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 128 : 256)); + for (int i = 0; i < 100000; i++) { +// if (i % 25000 == 0) System.out.println(i); + int v = 1 + random.nextInt(100 + (i % 30000)); // random between 1 and 30,101 + int sz = ihs.size(); + boolean wasRemoved = ihs.remove(v); + assertEquals(sz - (wasRemoved ? 1 : 0), ihs.size()); + // debug + if (ihs.contains(v)) { + System.out.println("debug"); + } + assertTrue(!(ihs.contains(v))); + v = 1 + random.nextInt(100 + (i % 30000)); + sz = ihs.size(); + boolean wasAdded = ihs.add(v); + assertEquals(sz + (wasAdded ? 1 : 0), ihs.size()); + assertTrue(ihs.contains(v)); + } + assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 16384 : 32768) ); + + // 32,768, 16,384, 8,192, 4096, 2048, 1024, 512, 256 + // for 2 byte storage, is2 = true, and expected is: i / 2 + // for 4 byte storage, is2 = false, and expected is i + + ihs.clear(); // doesn't set 2nd time because size + removed > 1/2 the capacity + + for (int i = 32768; i > 128; i = i / 2) { + ihs.clear(); // sets 2nd time shrinkable + assertTrue(ihs.getSpaceUsedInWords() == (is2 ? i/2 : i)); + ihs.clear(); // shrinks + assertTrue(ihs.getSpaceUsedInWords() == (is2 ? i/4 : i/2)); + } +// ihs.clear(); +// + assertTrue(ihs.getSpaceUsedInWords() == (is2 ? 64: 128)); + + // table size should be 128, adding 100 items should cause expansion (84 == .66 * 128) + for (int i = 1; i < ((is2) ? 100 : 99); i++) { + ihs.add(i); + } + if (!is2) { + ihs.add(100000); + } + + assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 128 : 256)); + for (int i = 0; i < 1000; i++) { + int v = 1 + random.nextInt(100); + ihs.remove(v); + assertTrue(!(ihs.contains(v))); + ihs.add(v); + assertTrue(ihs.contains(v)); + } + + assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 128 : 256)); + + } + + public void testRandom() { + int countAdd = 0; + int dupsA = 0; + int notPres = 0; + int countRmv = 0; + + long seed = + new Random().nextLong(); + System.out.println("Random seed for testRandom in " + this.getClass().getName() + ": " + seed); + random = new Random(seed); + + for (int i = 1; i < 1024 * 1024; i++) { + int k = i & (1024 * 256) - 1; + if (k == 0) continue; + if (random.nextInt(3) > 0) { + int sz = ihs.size(); + if (ihs.add(k)) { + countAdd ++; + assertEquals(sz + 1, ihs.size()); + } else { + dupsA ++; + } + + } else { + int sz = ihs.size(); + if (ihs.remove(k)) { + countRmv ++; + assertEquals(sz - 1, ihs.size()); + } else { + notPres ++; + } + + } + } + + System.out.format("added: %,d dups: %,d rmvd: %,d notPres: %,d, size: %d%n", countAdd, dupsA, countRmv, notPres, ihs.size()); + assertEquals(countAdd - countRmv, ihs.size() ); + } +} diff --git a/uimaj-core/src/test/java/org/apache/uima/internal/util/ObjHashSetTestRh.java b/uimaj-core/src/test/java/org/apache/uima/internal/util/ObjHashSetTestRh.java new file mode 100644 index 0000000..e8aefc2 --- /dev/null +++ b/uimaj-core/src/test/java/org/apache/uima/internal/util/ObjHashSetTestRh.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.uima.internal.util; + +import java.util.Random; + +import junit.framework.TestCase; + + +public class ObjHashSetTestRh extends TestCase { + + ObjHashSetRh<Integer> ihs; + + Random random; + + public void setUp() { + ihs = new ObjHashSetRh<>(Integer.class); + } + + public void testBasic() { + + ihs.add(15); + ihs.add(188); + Integer[] sv = ihs.toArray(); + assertEquals(2, sv.length); + assertEquals(15 + 188, sv[0] + sv[1]); + + // test most positive / negative + ihs.clear(); + ihs.add(189); + ihs.add(1000); + ihs.add(-1000); + ihs.add(500); + ihs.add(-500); + ihs.remove(1000); + ihs.add(1001); + } + + public void testContains() { + ihs.add(1188); + ihs.add(1040); + assertTrue(ihs.contains(1188)); + assertTrue(ihs.contains(1040)); + assertFalse(ihs.contains(1)); + assertFalse(ihs.contains(99)); + } + +// public void testTableSpace() { +// assertEquals(32, ObjHashSetRh.tableSpace(19, 0.6f)); // 19 / .6 = 31.xxx, round to 32 +// assertEquals(64, ObjHashSetRh.tableSpace(21, 0.6f)); +// assertEquals(32, ihs.tableSpace(21)); +// } + + public void testExpandNpe() { + ihs.add(15); + ihs.add(150000); // makes 4 byte table entries + + for (int i = 1; i < 256; i++) { // 0 is invalid key + ihs.add(i); // causes resize, check no NPE etc thrown. + } + } + + public void testAddIntoRemovedSlot() { + long seed = // -6401324561844760524L; + new Random().nextLong(); + System.out.println("Random seed for testAddIntoRemovedSlot in " + this.getClass().getName() + ": " + seed); + random = new Random(seed); + + for (int i = 1; i < 100; i++) { + ihs.add(i); + assertEquals(i, ihs.size()); + } + + assertEquals(99, ihs.size()); + + /** Test with 2 byte numbers */ + checkRemovedReuse(true); + + ihs = new ObjHashSetRh<>(Integer.class); + for (int i = 1; i < 99; i++) { + ihs.add(i); + } + ihs.add(100000); // force 4 byte + checkRemovedReuse(false); + } + + private void checkRemovedReuse(boolean is2) { + for (int i = 0; i < 100000; i++) { + int v = 1 + random.nextInt(100 + (i % 30000)); // random between 1 and 30,101 + int sz = ihs.size(); + boolean wasRemoved = ihs.remove(v); + assertEquals(sz - (wasRemoved ? 1 : 0), ihs.size()); + assertTrue(!(ihs.contains(v))); + v = 1 + random.nextInt(100 + (i % 30000)); + sz = ihs.size(); + boolean wasAdded = ihs.add(v); + assertEquals(sz + (wasAdded ? 1 : 0), ihs.size()); + assertTrue(ihs.contains(v)); + } + + ihs.clear(); // doesn't set 2nd time because size + removed > 1/2 the capacity + + for (int i = 32768; i > 128; i = i / 2) { + ihs.clear(); // sets 2nd time shrinkable + assertEquals(i, ihs.getCapacity()); + ihs.clear(); // shrinks + assertEquals(i/2, ihs.getCapacity()); + } +// ihs.clear(); +// + + // table size should be 128, adding 100 items should cause expansion (84 == .66 * 128) + for (int i = 1; i < 100; i++) { + ihs.add(i); + } + + assertEquals(256, ihs.getCapacity()); + for (int i = 0; i < 1000; i++) { + int v = 1 + random.nextInt(100); + ihs.remove(v); + assertTrue(!(ihs.contains(v))); + ihs.add(v); + assertTrue(ihs.contains(v)); + } + + assertEquals(256, ihs.getCapacity()); + + } + + public void testRandom() { + int countAdd = 0; + int dupsA = 0; + int notPres = 0; + int countRmv = 0; + + long seed = + new Random().nextLong(); + System.out.println("Random seed for testRandom in " + this.getClass().getName() + ": " + seed); + random = new Random(seed); + + for (int i = 1; i < 1024 * 1024; i++) { + int k = i & (1024 * 256) - 1; + if (k == 0) continue; + if (random.nextInt(3) > 0) { + int sz = ihs.size(); + if (ihs.add(k)) { + countAdd ++; + assertEquals(sz + 1, ihs.size()); + } else { + dupsA ++; + } + + } else { + int sz = ihs.size(); + if (ihs.remove(k)) { + countRmv ++; + assertEquals(sz - 1, ihs.size()); + } else { + notPres ++; + } + + } + } + + System.out.format("added: %,d dups: %,d rmvd: %,d notPres: %,d, size: %d%n", countAdd, dupsA, countRmv, notPres, ihs.size()); + assertEquals(countAdd - countRmv, ihs.size() ); + } +}