[I] Compute gain with vector API in BPIndexReorderer [lucene]

via GitHub Wed, 25 Oct 2023 04:39:58 -0700


gf2121 opened a new issue, #12721:
URL: https://github.com/apache/lucene/issues/12721


   ### Description
   
   An immature idea ! :)
   
   I noticed that `BPIndexReorderer$ComputeGainsTask#computeGain()` took a lot 
in CPU profile:
   
   ```
   PERCENT       CPU SAMPLES   STACK
   4.75%         53042         
org.apache.lucene.misc.index.BPIndexReorderer$ComputeGainsTask#computeGain()
   4.47%         49905         
org.apache.lucene.analysis.standard.StandardTokenizerImpl#getNextToken()
   3.87%         43194         
org.apache.lucene.index.FreqProxTermsWriter$SortingPostingsEnum#reset()
   3.67%         40975         
org.apache.lucene.index.TermsHashPerField#writeByte()
   3.59%         40003         org.apache.lucene.util.BytesRefHash#equals()
   3.55%         39588         
org.apache.lucene.store.MemorySegmentIndexInput#readByte()
   2.80%         31197         
org.apache.lucene.util.ByteBlockPool#setBytesRef()
   2.52%         28154         java.io.BufferedOutputStream#write()
   2.33%         25978         org.apache.lucene.util.BytesRefHash#findHash()
   2.32%         25896         
org.apache.lucene.codecs.lucene90.Lucene90PostingsWriter#startDoc()
   2.24%         24966         org.apache.lucene.util.Sorter#binarySort()
   2.23%         24864         org.apache.lucene.store.DataInput#readVInt()
   2.10%         23484         
org.apache.lucene.misc.index.BPIndexReorderer$ForwardIndex#seek()
   2.04%         22808         
org.apache.lucene.codecs.PushPostingsWriterBase#writeTerm()
   ```
   
   I tried to implement this with vector api, the result looks good on AVX-512 
(Java 21):
   ```
   Benchmark              (maxTerm)  (termsNum)   Mode  Cnt  Score   Error   
Units
   GainBenchmark.gainNew       1024         128  thrpt    5  3.471 ± 0.142  
ops/us
   GainBenchmark.gainOld       1024         128  thrpt    5  2.031 ± 0.432  
ops/us
   ```
   
   
   <details><summary> Code </summary>
   
   ```
   package testing;
   
   import jdk.incubator.vector.FloatVector;
   import jdk.incubator.vector.IntVector;
   import jdk.incubator.vector.Vector;
   import jdk.incubator.vector.VectorMask;
   import jdk.incubator.vector.VectorOperators;
   import jdk.incubator.vector.VectorSpecies;
   import org.openjdk.jmh.annotations.*;
   
   import java.util.Arrays;
   import java.util.concurrent.ThreadLocalRandom;
   import java.util.concurrent.TimeUnit;
   
   @BenchmarkMode(Mode.Throughput)
   @OutputTimeUnit(TimeUnit.MICROSECONDS)
   @State(Scope.Benchmark)
   @Warmup(iterations = 3, time = 3)
   @Measurement(iterations = 5, time = 3)
   @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
   public class GainBenchmark {
     private static final VectorSpecies<Integer> INT_SPECIES = 
IntVector.SPECIES_PREFERRED;
     private static final VectorSpecies<Float> FLOAT_SPECIES = 
FloatVector.SPECIES_PREFERRED;
   
     private int[] terms;
     private int[] fromFreqs;
     private int[] toFreqs;
     private final int[] indicesBuffer = new int[128];
     @Param({"128"})
     int termsNum;
     @Param({"1024"})
     int maxTerm;
   
     private static final float[] LOG2_TABLE = new float[256];
   
     static {
       LOG2_TABLE[0] = 1f;
       // float that has the biased exponent of 1f and zeros for sign and 
mantissa bits
       final int one = Float.floatToIntBits(1f);
       for (int i = 0; i < 256; ++i) {
         float f = Float.intBitsToFloat(one | (i << (23 - 8)));
         LOG2_TABLE[i] = (float) (Math.log(f) / Math.log(2));
       }
     }
   
     @Setup(Level.Trial)
     public void init() {
       terms = new int[termsNum];
       for (int i=0; i < termsNum; i++) {
         terms[i] = ThreadLocalRandom.current().nextInt(maxTerm);
       }
       fromFreqs = new int[maxTerm];
       toFreqs = new int[maxTerm];
       for (int i = 0; i < maxTerm; i++) {
         fromFreqs[i] = ThreadLocalRandom.current().nextInt(16);
         toFreqs[i] = ThreadLocalRandom.current().nextInt(16);
       }
       float o = gainOld();
       float n = gainNew();
       if (Math.abs(o - n) > 1E-4f) {
         throw new RuntimeException("New is wrong, old: " + o + ", new: " + n);
       }
     }
   
     @Benchmark
     public float gainOld() {
       float gain = 0;
       for (int i = 0; i < terms.length; ++i) {
         final int termID = terms[i];
         final int fromDocFreq = fromFreqs[termID];
         final int toDocFreq = toFreqs[termID];
         assert fromDocFreq >= 0;
         assert toDocFreq >= 0;
         gain +=
             (toDocFreq == 0 ? 0 : fastLog2(toDocFreq))
                 - (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq));
       }
       return gain;
     }
   
     @Benchmark
     public float gainNew() {
       final int upperBound = INT_SPECIES.loopBound(termsNum);
       int i = 0;
       FloatVector acc = FloatVector.zero(FLOAT_SPECIES);
       for (; i < upperBound; i += INT_SPECIES.length()) {
         IntVector fromVector = IntVector.fromArray(INT_SPECIES, fromFreqs, 0, 
terms, i);
         IntVector toVector = IntVector.fromArray(INT_SPECIES, toFreqs, 0, 
terms, i);
         Vector<Float> gainVector = 
fastLog2(toVector).sub(fastLog2(fromVector));
         acc = acc.add(gainVector);
   
       }
       float gain = acc.reduceLanes(VectorOperators.ADD);
   
       for ( ; i < terms.length; ++i) {
         final int termID = terms[i];
         final int fromDocFreq = fromFreqs[termID];
         final int toDocFreq = toFreqs[termID];
         assert fromDocFreq >= 0;
         assert toDocFreq >= 0;
         gain +=
             (toDocFreq == 0 ? 0 : fastLog2(toDocFreq))
                 - (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq));
       }
       return gain;
     }
   
     private static final IntVector BROAD_31 = IntVector.broadcast(INT_SPECIES, 
31);
     private static final IntVector BROAD_32 = IntVector.broadcast(INT_SPECIES, 
32);
     private static final IntVector BROAD_24 = IntVector.broadcast(INT_SPECIES, 
24);
     private static final FloatVector BROAD_0F = 
FloatVector.broadcast(FLOAT_SPECIES, 0);
   
     private Vector<Float> fastLog2(IntVector intVector) {
       VectorMask<Integer> mask = intVector.test(VectorOperators.IS_DEFAULT);
       IntVector floorLog2 = 
BROAD_31.sub(intVector.lanewise(VectorOperators.LEADING_ZEROS_COUNT));
       IntVector tableIndices = intVector
           .lanewise(VectorOperators.LSHL, BROAD_32.sub(floorLog2))
           .lanewise(VectorOperators.LSHR, BROAD_24);
       tableIndices.intoArray(indicesBuffer, 0);
       return floorLog2.convert(VectorOperators.I2F, 0)
           .add(FloatVector.fromArray(FLOAT_SPECIES, LOG2_TABLE, 0, 
indicesBuffer, 0))
           .blend(BROAD_0F, mask.cast(FLOAT_SPECIES));
     }
   
     static float fastLog2(int i) {
       assert i > 0 : "Cannot compute log of i=" + i;
       int floorLog2 = 31 - Integer.numberOfLeadingZeros(i);
       int tableIndex = i << (32 - floorLog2) >>> (32 - 8);
       return floorLog2 + LOG2_TABLE[tableIndex];
     }
   }
   ```
   
   </details>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[I] Compute gain with vector API in BPIndexReorderer [lucene]

Reply via email to