[GitHub] [lucene] ChrisHegarty commented on pull request #12311: Integrate the Incubating Panama Vector API

via GitHub Sat, 20 May 2023 02:38:52 -0700


ChrisHegarty commented on PR #12311:
URL: https://github.com/apache/lucene/pull/12311#issuecomment-1555874325


   Just dumping an initial round of benchmark results, etc, based on what is 
currently in this PR. 
   
   <details>
   
   <summary>Benchmark source (derived from Robert's)</summary>
   
   ```
   davekim$ cat src/main/java/testing/DotProductBenchmark.java 
   package testing;
   
   import org.openjdk.jmh.annotations.*;
   
   import java.util.concurrent.atomic.AtomicInteger;
   import java.util.concurrent.ThreadLocalRandom;
   import java.util.concurrent.TimeUnit;
   import java.util.stream.IntStream;
   
   import jdk.incubator.vector.FloatVector;
   import jdk.incubator.vector.VectorOperators;
   import jdk.incubator.vector.VectorSpecies;
   
   @BenchmarkMode(Mode.Throughput)
   @OutputTimeUnit(TimeUnit.MICROSECONDS)
   @State(Scope.Benchmark)
   @Warmup(iterations = 3, time = 3)
   @Measurement(iterations = 5, time = 3)
   @Fork(value = 1, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
   public class DotProductBenchmark {
   
     private float[] a;
     private float[] b;
   
     @Param({"1", "4", "6", "8", "13", "16", "25", "32", "64", "100", "128", 
"207", "256", "300", "512", "702", "1024"})
     //@Param({"1", "4", "6", "8", "13", "16", "25", "32", "64", "100" })
     //@Param({"1024"})
     int size;
   
     @Setup(Level.Trial)
     public void init() {
       a = new float[size];
       b = new float[size];
       for (int i = 0; i < size; ++i) {
         a[i] = ThreadLocalRandom.current().nextFloat();
         b[i] = ThreadLocalRandom.current().nextFloat();
       }
     }
   
     static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED;
   
     @Benchmark
     public float dotProductNew() {
       if (a.length != b.length) {
         throw new IllegalArgumentException("vector dimensions differ: " + 
a.length + "!=" + b.length);
       }
       int i = 0;
       float res = 0;
       // if the array size is large (2x platform vector size), its worth the 
overhead to vectorize
       // vector loop is unrolled a single time (2 accumulators in parallel)
       if (a.length >= 2 * SPECIES.length()) {
         FloatVector acc1 = FloatVector.zero(SPECIES);
         FloatVector acc2 = FloatVector.zero(SPECIES);
         int upperBound = SPECIES.loopBound(a.length - SPECIES.length());
         for (; i < upperBound; i += 2 * SPECIES.length()) {
           FloatVector va = FloatVector.fromArray(SPECIES, a, i);
           FloatVector vb = FloatVector.fromArray(SPECIES, b, i);
           acc1 = acc1.add(va.mul(vb));
           FloatVector vc = FloatVector.fromArray(SPECIES, a, i + 
SPECIES.length());
           FloatVector vd = FloatVector.fromArray(SPECIES, b, i + 
SPECIES.length());
           acc2 = acc2.add(vc.mul(vd));
         }
         res += acc1.reduceLanes(VectorOperators.ADD) + 
acc2.reduceLanes(VectorOperators.ADD);
       }
       for (; i < a.length; i++) {
         res += b[i] * a[i];
       }
       return res;
     }
   
     @Benchmark
     public float dotProductOld() {
       if (a.length != b.length) {
         throw new IllegalArgumentException("vector dimensions differ: " + 
a.length + "!=" + b.length);
       }
       float res = 0f;
       /*
        * If length of vector is larger than 8, we use unrolled dot product to 
accelerate the
        * calculation.
        */
       int i;
       for (i = 0; i < a.length % 8; i++) {
         res += b[i] * a[i];
       }
       if (a.length < 8) {
         return res;
       }
       for (; i + 31 < a.length; i += 32) {
         res +=
             b[i + 0] * a[i + 0]
                 + b[i + 1] * a[i + 1]
                 + b[i + 2] * a[i + 2]
                 + b[i + 3] * a[i + 3]
                 + b[i + 4] * a[i + 4]
                 + b[i + 5] * a[i + 5]
                 + b[i + 6] * a[i + 6]
                 + b[i + 7] * a[i + 7];
         res +=
             b[i + 8] * a[i + 8]
                 + b[i + 9] * a[i + 9]
                 + b[i + 10] * a[i + 10]
                 + b[i + 11] * a[i + 11]
                 + b[i + 12] * a[i + 12]
                 + b[i + 13] * a[i + 13]
                 + b[i + 14] * a[i + 14]
                 + b[i + 15] * a[i + 15];
         res +=
             b[i + 16] * a[i + 16]
                 + b[i + 17] * a[i + 17]
                 + b[i + 18] * a[i + 18]
                 + b[i + 19] * a[i + 19]
                 + b[i + 20] * a[i + 20]
                 + b[i + 21] * a[i + 21]
                 + b[i + 22] * a[i + 22]
                 + b[i + 23] * a[i + 23];
         res +=
             b[i + 24] * a[i + 24]
                 + b[i + 25] * a[i + 25]
                 + b[i + 26] * a[i + 26]
                 + b[i + 27] * a[i + 27]
                 + b[i + 28] * a[i + 28]
                 + b[i + 29] * a[i + 29]
                 + b[i + 30] * a[i + 30]
                 + b[i + 31] * a[i + 31];
       }
       for (; i + 7 < a.length; i += 8) {
         res +=
             b[i + 0] * a[i + 0]
                 + b[i + 1] * a[i + 1]
                 + b[i + 2] * a[i + 2]
                 + b[i + 3] * a[i + 3]
                 + b[i + 4] * a[i + 4]
                 + b[i + 5] * a[i + 5]
                 + b[i + 6] * a[i + 6]
                 + b[i + 7] * a[i + 7];
       }
       return res;
     }
   }
   
   
   ```
   
   </details>
   
   <details>
   
   <summary>Benchmark results</summary>
   
   ```
   Benchmark                          (size)   Mode  Cnt    Score   Error   Units
   DotProductBenchmark.dotProductNew       1  thrpt    5  486.822 ± 1.260  ops/us
   DotProductBenchmark.dotProductOld       1  thrpt    5  547.520 ± 1.362  ops/us
   
   DotProductBenchmark.dotProductNew       4  thrpt    5  276.907 ± 1.612  ops/us
   DotProductBenchmark.dotProductOld       4  thrpt    5  398.279 ± 1.195  ops/us
   
   DotProductBenchmark.dotProductNew       6  thrpt    5  273.141 ± 1.060  ops/us
   DotProductBenchmark.dotProductOld       6  thrpt    5  364.975 ± 1.939  ops/us
   
   DotProductBenchmark.dotProductNew       8  thrpt    5  219.088 ± 0.538  ops/us
   DotProductBenchmark.dotProductOld       8  thrpt    5  273.919 ± 0.897  ops/us
   
   DotProductBenchmark.dotProductNew      13  thrpt    5  186.654 ± 0.075  ops/us
   DotProductBenchmark.dotProductOld      13  thrpt    5  199.216 ± 0.476  ops/us
   
   DotProductBenchmark.dotProductNew      16  thrpt    5  160.680 ± 0.401  ops/us
   DotProductBenchmark.dotProductOld      16  thrpt    5  155.481 ± 0.382  ops/us
   
   DotProductBenchmark.dotProductNew      25  thrpt    5  103.595 ± 0.358  ops/us
   DotProductBenchmark.dotProductOld      25  thrpt    5   99.612 ± 0.262  ops/us
   
   DotProductBenchmark.dotProductNew      32  thrpt    5   84.886 ± 0.342  ops/us
   DotProductBenchmark.dotProductOld      32  thrpt    5  103.425 ± 0.364  ops/us
   
   DotProductBenchmark.dotProductNew      64  thrpt    5   78.525 ± 1.889  ops/us
   DotProductBenchmark.dotProductOld      64  thrpt    5   53.223 ± 0.108  ops/us
   
   DotProductBenchmark.dotProductNew     100  thrpt    5   60.173 ± 0.453  ops/us
   DotProductBenchmark.dotProductOld     100  thrpt    5   32.104 ± 0.027  ops/us
   
   DotProductBenchmark.dotProductNew     128  thrpt    5   64.356 ± 0.145  ops/us
   DotProductBenchmark.dotProductOld     128  thrpt    5   27.143 ± 0.083  ops/us
   
   DotProductBenchmark.dotProductNew     207  thrpt    5   35.962 ± 0.015  ops/us
   DotProductBenchmark.dotProductOld     207  thrpt    5   16.279 ± 0.253  ops/us
   
   DotProductBenchmark.dotProductNew     256  thrpt    5   49.528 ± 1.180  ops/us
   DotProductBenchmark.dotProductOld     256  thrpt    5   13.683 ± 0.137  ops/us
   
   DotProductBenchmark.dotProductNew     300  thrpt    5   36.517 ± 0.104  ops/us
   DotProductBenchmark.dotProductOld     300  thrpt    5   11.232 ± 0.007  ops/us
   
   DotProductBenchmark.dotProductNew     512  thrpt    5   36.253 ± 0.004  ops/us
   DotProductBenchmark.dotProductOld     512  thrpt    5    6.866 ± 0.078  ops/us
   
   DotProductBenchmark.dotProductNew     702  thrpt    5   17.555 ± 0.184  ops/us
   DotProductBenchmark.dotProductOld     702  thrpt    5    4.855 ± 0.020  ops/us
   
   DotProductBenchmark.dotProductNew    1024  thrpt    5   23.363 ± 0.037  ops/us
   DotProductBenchmark.dotProductOld    1024  thrpt    5    3.439 ± 0.067  ops/us
   
   ```
   
   </details>
   
   <details>
   
   <summary>Machine details (for reference)</summary>
   
   ```
   davekim$ /home/chegar/binaries/jdk-20.0.1/bin/jshell --add-modules jdk.incubator.vector
   |  Welcome to JShell -- Version 20.0.1
   |  For an introduction type: /help intro
   
   jshell> jdk.incubator.vector.FloatVector.SPECIES_PREFERRED
   $1 ==> Species[float, 16, S_512_BIT]
   
   davekim$ cat /etc/lsb-release 
   DISTRIB_ID=Ubuntu
   DISTRIB_RELEASE=23.04
   DISTRIB_CODENAME=lunar
   DISTRIB_DESCRIPTION="Ubuntu 23.04"
   
   davekim$ lscpu
   Architecture:            x86_64
     CPU op-mode(s):        32-bit, 64-bit
     Address sizes:         39 bits physical, 48 bits virtual
     Byte Order:            Little Endian
   CPU(s):                  12
     On-line CPU(s) list:   0-11
   Vendor ID:               GenuineIntel
     Model name:            11th Gen Intel(R) Core(TM) i5-11400 @ 2.60GHz
       CPU family:          6
       Model:               167
       Thread(s) per core:  2
       Core(s) per socket:  6
       Socket(s):           1
       Stepping:            1
       CPU(s) scaling MHz:  62%
       CPU max MHz:         4400.0000
       CPU min MHz:         800.0000
       BogoMIPS:            5184.00
       Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr 
pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall 
nx pdpe1gb rdtscp lm constant_tsc 
                            art arch_perfmon pebs bts rep_good nopl xtopology 
nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl 
vmx est tm2 ssse3 sdbg fma cx16 
                            xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt 
tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch 
cpuid_fault invpcid_single ssbd ibrs ibpb 
                            stibp ibrs_enhanced tpr_shadow vnmi flexpriority 
ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx 
avx512f avx512dq rdseed adx smap avx512i
                            fma clflushopt intel_pt avx512cd sha_ni avx512bw 
avx512vl xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify 
hwp_act_window hwp_epp hwp_pkg_req
                             avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes 
vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm md_clear 
flush_l1d arch_capabilities
   Virtualization features: 
     Virtualization:        VT-x
   Caches (sum of all):     
     L1d:                   288 KiB (6 instances)
     L1i:                   192 KiB (6 instances)
     L2:                    3 MiB (6 instances)
     L3:                    12 MiB (1 instance)
   NUMA:                    
     NUMA node(s):          1
     NUMA node0 CPU(s):     0-11
   Vulnerabilities:         
     Itlb multihit:         Not affected
     L1tf:                  Not affected
     Mds:                   Not affected
     Meltdown:              Not affected
     Mmio stale data:       Mitigation; Clear CPU buffers; SMT vulnerable
     Retbleed:              Mitigation; Enhanced IBRS
     Spec store bypass:     Mitigation; Speculative Store Bypass disabled via 
prctl
     Spectre v1:            Mitigation; usercopy/swapgs barriers and __user 
pointer sanitization
     Spectre v2:            Mitigation; Enhanced IBRS, IBPB conditional, RSB 
filling, PBRSB-eIBRS SW sequence
     Srbds:                 Not affected
     Tsx async abort:       Not affected
   
   ```
   
   </details>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [lucene] ChrisHegarty commented on pull request #12311: Integrate the Incubating Panama Vector API

Reply via email to