(systemds) branch main updated: [SYSTEMDS-3920] Vector API in more codegen primitives

mboehm7 Sat, 28 Mar 2026 07:58:50 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 422fba7050 [SYSTEMDS-3920] Vector API in more codegen primitives
422fba7050 is described below

commit 422fba7050e2326fc3aac2fb9dd42ca8e3ac21b5
Author: JulianJuelg <[email protected]>
AuthorDate: Sat Mar 28 15:57:07 2026 +0100

    [SYSTEMDS-3920] Vector API in more codegen primitives
    
    Closes #2428.
---
 .../sysds/runtime/codegen/LibSpoofPrimitives.java  | 733 ++++++++++++++++--
 .../sysds/runtime/matrix/data/LibMatrixMult.java   |  39 +
 .../primitives_vector_api/BenchCase.java           | 345 +++++++++
 .../primitives_vector_api/BenchUtil.java           |  84 ++
 .../performance/primitives_vector_api/Ctx.java     |  78 ++
 .../primitives_vector_api/PrimitivePerfSuite.java  |  63 ++
 .../backup_primitives_for_benchmark.java           | 862 +++++++++++++++++++++
 7 files changed, 2125 insertions(+), 79 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java 
b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
index ebb42676f0..86f2284c50 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
@@ -28,10 +28,16 @@ import org.apache.sysds.runtime.functionobjects.BitwAnd;
 import org.apache.sysds.runtime.functionobjects.IntegerDivide;
 import org.apache.sysds.runtime.functionobjects.Modulus;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling;
 import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
-import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorMask;
 
 /**
  * This library contains all vector primitives that are used in 
@@ -45,6 +51,13 @@ public class LibSpoofPrimitives
        private static IntegerDivide intDiv = IntegerDivide.getFnObject();
        private static Modulus mod = Modulus.getFnObject();
        private static BitwAnd bwAnd = BitwAnd.getBitwAndFnObject();
+
+       // Vector API initializations
+       private static final VectorSpecies<Double> SPECIES = 
DoubleVector.SPECIES_PREFERRED;
+       @SuppressWarnings("unused")
+       private static final VectorSpecies<Float> FSPECIES = 
FloatVector.SPECIES_PREFERRED;
+       private static final int vLen = SPECIES.length();
+
        
        //global pool of reusable vectors, individual operations set up their 
own thread-local
        //ring buffers of reusable vectors with specific number of vectors and 
vector sizes 
@@ -57,13 +70,32 @@ public class LibSpoofPrimitives
        };
 
        public static double rowMaxsVectMult(double[] a, double[] b, int ai, 
int bi, int len) {
-               double val = Double.NEGATIVE_INFINITY;
-               int j=0;
-               for( int i = ai; i < ai+len; i++ )
-                       val = Math.max(a[i]*b[j++], val);
-               return val;
+               double maxVal = Double.NEGATIVE_INFINITY;
+       
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+       
+               DoubleVector vmax = DoubleVector.broadcast(SPECIES, 
Double.NEGATIVE_INFINITY);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi 
+ i);
+                       DoubleVector prod = va.mul(vb);
+                       vmax = vmax.max(prod);
+               }
+       
+               maxVal = vmax.reduceLanes(VectorOperators.MAX);
+       
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
+               }
+       
+               return maxVal;
        }
 
+       // note: parameter bi unused
        public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, 
int ai, int bi, int len) {
                double val = Double.NEGATIVE_INFINITY;
                for( int i = ai; i < ai+len; i++ )
@@ -71,6 +103,32 @@ public class LibSpoofPrimitives
                return val;
        }
 
+       // not in use: vector api implementation slower than scalar loop version
+       public static double rowMaxsVectMult_vector_api(double[] a, double[] b, 
int[] aix, int ai, int bi, int len) {
+               double scalarMax = Double.NEGATIVE_INFINITY;
+
+               int i = 0;
+               int upperBound = SPECIES.loopBound(len);
+               DoubleVector vmax = DoubleVector.broadcast(SPECIES, 
Double.NEGATIVE_INFINITY);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upperBound; i += SPECIES.length()) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, 
aix, ai + i);
+                       DoubleVector prod = va.mul(vb);
+                       vmax = vmax.max(prod);
+               }
+               scalarMax = Math.max(scalarMax, 
vmax.reduceLanes(VectorOperators.MAX));
+
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       double prod = a[ai + i] * b[aix[ai + i]];
+                       if (prod > scalarMax)
+                               scalarMax = prod;
+               }
+               return scalarMax;
+       }
+
        // forwarded calls to LibMatrixMult
        public static double dotProduct(double[] a, double[] b, int ai, int bi, 
int len) {
                if( a == null || b == null ) return 0;
@@ -295,6 +353,7 @@ public class LibSpoofPrimitives
         * @param len number of processed elements
         * @return sum value
         */
+
        public static double vectSum(double[] a, int ai, int len) { 
                double val = 0;
                final int bn = len%8;
@@ -313,6 +372,27 @@ public class LibSpoofPrimitives
                //scalar result
                return val; 
        } 
+       // not in use: vector api implementation slower than scalar loop version
+       public static double vectSum_vector_api(double[] a, int ai, int len) {
+        double sum = 0d;
+        int i = 0;
+
+        DoubleVector acc = DoubleVector.zero(SPECIES);
+        int upperBound = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+        for (; i < upperBound; i += SPECIES.length()) {
+            DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+            acc = acc.add(v);
+        }
+        sum += acc.reduceLanes(VectorOperators.ADD);
+
+        //rest, not aligned to vLen-blocks
+        for (; i < len; i++) {
+            sum += a[ai + i];
+        }
+        return sum;
+    }
        
        public static double vectSum(double[] avals, int[] aix, int ai, int 
alen, int len) {
                //forward to dense as column indexes not required here
@@ -327,36 +407,82 @@ public class LibSpoofPrimitives
                return LibMatrixMult.dotProduct(avals, avals, ai, ai, alen);
        }
        
-       public static double vectMin(double[] a, int ai, int len) { 
+       public static double scalarvectMin(double[] a, int ai, int len) { 
                double val = Double.POSITIVE_INFINITY;
                for( int i = ai; i < ai+len; i++ )
                        val = Math.min(a[i], val);
                return val; 
        }
+
+       public static double vectMin(double[] a, int ai, int len) {     
+               int i = 0;
+               int upperBound = SPECIES.loopBound(len);
+               DoubleVector vmin = DoubleVector.broadcast(SPECIES, 
Double.POSITIVE_INFINITY);
+       
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       vmin = vmin.min(v);
+               }
+               double minVal = vmin.reduceLanes(VectorOperators.MIN);
+       
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       minVal = Math.min(minVal, a[ai + i]);
+               }
+               return minVal;
+       }
        
        public static double vectMin(double[] avals, int[] aix, int ai, int 
alen, int len) {
                double val = vectMin(avals, ai, alen);
                return (alen<len) ? Math.min(val, 0) : val;
        }
        
-       public static double vectMax(double[] a, int ai, int len) { 
-               double val = Double.NEGATIVE_INFINITY;
-               for( int i = ai; i < ai+len; i++ )
-                       val = Math.max(a[i], val);
-               return val; 
-       } 
+
+       public static double vectMax(double[] a, int ai, int len) {
+               int i = 0;
+               int upperBound = SPECIES.loopBound(len);
+               DoubleVector vmax = DoubleVector.broadcast(SPECIES, 
Double.NEGATIVE_INFINITY);
+       
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       vmax = vmax.max(v);
+               }
+               double maxVal = vmax.reduceLanes(VectorOperators.MAX);
+
+               //rest, not aligned to vLen-blocks      
+               for(;i<len;i++){
+                       maxVal = Math.max(a[ai + i],maxVal);
+               }
+               return maxVal;
+       }
        
        public static double vectMax(double[] avals, int[] aix, int ai, int 
alen, int len) {
                double val = vectMax(avals, ai, alen);
                return (alen<len) ? Math.max(val, 0) : val;
        }
        
-       public static double vectCountnnz(double[] a, int ai, int len) { 
+       //counts non-zeros in a[ai,ai+len); loop index i is relative to ai
+       public static double vectCountnnz(double[] a, int ai, int len) {
                int count = 0;
-               for( int i = ai; i < ai+len; i++ )
+               int i = 0;
+               int upperBound = SPECIES.loopBound(len);
+               DoubleVector vzero = DoubleVector.zero(SPECIES);
+       
+               //vectorized main loop over full vLen-blocks
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       VectorMask<Double> nz = v.compare(VectorOperators.NE, 
vzero);
+                       count += nz.trueCount();
+               }
+       
+               //rest, not aligned to vLen-blocks (offset ai must be applied)
+               for(;i<len;i++){
-                       count += (a[i] != 0) ? 1 : 0;
+                       count += (a[ai + i] != 0) ? 1 : 0;
+               }
                return count;
-       } 
+       }
        
        public static double vectCountnnz(double[] avals, int[] aix, int ai, 
int alen, int len) {
                //pure meta data operation
@@ -372,26 +498,106 @@ public class LibSpoofPrimitives
        }
        
        //custom vector div
-       
-       public static void vectDivAdd(double[] a, double bval, double[] c, int 
ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] +=  a[j] / bval;
+       //c[ci+i] += a[ai+i] / bval; true division as in the sparse variant
+       public static void vectDivAdd(double[] a, double bval, double[] c, int 
ai, int ci, int len) {
+               final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+               final int upperBound = SPECIES.loopBound(len);
+               int i = 0;
+
+               //vectorized main loop over full vLen-blocks
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci 
+ i);
+                       vc.add(va.div(vb)).intoArray(c, ci + i);
+               }
+
+               //rest, not aligned to vLen-blocks (a*(1/bval) != a/bval)
+               for (; i < len; i++) {
+                       c[ci + i] += a[ai + i] / bval;
+               }
        }
        
+
        public static void vectDivAdd(double bval, double[] a, double[] c, int 
ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] +=  bval / a[j];
+               int i = 0;
+               int upperBound = SPECIES.loopBound(len);
+               DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci 
+ i);
+                       vc = vc.add(vb.div(va));
+                       vc.intoArray(c, ci + i);
+               }
+
+               //rest, not aligned to vLen-blocks      
+               for (;i<len;i++){
+                       c[ci+i] += bval/a[ai+i];
+               }
        }
 
+
        public static void vectDivAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int alen, int len) {
                for( int j = ai; j < ai+alen; j++ )
                        c[ci + aix[j]] += a[j] / bval;
        }
+
+       // not in use: vector api implementation slower than scalar loop version
+       public static void vectDivAdd_vector_api(double[] a, double bval, 
double[] c, int[] aix, int ai, int ci, int alen, int len) {
+
+               final double inv = 1.0 / bval;
+               int i = 0;
+               int upperBound = SPECIES.loopBound(alen);
+               DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vcontrib = va.mul(vinv);
+
+                       // scatter-add lane-by-lane
+                       for (int lane = 0; lane < vLen; lane++) {
+                               int idx = ci + aix[ai + i + lane];
+                               c[idx] += vcontrib.lane(lane);
+                       }
+               }
+
+               //rest, not aligned to vLen-blocks
+               for(; i<alen; i++){
+                       c[ci + aix[ai + i]] += a[ai + i] * inv;
+               }
+       }
+
        
        public static void vectDivAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int alen, int len) {
                for( int j = ai; j < ai+alen; j++ )
                        c[ci + aix[j]] += bval / a[j];
        }
+
+       // not in use: vector api implementation slower than scalar loop version
+       public static void vectDivAdd_vector_api(double bval, double[] a, 
double[] c, int[] aix, int ai, int ci, int alen, int len) {
+               int i = 0;
+               int upperBound = SPECIES.loopBound(alen);
+               DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vcontrib = vb.div(va);
+
+                       // scatter-add lane-by-lane
+                       for (int lane = 0; lane < vLen; lane++) {
+                               int idx = ci + aix[ai + i + lane];
+                               c[idx] += vcontrib.lane(lane);
+                       }       
+               }
+               //rest, not aligned to vLen-blocks
+               for (; i<alen; i++){
+                       c[ci + aix[ai + i]] += bval / a[ai +i];
+               }
+       }
+
        
        public static double[] vectDivWrite(double[] a, double bval, int ai, 
int len) {
                double[] c = allocVector(len, false);
@@ -399,6 +605,28 @@ public class LibSpoofPrimitives
                        c[j] = a[ai+j] / bval;
                return c;
        }
+
+       // not in use: vector api implementation slower than scalar loop version
+       public static double[] vectDivWrite_vector_api(double[] a, double bval, 
int ai, int len) {
+               double[] c = allocVector(len, false);
+               final double inv = 1.0 / bval;
+               final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv);
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       va.mul(vinv).intoArray(c, i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       c[i] = a[ai + i] * inv;
+               }
+               return c;
+       }
+
        
        public static double[] vectDivWrite(double bval, double[] a, int ai, 
int len) {
                double[] c = allocVector(len, false);
@@ -406,6 +634,26 @@ public class LibSpoofPrimitives
                        c[j] = bval / a[ai + j];
                return c;
        }
+
+       // not in use: vector api implementation slower than scalar loop version
+       public static double[] vectDivWrite_vector_api(double bval, double[] a, 
int ai, int len) {
+               double[] c = allocVector(len, false);
+               final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       vb.div(va).intoArray(c, i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for (; i<len; i++){
+                       c[i] = bval / a[ai + i];
+               }
+               return c;
+       }
        
        public static double[] vectDivWrite(double[] a, double[] b, int ai, int 
bi, int len) {
                double[] c = allocVector(len, false);
@@ -414,6 +662,26 @@ public class LibSpoofPrimitives
                return c;
        }
 
+       // not in use: vector api implementation slower than scalar loop version
+       public static double[] vectDivWrite_vector_api(double[] a, double[] b, 
int ai, int bi, int len) {
+               double[] c = allocVector(len, false);
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi 
+ i);
+                       va.div(vb).intoArray(c, i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for(; i <len; i++){
+                       c[i] = a[ai + i] / b[bi + i];
+               }
+               return c;
+       }
+
        public static double[] vectDivWrite(double[] a, double bval, int[] aix, 
int ai, int alen, int len) {
                double init = (bval != 0) ? 0 : Double.NaN;
                double[] c = allocVector(len, true, init);
@@ -1480,10 +1748,9 @@ public class LibSpoofPrimitives
        }
        
        //custom mult2
-       
+
        public static void vectMult2Add(double[] a, double[] c, int ai, int ci, 
int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] +=  a[j] + a[j];
+               LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len);
        }
        
        public static void vectMult2Add(double[] a, double[] c, int[] aix, int 
ai, int ci, int alen, int len) {
@@ -1493,10 +1760,10 @@ public class LibSpoofPrimitives
        
        public static double[] vectMult2Write(double[] a, int ai, int len) {
                double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++)
-                       c[j] = a[ai] + a[ai];
+               LibMatrixMult.vectMultiplyWrite(2.0,a,c,ai,0,len);
                return c;
        }
+
        
        public static double[] vectMult2Write(double[] a, int[] aix, int ai, 
int alen, int len) {
                double[] c = allocVector(len, true);
@@ -1586,9 +1853,30 @@ public class LibSpoofPrimitives
        //custom vector equal
        
        public static void vectEqualAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] += (a[j] == bval) ? 1 : 0;
-       }
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+               final DoubleVector bVec   = DoubleVector.broadcast(SPECIES, 
bval);
+               final DoubleVector ones   = DoubleVector.broadcast(SPECIES, 
1.0);
+               final DoubleVector zeros  = DoubleVector.zero(SPECIES);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, 
ci + i);
+
+                       VectorMask<Double> eq = 
aVec.compare(VectorOperators.EQ, bVec);
+
+                       DoubleVector inc = zeros.blend(ones, eq);
+
+                       cVec.add(inc).intoArray(c, ci + i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       c[ci + i] += (a[ai + i] == bval) ? 1.0 : 0.0;
+                       }
+               }
+       
        
        public static void vectEqualAdd(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
                vectEqualAdd(a, bval, c, ai, ci, len);
@@ -1609,21 +1897,56 @@ public class LibSpoofPrimitives
        
        public static double[] vectEqualWrite(double[] a, double bval, int ai, 
int len) {
                double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++)
-                       c[j] = (a[ai] == bval) ? 1 : 0;
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+               DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+               DoubleVector zeros = DoubleVector.zero(SPECIES);
+               DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+       
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       var mask = va.compare(VectorOperators.EQ, vb);
+                       DoubleVector out = zeros.blend(ones, mask);
+                       out.intoArray(c, i);
+               }
+       
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       c[i] = (a[ai + i] == bval) ? 1 : 0;
+               }
                return c;
        }
        
+       
        public static double[] vectEqualWrite(double bval, double[] a, int ai, 
int len) {
                return vectEqualWrite(a, bval, ai, len);
        }
        
+
        public static double[] vectEqualWrite(double[] a, double[] b, int ai, 
int bi, int len) {
-               double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++, bi++)
-                       c[j] = (a[ai] == b[bi]) ? 1 : 0;
-               return c;
-       }
+        double[] c = allocVector(len, false);
+        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+        int i = 0;
+        int upper = SPECIES.loopBound(len);
+
+        //unrolled vLen-block  (for better instruction-level parallelism)
+        for (; i < upper; i += vLen) {
+            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+            DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+            VectorMask<Double> eq = aVec.compare(VectorOperators.EQ, bVec);
+            DoubleVector out = zeros.blend(ones, eq);
+
+            out.intoArray(c, i);
+        }
+
+               //rest, not aligned to vLen-blocks
+        for (; i < len; i++) {
+            c[i] = (a[ai + i] == b[bi + i]) ? 1.0 : 0.0;
+        }
+        return c;
+    }
 
        public static double[] vectEqualWrite(double[] a, double bval, int[] 
aix, int ai, int alen, int len) {
                double init = (bval == 0) ? 1 : 0;
@@ -1655,8 +1978,27 @@ public class LibSpoofPrimitives
        //custom vector not equal
        
        public static void vectNotequalAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] += (a[j] != bval) ? 1 : 0;
+               final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, 
bval);
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+               DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+               DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i);
+
+               VectorMask<Double> ne = aVec.compare(VectorOperators.NE, bVec);
+               DoubleVector inc = zeros.blend(ones, ne);
+
+               cVec.add(inc).intoArray(c, ci + i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       c[ci + i] += (a[ai + i] != bval) ? 1.0 : 0.0;
+               }
        }
        
        public static void vectNotequalAdd(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
@@ -1675,13 +2017,31 @@ public class LibSpoofPrimitives
        public static void vectNotequalAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int alen, int len) {
                vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len);
        }
-       
+
        public static double[] vectNotequalWrite(double[] a, double bval, int 
ai, int len) {
-               double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++)
-                       c[j] = (a[ai] != bval) ? 1 : 0;
-               return c;
-       }
+        double[] c = allocVector(len, false);
+        final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+        int i = 0;
+        int upper = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+        for (; i < upper; i += vLen) {
+            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+            VectorMask<Double> ne = aVec.compare(VectorOperators.NE, bVec);
+            DoubleVector out = zeros.blend(ones, ne);
+
+            out.intoArray(c, i);
+        }
+
+               //rest, not aligned to vLen-blocks
+        for (; i < len; i++) {
+            c[i] = (a[ai + i] != bval) ? 1.0 : 0.0;
+        }
+        return c;
+    }
        
        public static double[] vectNotequalWrite(double bval, double[] a, int 
ai, int len) {
                return vectNotequalWrite(a, bval, ai, len);
@@ -1694,6 +2054,32 @@ public class LibSpoofPrimitives
                return c;
        }
 
+       // not in use: vector api implementation slower than scalar loop version
+       public static double[] vectNotequalWrite_vector_api(double[] a, 
double[] b, int ai, int bi, int len) {
+               double[] c = allocVector(len, false);
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+               
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, 
bi + i);
+
+                       VectorMask<Double> ne = 
aVec.compare(VectorOperators.NE, bVec);
+                       DoubleVector out = zeros.blend(ones, ne);
+
+                       out.intoArray(c, i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       c[i] = (a[ai + i] != b[bi + i]) ? 1.0 : 0.0;
+               }
+               return c;
+               }
+
        public static double[] vectNotequalWrite(double[] a, double bval, int[] 
aix, int ai, int alen, int len) {
                double init = (bval != 0) ? 1 : 0;
                double[] c = allocVector(len, true, init);
@@ -1723,9 +2109,29 @@ public class LibSpoofPrimitives
        //custom vector less
        
        public static void vectLessAdd(double[] a, double bval, double[] c, int 
ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] += (a[j] < bval) ? 1 : 0;
-       }
+               final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, 
bval);
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //unrolled vLen-block  (for better instruction-level 
parallelism)
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, 
ci + i);
+
+                       VectorMask<Double> lt = 
aVec.compare(VectorOperators.LT, bVec);
+                       DoubleVector inc = zeros.blend(ones, lt);
+
+                       cVec.add(inc).intoArray(c, ci + i);
+               }
+
+               //rest, not aligned to vLen-blocks
+               for (; i < len; i++) {
+                       c[ci + i] += (a[ai + i] < bval) ? 1.0 : 0.0;
+                       }
+               }
        
        public static void vectLessAdd(double bval, double[] a, double[] c, int 
ai, int ci, int len) {
                vectGreaterequalAdd(a, bval, c, ai, ci, len);
@@ -1743,24 +2149,66 @@ public class LibSpoofPrimitives
+       // Scalar-first variant with an index array: forwards all arguments
+       // (a and bval swapped) to vectGreaterequalAdd.
+       // NOTE(review): bval < a[i] is equivalent to a[i] > bval, whereas the
+       // callee name suggests a[i] >= bval; the two differ for a[i] == bval -
+       // confirm that this delegation is intended.
        public static void vectLessAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int alen, int len) {
                vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len);
        }
-       
+
+       /**
+        * Dense vector-scalar comparison: returns a vector c obtained from
+        * allocVector(len, false) with {@code c[i] = (a[ai+i] < bval) ? 1.0 : 0.0}
+        * for every i in [0,len).
+        * Indexes below SPECIES.loopBound(len) are computed with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop; together both
+        * loops overwrite all len entries.
+        */
        public static double[] vectLessWrite(double[] a, double bval, int ai, 
int len) {
-               double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++)
-                       c[j] = (a[ai] < bval) ? 1 : 0;
-               return c;
-       }
+        double[] c = allocVector(len, false);
+        final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+        int i = 0;
+        int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+        for (; i < upper; i += vLen) {
+            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+
+            VectorMask<Double> lt = aVec.compare(VectorOperators.LT, bVec);
+            DoubleVector out = zeros.blend(ones, lt);
+
+            out.intoArray(c, i);
+        }
+
+               //scalar tail: the remaining len - upper elements
+        for (; i < len; i++) {
+            c[i] = (a[ai + i] < bval) ? 1.0 : 0.0;
+        }
+
+        return c;
+    }
+
        
+       // Scalar-first variant: returns the result of vectGreaterequalWrite
+       // called with a and bval swapped.
+       // NOTE(review): bval < a[i] is equivalent to a[i] > bval, whereas the
+       // callee name suggests a[i] >= bval; the two differ for a[i] == bval -
+       // confirm that this delegation is intended.
        public static double[] vectLessWrite(double bval, double[] a, int ai, 
int len) {
                return vectGreaterequalWrite(a, bval, ai, len);
        }
-       
+
+       /**
+        * Dense vector-vector comparison: returns a vector c obtained from
+        * allocVector(len, false) with
+        * {@code c[i] = (a[ai+i] < b[bi+i]) ? 1.0 : 0.0} for every i in [0,len).
+        * Indexes below SPECIES.loopBound(len) are computed with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
        public static double[] vectLessWrite(double[] a, double[] b, int ai, 
int bi, int len) {
                double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++, bi++)
-                       c[j] = (a[ai] < b[bi]) ? 1 : 0;
+
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, 
bi + i);
+
+                       VectorMask<Double> lt = 
aVec.compare(VectorOperators.LT, bVec);
+                       DoubleVector out = zeros.blend(ones, lt);
+
+                       out.intoArray(c, i);
+               }
+
+               //scalar tail: the remaining len - upper elements
+               for (; i < len; i++) {
+               c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0;
+               }
+
                return c;
-       }
+               }
 
        public static double[] vectLessWrite(double[] a, double bval, int[] 
aix, int ai, int alen, int len) {
                double init = (bval > 0) ? 1 : 0;
@@ -1789,11 +2237,31 @@ public class LibSpoofPrimitives
        }
        
        //custom vector less equal
-       
+
+       /**
+        * Dense compare-and-accumulate: for every i in [0,len) adds 1.0 to
+        * {@code c[ci+i]} if {@code a[ai+i] <= bval}, and 0.0 otherwise.
+        * Indexes below SPECIES.loopBound(len) are handled with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
        public static void vectLessequalAdd(double[] a, double bval, double[] 
c, int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] += (a[j] <= bval) ? 1 : 0;
-       }
+               final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, 
bval);
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, 
ci + i);
+
+                       //mask lanes where a <= bval and turn the mask into 1.0/0.0
+                       VectorMask<Double> le = 
aVec.compare(VectorOperators.LE, bVec);
+                       DoubleVector inc = zeros.blend(ones, le);
+
+                       cVec.add(inc).intoArray(c, ci + i);
+               }
+
+               //scalar tail: the remaining len - upper elements
+               for (; i < len; i++) {
+                       c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0;
+               }
+               }
        
        public static void vectLessequalAdd(double bval, double[] a, double[] 
c, int ai, int ci, int len) {
                vectGreaterAdd(a, bval, c, ai, ci, len);
@@ -1813,22 +2281,63 @@ public class LibSpoofPrimitives
        }
        
+       /**
+        * Dense vector-scalar comparison: returns a vector c obtained from
+        * allocVector(len, false) with
+        * {@code c[i] = (a[ai+i] <= bval) ? 1.0 : 0.0} for every i in [0,len).
+        * Indexes below SPECIES.loopBound(len) are computed with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
        public static double[] vectLessequalWrite(double[] a, double bval, int 
ai, int len) {
-               double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++)
-                       c[j] = (a[ai] <= bval) ? 1 : 0;
-               return c;
-       }
+        double[] c = allocVector(len, false);
+        final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+        int i = 0;
+        int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+        for (; i < upper; i += vLen) {
+            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+
+            VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+            DoubleVector out = zeros.blend(ones, le);
+
+            out.intoArray(c, i);
+        }
+
+               //scalar tail: the remaining len - upper elements
+        for (; i < len; i++) {
+            c[i] = (a[ai + i] <= bval) ? 1.0 : 0.0;
+        }
+
+        return c;
+    }
        
+       // Scalar-first variant: returns the result of vectGreaterWrite called
+       // with a and bval swapped.
+       // NOTE(review): bval <= a[i] is equivalent to a[i] >= bval, whereas the
+       // callee name suggests the strict a[i] > bval; the two differ for
+       // a[i] == bval - confirm that this delegation is intended.
        public static double[] vectLessequalWrite(double bval, double[] a, int 
ai, int len) {
                return vectGreaterWrite(a, bval, ai, len);
        }
-       
+
+       /**
+        * Dense vector-vector comparison: returns a vector c obtained from
+        * allocVector(len, false) with
+        * {@code c[i] = (a[ai+i] <= b[bi+i]) ? 1.0 : 0.0} for every i in [0,len).
+        * Indexes below SPECIES.loopBound(len) are computed with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
        public static double[] vectLessequalWrite(double[] a, double[] b, int 
ai, int bi, int len) {
                double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++, bi++)
-                       c[j] = (a[ai] <= b[bi]) ? 1 : 0;
+
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+               for (; i < upper; i += vLen) {
+               DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+               DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+
+               VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+               DoubleVector out = zeros.blend(ones, le);
+
+               out.intoArray(c, i);
+               }
+
+               //scalar tail: the remaining len - upper elements
+               for (; i < len; i++) {
+               c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0;
+               }
+
                return c;
-       }
+               }
 
        public static double[] vectLessequalWrite(double[] a, double bval, 
int[] aix, int ai, int alen, int len) {
                double init = (bval >= 0) ? 1 : 0;
@@ -1859,9 +2368,29 @@ public class LibSpoofPrimitives
        //custom vector greater
        
+       /**
+        * Dense compare-and-accumulate: for every i in [0,len) adds 1.0 to
+        * {@code c[ci+i]} if {@code a[ai+i] > bval}, and 0.0 otherwise.
+        * Indexes below SPECIES.loopBound(len) are handled with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
        public static void vectGreaterAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] += (a[j] > bval) ? 1 : 0;
-       }
+               final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, 
bval);
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, 
ci + i);
+
+                       //mask lanes where a > bval and turn the mask into 1.0/0.0
+                       VectorMask<Double> gt = 
aVec.compare(VectorOperators.GT, bVec);
+                       DoubleVector inc = zeros.blend(ones, gt);
+
+                       cVec.add(inc).intoArray(c, ci + i);
+               }
+
+               //scalar tail: the remaining len - upper elements
+               for (; i < len; i++) {
+                       c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0;
+               }
+               }
        
        public static void vectGreaterAdd(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
                vectLessequalAdd(a, bval, c, ai, ci, len);
@@ -1881,11 +2410,30 @@ public class LibSpoofPrimitives
        }
        
+       /**
+        * Dense vector-scalar comparison: returns a vector c obtained from
+        * allocVector(len, false) with
+        * {@code c[i] = (a[ai+i] > bval) ? 1.0 : 0.0} for every i in [0,len).
+        * Indexes below SPECIES.loopBound(len) are computed with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
        public static double[] vectGreaterWrite(double[] a, double bval, int 
ai, int len) {
-               double[] c = allocVector(len, false);
-               for( int j = 0; j < len; j++, ai++)
-                       c[j] = (a[ai] > bval) ? 1 : 0;
-               return c;
-       }
+        double[] c = allocVector(len, false);
+        final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+        int i = 0;
+        int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+        for (; i < upper; i += vLen) {
+            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+
+            VectorMask<Double> gt = aVec.compare(VectorOperators.GT, bVec);
+            DoubleVector out = zeros.blend(ones, gt);
+
+            out.intoArray(c, i);
+        }
+
+               //scalar tail: the remaining len - upper elements
+        for (; i < len; i++) {
+            c[i] = (a[ai + i] > bval) ? 1.0 : 0.0;
+               }
+        return c;
+    }
        
        public static double[] vectGreaterWrite(double bval, double[] a, int 
ai, int len) {
                return vectLessWrite(a, bval, ai, len);
@@ -1898,6 +2446,33 @@ public class LibSpoofPrimitives
                return c;
        }
 
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant of the dense vector-vector greater-than comparison:
+        * returns a vector c obtained from allocVector(len, false) with
+        * {@code c[i] = (a[ai+i] > b[bi+i]) ? 1.0 : 0.0} for every i in [0,len).
+        * Indexes below SPECIES.loopBound(len) are computed in steps of vLen,
+        * the remaining ones by a scalar loop.
+        */
+       public static double[] vectGreaterWrite_vector_api(double[] a, double[] 
b, int ai, int bi, int len) {
+               double[] c = allocVector(len, false);
+               final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+
+               //vector main loop: one vLen-wide block per iteration
+               for (; i < upper; i += vLen) {
+                       DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, 
ai + i);
+                       DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, 
bi + i);
+
+                       VectorMask<Double> gt = 
aVec.compare(VectorOperators.GT, bVec);
+                       DoubleVector out = zeros.blend(ones, gt);
+
+                       out.intoArray(c, i);
+               }
+
+               //scalar tail: the remaining len - upper elements
+               for (; i < len; i++) {
+                       c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0;
+               }
+               return c;
+               }
+
        public static double[] vectGreaterWrite(double[] a, double bval, int[] 
aix, int ai, int alen, int len) {
                double init = (bval < 0) ? 1 : 0;
                double[] c = allocVector(len, true, init);
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index cfdf21255e..9417e5134e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -4019,6 +4019,45 @@ public class LibMatrixMult
                        c[ ci+bix[j+7] ] = a[ ai+bix[j+7] ] * b[ j+7 ];
                }
        }
+       // test
+       /**
+        * Writes {@code 2 * a[ai+i]} (computed as x + x) into {@code c[i]} for
+        * every i in [0,len) and returns c. The output is always written from
+        * index 0 of the caller-provided array c; there is no output offset.
+        * Indexes below SPECIES.loopBound(len) are computed with the Vector API
+        * in steps of vLen, the remaining ones by a scalar loop.
+        */
+       public static double[] vectMult2Write(double[] a,double[] c, int ai, 
int len) {
+       
+               int i = 0;
+               int upper = SPECIES.loopBound(len);
+       
+               //vector main loop: one vLen-wide block per iteration
+               for (; i < upper; i += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ i);
+                       va.add(va).intoArray(c, i);
+               }
+       
+               //scalar tail: the remaining len - upper elements
+               for (; i < len; i++) {
+                       double x = a[ai + i];
+                       c[i] = x + x;
+               }
+       
+               return c;
+       }
+       /**
+        * Variant of the doubling kernel that handles the remainder first:
+        * writes {@code a[ai+j] + a[ai+j]} into {@code c[j]} for every j in
+        * [0,len) and returns c. The first {@code len % vLen} elements are
+        * computed by a scalar loop, so the Vector API loop that follows covers
+        * a multiple of vLen and needs no tail.
+        */
+       public static double[] vectMult2Write_dedicated_2(double[] a, double[] 
c, int ai, int len) {
+               
+               //number of leading elements that do not fill a vLen-wide block
+               final int bn = len % vLen;
+       
+               // scalar prefix so the vector loop is an exact multiple of vLen
+               for (int j = 0; j < bn; j++) {
+                       double x = a[ai + j];
+                       c[j] = x + x;
+               }
+       
+               // vector loop: j runs over multiples of vLen, no tail 
afterwards
+               for (int j = bn; j < len; j += vLen) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai 
+ j);
+                       va.add(va).intoArray(c, j);
+                       // or: va.mul(2.0) via broadcast if you prefer
+               }
+       
+               return c;
+       }
+       
+       
 
        public static void vectMultiply(double[] a, double[] c, int ai, int ci, 
final int len){
 
diff --git 
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
new file mode 100644
index 0000000000..e8ac3f79f3
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+
+/**
+ * Catalog of primitive micro-benchmark cases. Each constant bundles a display
+ * name, the kind of result it produces, and four callbacks operating on a
+ * shared {@code Ctx}: setup (initializes inputs), scalar (runs the reference
+ * implementation from backup_primitives_for_benchmark), vector (runs the
+ * implementation under test) and verify (sets ctx.ok).
+ * Verification uses an absolute tolerance of 1e-9: SCALAR_DOUBLE cases compare
+ * ctx.scalarRes with ctx.vectorRes, ARRAY_DOUBLE cases compare ctx.cScalar
+ * with ctx.cVector via BenchUtil.maxAbsDiff.
+ */
+public enum BenchCase {
+
+       // Aggregations
+
+       VECT_SUM(
+         "vectSum dense",
+         OutKind.SCALAR_DOUBLE,
+         ctx -> ctx.initDenseA(),
+         ctx -> {ctx.scalarRes = 
backup_primitives_for_benchmark.scalarvectSum(ctx.a, 0, ctx.len);
+                         BenchUtil.blackhole = ctx.scalarRes;
+                        },
+         ctx -> {ctx.vectorRes = 
backup_primitives_for_benchmark.vectSum(ctx.a, 0, ctx.len);
+                         BenchUtil.blackhole = ctx.vectorRes;},
+         ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+       ),
+
+
+       // NOTE(review): unlike the other scalar-result cases, this one does not
+       // write its results to BenchUtil.blackhole - confirm whether intended.
+       ROWS_MAXS_VECT_MULT(
+         "rowMaxsVectMult dense",
+         OutKind.SCALAR_DOUBLE,
+         ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+         ctx -> ctx.scalarRes = 
backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, 
ctx.len),
+         ctx -> ctx.vectorRes = 
backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len),
+         ctx -> {
+               ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
+         }
+       ),
+
+       ROWS_MAXS_VECT_MULT_AIX(
+         "rowMaxsVectMult_aix dense",
+         OutKind.SCALAR_DOUBLE,
+         ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();},
+         ctx -> {ctx.scalarRes = 
backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, 
ctx.a_int,0,0,ctx.len);
+               BenchUtil.blackhole = ctx.scalarRes;
+                       },
+         ctx -> {
+               ctx.vectorRes = 
backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, 
ctx.a_int,0,0,ctx.len);
+               BenchUtil.blackhole = ctx.vectorRes;
+                       },
+         ctx -> {
+               ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
+         }
+       ),
+
+       VECT_MAX(
+         "vectMax dense",
+         OutKind.SCALAR_DOUBLE,
+         ctx -> ctx.initDenseA(),
+         ctx -> {ctx.scalarRes = 
backup_primitives_for_benchmark.scalarvectMax(ctx.a, 0, ctx.len);
+                         BenchUtil.blackhole = ctx.scalarRes;
+                        },
+         ctx -> {ctx.vectorRes = 
backup_primitives_for_benchmark.vectMax(ctx.a, 0, ctx.len);
+                         BenchUtil.blackhole = ctx.vectorRes;},
+         ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+       ),
+       VECT_COUNTNNZ(
+         "vectCountnnz dense",
+         OutKind.SCALAR_DOUBLE,
+         ctx -> ctx.initDenseA(),
+         ctx -> {ctx.scalarRes = 
backup_primitives_for_benchmark.scalarvectCountnnz(ctx.a, 0, ctx.len);
+                         BenchUtil.blackhole = ctx.scalarRes;
+                        },
+         ctx -> {ctx.vectorRes = 
backup_primitives_for_benchmark.vectCountnnz(ctx.a, 0, ctx.len);
+                         BenchUtil.blackhole = ctx.vectorRes;},
+         ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+       ),
+
+       // Divisions
+
+       VECT_DIV_ADD(
+         "vectDivAdd dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); 
ctx.initDenseADiv();},
+         ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, 
ctx.bval, ctx.cScalar, 0, 0, ctx.len),
+         ctx -> backup_primitives_for_benchmark.vectDivAdd(ctx.a, ctx.bval, 
ctx.cVector, 0, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+       VECT_DIV_ADD_2(
+         "vectDivAdd2 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, 
ctx.a, ctx.cScalar, 0, 0, ctx.len),
+         ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0, 
0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+       VECT_DIV_ADD_SPARSE(
+         "vectDivAdd sparse",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); 
ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, 
ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+         ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 
ctx.a_int, 0, 0,ctx.len, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+
+       VECT_DIV_ADD_SPARSE2(
+         "vectDivAdd2 sparse",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); 
ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, 
ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+         ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 
ctx.a_int, 0, 0,ctx.len, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+       VECT_DIV_WRITE(
+         "vectDivWrite dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval, 
0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_DIV_WRITE2(
+         "vectDivWrite2 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a, 
0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ), 
+       VECT_DIV_WRITE3(
+         "vectDivWrite3 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); 
ctx.initDenseBDiv();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0, 
0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+       // Comparisons
+
+       VECT_EQUAL_WRITE(
+         "vectEqualWrite dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 
0,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, 
ctx.bval, 0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_EQUAL_ADD(
+         "vectEqualAdd dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectEqualAdd(ctx.a, 
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+         ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector, 
0, 0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       // NOTE(review): the scalar and vector calls below are identical to those
+       // of VECT_EQUAL_WRITE (array vs. ctx.bval); the other "...2" write cases
+       // compare ctx.a with ctx.b, so the array-array overload may have been
+       // intended here - confirm.
+       VECT_EQUAL_WRITE2(
+         "vectEqualWrite2 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA(); ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 
0,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, 
ctx.bval, 0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_LESS_ADD(
+         "vectLessAdd dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectLessAdd(ctx.a, 
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+         ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0, 
0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_LESS_WRITE(
+         "vectLessWrite dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA();  ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.bval, 0 
,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, 
ctx.bval, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_LESS_WRITE2(
+         "vectLessWrite2 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 
,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b, 
0, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_LESSEQUAL_ADD(
+         "vectLessequalAdd dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectLessequalAdd(ctx.a, 
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+         ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a, 
ctx.bval,ctx.cVector, 0, 0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_LESSEQUAL_WRITE(
+         "vectLessequalWrite dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA();  ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 
,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, 
ctx.bval, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_LESSEQUAL_WRITE2(
+         "vectLessequalWrite2 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 
,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, 
ctx.b, 0, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+       VECT_GREATER_ADD(
+         "vectGreaterAdd dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+         ctx -> backup_primitives_for_benchmark.scalarvectGreaterAdd(ctx.a, 
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+         ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector, 
0, 0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       VECT_GREATER_WRITE(
+         "vectGreaterWrite dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA();  ctx.initbval();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 
,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, 
ctx.bval, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+       // NOTE(review): LibSpoofPrimitives keeps its Vector API array-array
+       // variant under the name vectGreaterWrite_vector_api ("not in use"), so
+       // the "vector" side here may be a scalar implementation - confirm.
+       VECT_GREATER_WRITE2(
+         "vectGreaterWrite2 dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+         ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 
,ctx.len),
+         ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, 
ctx.b, 0, 0, ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       ),
+
+       // vectMult2
+
+       VECT_Mult2_ADD(
+         "vectMult2Add dense",
+         OutKind.ARRAY_DOUBLE,
+         ctx -> {ctx.initDenseAandC_mutable(); },
+         ctx -> backup_primitives_for_benchmark.scalarvectMult2Add(ctx.a, 
ctx.cScalar,0, 0,ctx.len),
+         ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0, 
0,ctx.len),
+         ctx -> {
+               ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+               ctx.ok = ctx.maxDiff <= 1e-9;
+         }
+       );
+
+       // Kind of result a case produces: a single double or a double array.
+       public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE }
+       // Display label of the case (distinct from the constant's Enum name()).
+       public final String name;
+       // Callback that initializes the inputs of the case on the context.
+       public final java.util.function.Consumer<Ctx> setup;
+       // Callback that runs the scalar reference implementation.
+       public final java.util.function.Consumer<Ctx> scalar;
+       // Callback that runs the implementation being compared against it.
+       public final java.util.function.Consumer<Ctx> vector;
+       // Callback that compares both results and sets ctx.ok.
+       public final java.util.function.Consumer<Ctx> verify;
+       public final OutKind outKind;
+
+  
+       // Stores the label, the result kind and the four callbacks unchanged.
+       BenchCase(String name,
+                         OutKind outKind,
+                         java.util.function.Consumer<Ctx> setup,
+                         java.util.function.Consumer<Ctx> scalar,
+                         java.util.function.Consumer<Ctx> vector,
+                         java.util.function.Consumer<Ctx> verify) {
+         this.name = name; this.outKind = outKind; this.setup = setup; 
this.scalar = scalar; this.vector = vector; this.verify = verify;
+       }
+}
+
diff --git 
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
new file mode 100644
index 0000000000..bbf0f5031f
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+
+public class BenchUtil {
+       // NOTE(review): presumably a sink that keeps benchmarked results alive;
+       // nothing in this class reads or writes it.
+       public static volatile double blackhole;
+
+       /**
+        * Executes the task repeatedly without timing it.
+        *
+        * @param r     task to run
+        * @param iters number of executions
+        */
+       public static void warmup(Runnable r, int iters) {
+               for (int i = 0; i < iters; i++)
+                       r.run();
+       }
+
+       /**
+        * Times repeated executions of the task. A garbage collection is
+        * requested before the clock starts.
+        *
+        * @param r     task to run
+        * @param iters number of timed executions
+        * @return elapsed nanoseconds divided by iters
+        */
+       public static double measure(Runnable r, int iters) {
+               System.gc();
+               long t0 = System.nanoTime();
+               for (int i = 0; i < iters; i++)
+                       r.run();
+               long t1 = System.nanoTime();
+               return (t1 - t0) / (double) iters;
+       }
+
+       // ---- args helpers ----
+
+       /**
+        * Looks up an integer option of the form {@code key value}.
+        *
+        * @param args command line arguments
+        * @param key  option name to search for
+        * @param def  value returned if the key is absent or has no successor
+        * @return the parsed value following the first occurrence of key, else def
+        * @throws NumberFormatException if the value is not an integer
+        */
+       public static int argInt(String[] args, String key, int def) {
+               for (int i = 0; i < args.length - 1; i++)
+                       if (args[i].equals(key))
+                               return Integer.parseInt(args[i + 1]);
+               return def;
+       }
+
+       /**
+        * Looks up a string option of the form {@code key value}.
+        *
+        * @param args command line arguments
+        * @param key  option name to search for
+        * @param def  value returned if the key is absent or has no successor
+        * @return the argument following the first occurrence of key, else def
+        */
+       public static String argStr(String[] args, String key, String def) {
+               for (int i = 0; i < args.length - 1; i++)
+                       if (args[i].equals(key))
+                               return args[i + 1];
+               return def;
+       }
+
+       /**
+        * Computes the largest element-wise absolute difference of two arrays.
+        *
+        * @param a first array
+        * @param b second array
+        * @return max |a[i]-b[i]|, 0 for two empty arrays, or positive infinity
+        *         if the arrays differ in length
+        */
+       public static double maxAbsDiff(double[] a, double[] b) {
+               // arrays of different length can never be equal; report the largest
+               // possible difference instead of ignoring the surplus elements
+               if (a.length != b.length)
+                       return Double.POSITIVE_INFINITY;
+               double m = 0;
+               for (int i = 0; i < a.length; i++)
+                       m = Math.max(m, Math.abs(a[i] - b[i]));
+               return m;
+       }
+
+       /**
+        * Prints one result line for a case that produces a single double,
+        * including both timings, their ratio and both results.
+        */
+       public static void printScalarDouble(String name,
+               double nsScalar, double nsVector,
+               double scalarRes, double vectorRes,
+               boolean ok) {
+
+               double speedup = nsScalar / nsVector;
+               System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " +
+                                               "s=%.6g v=%.6g | %s%n",
+               name, nsScalar, nsVector, speedup, scalarRes, vectorRes, ok ? "OK" : "FAIL");
+       }
+
+       /**
+        * Prints one result line for a case that produces an array, including
+        * both timings, their ratio and the maximum absolute difference.
+        */
+       public static void printArrayDiff(String name,
+               double nsScalar, double nsVector,
+               double maxDiff,
+               boolean ok) {
+
+               double speedup = nsScalar / nsVector;
+               System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " +
+                                               "maxDiff=%.6g | %s%n",
+               name, nsScalar, nsVector, speedup, maxDiff, ok ? "OK" : "FAIL");
+       }
+}
+  
\ No newline at end of file
diff --git 
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
new file mode 100644
index 0000000000..4fc1a15de0
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+public class Ctx {
+       public int len;
+       public double[] a, cInit,b,c, cScalar, cVector;
+       public double bval;
+
+       public double scalarRes, vectorRes;
+       public double maxDiff;
+       public boolean ok;
+       public int[] a_int;
+
+       void initDenseA() {
+               a = new double[len];
+               for (int i = 0; i < len; i++) a[i] = (i % 10) - 5;
+       }
+       void initDenseB() {
+               b = new double[len];
+               for (int i = 0; i < len; i++) b[i] = (i % 10) - 5;
+       }
+       void initDenseC() {
+               c = new double[len];
+               for (int i = 0; i < len; i++) c[i] = (i % 10) - 5;
+       }
+       void initDenseAInt() {
+               a_int = new int[len];
+               for (int i = 0; i < len; i++) a_int[i] = i;;
+       }
+       
+       void initbval(){
+               bval = 1.234567;
+       }
+       void initDenseADiv() {
+               a = new double[len];
+               for (int i = 0; i < len; i++) {
+                       a[i] = ((i % 10) + 1);  // Range: 1 to 10 (no zeros)
+               }
+       }
+       void initDenseBDiv() {
+               b = new double[len];
+               for (int i = 0; i < len; i++) b[i] = ((i % 10) + 1);
+       }
+
+       void initDenseAandC_mutable() {
+               initDenseADiv();
+               cInit = new double[len];
+               for (int i = 0; i < len; i++) cInit[i] = (i % 10) - 5;
+               cScalar = java.util.Arrays.copyOf(cInit, len);
+               cVector = java.util.Arrays.copyOf(cInit, len);
+       }
+
+       void resetC() {
+               if (cInit != null) {
+                       System.arraycopy(cInit, 0, cScalar, 0, len);
+                       System.arraycopy(cInit, 0, cVector, 0, len);
+               }
+       }
+}
diff --git 
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
new file mode 100644
index 0000000000..340b50ba8f
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+
+public class PrimitivePerfSuite {
+       /**
+        * Runs all benchmark cases whose name contains the optional filter. Each
+        * case is set up once, both variants are warmed up and timed, both are
+        * executed once more for verification, and one result line is printed.
+        *
+        * @param args optional --len, --warmup, --iters and --filter options
+        */
+       public static void main(String[] args) {
+               final int len = BenchUtil.argInt(args, "--len", 1_000_000);
+               final int warmup = BenchUtil.argInt(args, "--warmup", 10_000);
+               final int iters = BenchUtil.argInt(args, "--iters", 100);
+               final String filter = BenchUtil.argStr(args, "--filter", "");
+
+               for (BenchCase bc : BenchCase.values()) {
+                       boolean selected = filter.isEmpty() || bc.name.contains(filter);
+                       if (!selected)
+                               continue;
+
+                       final Ctx ctx = new Ctx();
+                       ctx.len = len;
+                       bc.setup.accept(ctx);
+
+                       final Runnable scalarRun = () -> bc.scalar.accept(ctx);
+                       final Runnable vectorRun = () -> bc.vector.accept(ctx);
+
+                       final double nsScalar = warmAndMeasure(ctx, scalarRun, warmup, iters);
+                       final double nsVector = warmAndMeasure(ctx, vectorRun, warmup, iters);
+
+                       // verify once on freshly reset outputs
+                       ctx.resetC();
+                       scalarRun.run();
+                       vectorRun.run();
+                       bc.verify.accept(ctx);
+
+                       if (bc.outKind != BenchCase.OutKind.SCALAR_DOUBLE) {
+                               BenchUtil.printArrayDiff(bc.name, nsScalar, nsVector, ctx.maxDiff, ctx.ok);
+                       } else {
+                               BenchUtil.printScalarDouble(bc.name, nsScalar, nsVector, ctx.scalarRes, ctx.vectorRes, ctx.ok);
+                       }
+               }
+       }
+
+       // resets the outputs, warms the task up, resets again and times it
+       private static double warmAndMeasure(Ctx ctx, Runnable task, int warmup, int iters) {
+               ctx.resetC();
+               BenchUtil.warmup(task, warmup);
+               ctx.resetC();
+               return BenchUtil.measure(task, iters);
+       }
+}
diff --git 
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
new file mode 100644
index 0000000000..17cb093f97
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
@@ -0,0 +1,862 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+
+
+
+import java.util.Arrays;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorMask;
+
+
+public class backup_primitives_for_benchmark {
+
+       // Vector API initializations
+       private static final VectorSpecies<Double> SPECIES = 
DoubleVector.SPECIES_PREFERRED;
+       private static final int vLen = SPECIES.length();
+
+       /**
+        * Obtains a vector of the given length; if reset is requested, the
+        * vector is filled with zeros.
+        */
+       public static double[] allocVector(int len, boolean reset) {
+               final double zero = 0;
+               return allocVector(len, reset, zero);
+       }
+       
+       /**
+        * Obtains a vector of the given length from the thread-local ring buffer,
+        * or allocates a new one if the buffer has no vector of that length.
+        *
+        * @param len      required vector length
+        * @param reset    if true, every entry is overwritten with resetVal
+        * @param resetVal fill value used on reset
+        * @return vector of length len; contents are unspecified without reset
+        */
+       protected static double[] allocVector(int len, boolean reset, double resetVal) {
+               //try the ring buffer of the current thread first
+               double[] out = memPool.get().next(len);
+
+               //fall back to a fresh allocation
+               if( out == null )
+                       out = new double[len];
+
+               if( reset )
+                       Arrays.fill(out, resetVal);
+               return out;
+       }
+       /**
+        * Ring buffer of pre-allocated double vectors of up to two lengths.
+        */
+       private static class VectorBuffer {
+               private static final int MAX_SIZE = 512*1024; //4MB
+               private final double[][] _data;
+               private int _pos;
+               private int _len1;
+               private int _len2;
+
+               /**
+                * @param num  number of vectors allocated per length
+                * @param len1 first vector length (capped at MAX_SIZE)
+                * @param len2 second vector length (capped at MAX_SIZE); vectors of
+                *             this length are only allocated if it is positive and
+                *             differs from len1
+                */
+               public VectorBuffer(int num, int len1, int len2) {
+                       //best effort size restriction since large intermediates
+                       //not necessarily used (num refers to the total number)
+                       len1 = Math.min(len1, MAX_SIZE);
+                       len2 = Math.min(len2, MAX_SIZE);
+                       //pre-allocate ring buffer
+                       int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+                       _data = new double[lnum][];
+                       for( int i=0; i<num; i++ ) {
+                               if( lnum > num ) {
+                                       _data[2*i] = new double[len1];
+                                       _data[2*i+1] = new double[len2];
+                               }
+                               else {
+                                       _data[i] = new double[len1];
+                               }
+                       }
+                       _pos = -1;
+                       _len1 = len1;
+                       _len2 = len2;
+               }
+
+               /**
+                * Advances the ring position to the next vector of exactly the
+                * given length.
+                *
+                * @param len required vector length
+                * @return the matching vector, or null if the buffer holds none
+                */
+               public double[] next(int len) {
+                       if( _len1!=len && _len2!=len )
+                               return null;
+                       //scan at most one full cycle: an empty buffer, or a length
+                       //for which no vectors were allocated (e.g., len2=0), yields
+                       //null instead of an index error or an endless loop
+                       for( int k=0; k<_data.length; k++ ) {
+                               _pos = (_pos+1>=_data.length) ? 0 : _pos+1;
+                               if( _data[_pos].length==len )
+                                       return _data[_pos];
+                       }
+                       return null;
+               }
+
+               /**
+                * Indicates if this buffer was created with the given lengths and
+                * the resulting number of vectors.
+                */
+               @SuppressWarnings("unused")
+               public boolean isReusable(int num, int len1, int len2) {
+                       int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+                       return (_len1 == len1 && _len2 == len2
+                               && _data.length == lnum);
+               }
+       }
+       private static ThreadLocal<VectorBuffer> memPool = new ThreadLocal<>() {
+               @Override protected VectorBuffer initialValue() { return new 
VectorBuffer(0,0,0); }
+       };
+
+       /**
+        * Scalar loop: c[ci+k] += a[ai+k] / bval for k in [0,len).
+        */
+       public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               for( int k = 0; k < len; k++ )
+                       c[ci + k] += a[ai + k] / bval;
+       }
+
+       /**
+        * Vector API variant of c[ci+k] += a[ai+k] / bval for k in [0,len). The
+        * division is carried out as a multiplication with the reciprocal 1/bval.
+        */
+       public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final double recip = 1.0 / bval;
+               final DoubleVector vrecip = DoubleVector.broadcast(SPECIES, recip);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + k);
+                       vc.add(va.mul(vrecip)).intoArray(c, ci + k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       c[ci + k] += a[ai + k] * recip;
+                       k++;
+               }
+       }
+
+       /**
+        * Scalar loop: c[ci+k] += bval / a[ai+k] for k in [0,len).
+        */
+       public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
+               for( int k = 0; k < len; k++ )
+                       c[ci + k] += bval / a[ai + k];
+       }
+
+       /**
+        * Vector API variant of c[ci+k] += bval / a[ai+k] for k in [0,len).
+        */
+       public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
+               final int vecEnd = SPECIES.loopBound(len);
+               final DoubleVector vnum = DoubleVector.broadcast(SPECIES, bval);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + k);
+                       vc.add(vnum.div(va)).intoArray(c, ci + k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       c[ci + k] += bval / a[ai + k];
+                       k++;
+               }
+       }
+
+       /**
+        * Scalar scatter loop over indexed values: c[ci+aix[j]] += a[j] / bval
+        * for j in [ai,ai+alen). The parameter len is not used.
+        */
+       public static void scalarvectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+               for( int k = 0; k < alen; k++ ) {
+                       final int src = ai + k;
+                       c[ci + aix[src]] += a[src] / bval;
+               }
+       }
+
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant of the scatter update c[ci+aix[j]] += a[j] / bval for
+        * j in [ai,ai+alen). Quotients are computed per block as a multiplication
+        * with 1/bval and then added to c one lane at a time. The parameter len
+        * is not used.
+        */
+       public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+               final double recip = 1.0 / bval;
+               final int vecEnd = SPECIES.loopBound(alen);
+               final DoubleVector vrecip = DoubleVector.broadcast(SPECIES, recip);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       final int base = ai + k;
+                       DoubleVector scaled = DoubleVector.fromArray(SPECIES, a, base).mul(vrecip);
+
+                       //scatter-add, one lane after the other
+                       for( int l = 0; l < vLen; l++ ) {
+                               int target = ci + aix[base + l];
+                               c[target] += scaled.lane(l);
+                       }
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < alen ) {
+                       c[ci + aix[ai + k]] += a[ai + k] * recip;
+                       k++;
+               }
+       }
+
+       /**
+        * Scalar scatter loop over indexed values: c[ci+aix[j]] += bval / a[j]
+        * for j in [ai,ai+alen). The parameter len is not used.
+        */
+       public static void scalarvectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+               for( int k = 0; k < alen; k++ ) {
+                       final int src = ai + k;
+                       c[ci + aix[src]] += bval / a[src];
+               }
+       }
+
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant of the scatter update c[ci+aix[j]] += bval / a[j] for
+        * j in [ai,ai+alen). Quotients are computed per block and then added to
+        * c one lane at a time. The parameter len is not used.
+        */
+       public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+               final int vecEnd = SPECIES.loopBound(alen);
+               final DoubleVector vnum = DoubleVector.broadcast(SPECIES, bval);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       final int base = ai + k;
+                       DoubleVector quot = vnum.div(DoubleVector.fromArray(SPECIES, a, base));
+
+                       //scatter-add, one lane after the other
+                       for( int l = 0; l < vLen; l++ ) {
+                               int target = ci + aix[base + l];
+                               c[target] += quot.lane(l);
+                       }
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < alen ) {
+                       c[ci + aix[ai + k]] += bval / a[ai + k];
+                       k++;
+               }
+       }
+       /**
+        * Scalar loop writing a[ai+k] / bval for k in [0,len) into a vector
+        * obtained from allocVector.
+        */
+       public static double[] scalarvectDivWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               int k = 0;
+               while( k < len ) {
+                       out[k] = a[ai + k] / bval;
+                       k++;
+               }
+               return out;
+       }
+
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant writing a[ai+k] / bval for k in [0,len) into a vector
+        * obtained from allocVector; the division is carried out as a
+        * multiplication with the reciprocal 1/bval.
+        */
+       public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               final double recip = 1.0 / bval;
+               final DoubleVector vrecip = DoubleVector.broadcast(SPECIES, recip);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector.fromArray(SPECIES, a, ai + k)
+                               .mul(vrecip)
+                               .intoArray(out, k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       out[k] = a[ai + k] * recip;
+                       k++;
+               }
+               return out;
+       }
+       /**
+        * Scalar loop writing bval / a[ai+k] for k in [0,len) into a vector
+        * obtained from allocVector.
+        */
+       public static double[] scalarvectDivWrite(double bval, double[] a, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               int k = 0;
+               while( k < len ) {
+                       out[k] = bval / a[ai + k];
+                       k++;
+               }
+               return out;
+       }
+
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant writing bval / a[ai+k] for k in [0,len) into a vector
+        * obtained from allocVector.
+        */
+       public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector vnum = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       vnum.div(va).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       out[k] = bval / a[ai + k];
+                       k++;
+               }
+               return out;
+       }
+       /**
+        * Scalar loop writing a[ai+k] / b[bi+k] for k in [0,len) into a vector
+        * obtained from allocVector.
+        */
+       public static double[] scalarvectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               int k = 0;
+               while( k < len ) {
+                       out[k] = a[ai + k] / b[bi + k];
+                       k++;
+               }
+               return out;
+       }
+
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant writing a[ai+k] / b[bi+k] for k in [0,len) into a
+        * vector obtained from allocVector.
+        */
+       public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector num = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector den = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       num.div(den).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       out[k] = a[ai + k] / b[bi + k];
+                       k++;
+               }
+               return out;
+       }
+       /**
+        * Scalar loop computing the maximum of a[ai+k] * b[bi+k] for k in [0,len).
+        *
+        * @param a   left input
+        * @param b   right input
+        * @param ai  start offset into a
+        * @param bi  start offset into b
+        * @param len number of element pairs
+        * @return maximum product, or negative infinity if len is not positive
+        */
+       public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+               double val = Double.NEGATIVE_INFINITY;
+               //honor the offset into b (previously b was always read from
+               //index 0), in line with the vectorized rowMaxsVectMult
+               for( int k = 0; k < len; k++ )
+                       val = Math.max(a[ai + k] * b[bi + k], val);
+               return val;
+       }
+
+       /**
+        * Vector API variant computing the maximum of a[ai+k] * b[bi+k] for k in
+        * [0,len); returns negative infinity if there are no elements.
+        */
+       public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+               final int vecEnd = SPECIES.loopBound(len);
+               DoubleVector best = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+               int k = 0;
+
+               //full vLen-wide blocks: lane-wise running maximum of the products
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       best = best.max(va.mul(vb));
+                       k += vLen;
+               }
+
+               double result = best.reduceLanes(VectorOperators.MAX);
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       result = Math.max(result, a[ai + k] * b[bi + k]);
+                       k++;
+               }
+               return result;
+       }
+       // note: parameter bi unused
+       /**
+        * Scalar loop computing the maximum of a[i] * b[aix[i]] for i in
+        * [ai,ai+len); returns negative infinity if there are no elements.
+        */
+       public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+               double best = Double.NEGATIVE_INFINITY;
+               for( int k = 0; k < len; k++ ) {
+                       final int src = ai + k;
+                       best = Math.max(a[src] * b[aix[src]], best);
+               }
+               return best;
+       }
+
+       // not in use: vector api implementation slower than scalar loop version
+       /**
+        * Vector API variant computing the maximum of a[i] * b[aix[i]] for i in
+        * [ai,ai+len); values of b are gathered through the index array. The
+        * parameter bi is not used.
+        */
+       public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+               final int vecEnd = SPECIES.loopBound(len);
+               final int step = SPECIES.length();
+               DoubleVector best = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+               int k = 0;
+
+               //full blocks: contiguous load of a, indexed gather from b
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + k);
+                       best = best.max(va.mul(vb));
+                       k += step;
+               }
+
+               double result = Double.NEGATIVE_INFINITY;
+               result = Math.max(result, best.reduceLanes(VectorOperators.MAX));
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       double prod = a[ai + k] * b[aix[ai + k]];
+                       if( prod > result )
+                               result = prod;
+                       k++;
+               }
+               return result;
+       }
+       
+
+       /**
+        * Scalar sum of a[ai..ai+len): the first len%8 values are added one by
+        * one, the remainder in blocks of eight values.
+        */
+       public static double scalarvectSum(double[] a, int ai, int len) {
+               final int blockStart = ai + len % 8;
+               final int end = ai + len;
+               double acc = 0;
+
+               //leading rest so that the main loop covers whole 8-blocks
+               for( int r = ai; r < blockStart; r++ )
+                       acc += a[r];
+
+               //eight values per step, summed left to right before accumulation
+               int k = blockStart;
+               while( k < end ) {
+                       double blockSum = a[k] + a[k+1] + a[k+2] + a[k+3]
+                               + a[k+4] + a[k+5] + a[k+6] + a[k+7];
+                       acc += blockSum;
+                       k += 8;
+               }
+               return acc;
+       }
+       
+       /**
+        * Vector API sum of a[ai..ai+len): lane-wise partial sums over full
+        * blocks, reduced once, followed by the remaining elements.
+        */
+       public static double vectSum(double[] a, int ai, int len) {
+               final int vecEnd = SPECIES.loopBound(len);
+               final int step = SPECIES.length();
+               DoubleVector partial = DoubleVector.zero(SPECIES);
+               int k = 0;
+
+               //full blocks
+               while( k < vecEnd ) {
+                       partial = partial.add(DoubleVector.fromArray(SPECIES, a, ai + k));
+                       k += step;
+               }
+
+               double total = 0d;
+               total += partial.reduceLanes(VectorOperators.ADD);
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       total += a[ai + k];
+                       k++;
+               }
+               return total;
+       }
+       /**
+        * Scalar maximum of a[ai..ai+len); negative infinity if len is not
+        * positive.
+        */
+       public static double scalarvectMax(double[] a, int ai, int len) {
+               double best = Double.NEGATIVE_INFINITY;
+               for( int k = 0; k < len; k++ )
+                       best = Math.max(a[ai + k], best);
+               return best;
+       }
+
+       /**
+        * Vector API maximum of a[ai..ai+len); negative infinity if there are
+        * no elements.
+        */
+       public static double vectMax(double[] a, int ai, int len) {
+               final int vecEnd = SPECIES.loopBound(len);
+               DoubleVector best = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+               int k = 0;
+
+               //full vLen-wide blocks: lane-wise running maximum
+               while( k < vecEnd ) {
+                       best = best.max(DoubleVector.fromArray(SPECIES, a, ai + k));
+                       k += vLen;
+               }
+               double result = best.reduceLanes(VectorOperators.MAX);
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       result = Math.max(a[ai + k], result);
+                       k++;
+               }
+               return result;
+       }
+       /**
+        * Scalar count of the entries in a[ai..ai+len) that differ from zero,
+        * returned as a double.
+        */
+       public static double scalarvectCountnnz(double[] a, int ai, int len) {
+               int nnz = 0;
+               for( int k = 0; k < len; k++ ) {
+                       if( a[ai + k] != 0 )
+                               nnz++;
+               }
+               return nnz;
+       }
+       /**
+        * Vector API count of the entries in a[ai..ai+len) that differ from zero.
+        *
+        * @param a   input values
+        * @param ai  start offset into a
+        * @param len number of values to inspect
+        * @return number of non-zero entries, as a double
+        */
+       public static double vectCountnnz(double[] a, int ai, int len) {
+               int count = 0;
+               int i = 0;
+               int upperBound = SPECIES.loopBound(len);
+               DoubleVector vzero = DoubleVector.zero(SPECIES);
+
+               //full vLen-wide blocks: count lanes that compare unequal to zero
+               for (; i < upperBound; i += vLen) {
+                       DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+                       VectorMask<Double> nz = v.compare(VectorOperators.NE, vzero);
+                       count += nz.trueCount();
+               }
+
+               //rest, not aligned to vLen-blocks; i is relative to ai, so the
+               //offset must be applied here as well (it was missing before)
+               for(;i<len;i++){
+                       count += (a[ai + i] != 0) ? 1 : 0;
+               }
+               return count;
+       }
+       /**
+        * Scalar loop: c[ci+k] += 1 if a[ai+k] equals bval, else += 0, for k in
+        * [0,len).
+        */
+       public static void scalarvectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               for( int k = 0; k < len; k++ ) {
+                       final int hit = (a[ai + k] == bval) ? 1 : 0;
+                       c[ci + k] += hit;
+               }
+       }
+       /**
+        * Vector API variant of c[ci+k] += (a[ai+k] == bval) ? 1 : 0 for k in
+        * [0,len).
+        */
+       public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final int vecEnd = SPECIES.loopBound(len);
+               final DoubleVector vref = DoubleVector.broadcast(SPECIES, bval);
+               final DoubleVector vone = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector vzero = DoubleVector.zero(SPECIES);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + k);
+
+                       //1.0 in matching lanes, 0.0 elsewhere
+                       VectorMask<Double> hit = va.compare(VectorOperators.EQ, vref);
+                       DoubleVector delta = vzero.blend(vone, hit);
+
+                       vc.add(delta).intoArray(c, ci + k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       c[ci + k] += (a[ai + k] == bval) ? 1.0 : 0.0;
+                       k++;
+               }
+       }
+       /**
+        * Scalar loop writing 1 where a[ai+k] equals bval and 0 otherwise, for k
+        * in [0,len), into a vector obtained from allocVector.
+        */
+       public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ ) {
+                       if( a[ai + k] == bval )
+                               out[k] = 1;
+                       else
+                               out[k] = 0;
+               }
+               return out;
+       }
+       /**
+        * Vector API variant writing 1 where a[ai+k] equals bval and 0 otherwise,
+        * for k in [0,len), into a vector obtained from allocVector.
+        */
+       public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               final int vecEnd = SPECIES.loopBound(len);
+               final DoubleVector vref = DoubleVector.broadcast(SPECIES, bval);
+               final DoubleVector vzero = DoubleVector.zero(SPECIES);
+               final DoubleVector vone = DoubleVector.broadcast(SPECIES, 1.0);
+               int k = 0;
+
+               //full vLen-wide blocks
+               while( k < vecEnd ) {
+                       DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       VectorMask<Double> hit = va.compare(VectorOperators.EQ, vref);
+                       vzero.blend(vone, hit).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //remaining elements that do not fill a whole block
+               while( k < len ) {
+                       out[k] = (a[ai + k] == bval) ? 1 : 0;
+                       k++;
+               }
+               return out;
+       }
+       /**
+        * Scalar loop variant of the element-wise equality indicator: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1 if a[ai+k] == b[bi+k], else 0.
+        */
+       public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] == b[bi + k]) ? 1 : 0;
+               return out;
+       }
+                               
+       /**
+        * Vector API variant of the element-wise equality indicator: returns the array obtained from
+        * allocVector(len, false) with entry k in [0,len) set to 1.0 if a[ai+k] == b[bi+k], else 0.0.
+        */
+       public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               for( ; k < vecEnd; k += vLen ) {
+                       DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, lhs.compare(VectorOperators.EQ, rhs)).intoArray(out, k);
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               while( k < len ) {
+                       out[k] = (a[ai + k] == b[bi + k]) ? 1.0 : 0.0;
+                       k++;
+               }
+               return out;
+       }
+       /**
+        * Element-wise inequality indicator, implemented as a plain scalar loop: returns the array
+        * obtained from allocVector(len, false) with entry k in [0,len) set to 1 if
+        * a[ai+k] != b[bi+k], else 0.
+        */
+       public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] != b[bi + k]) ? 1 : 0;
+               return out;
+       }
+       
+       /**
+        * Vector API variant of the element-wise inequality indicator: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1.0 if a[ai+k] != b[bi+k], else 0.0.
+        * Not in use: vector api implementation slower than scalar loop version.
+        */
+       public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, lhs.compare(VectorOperators.NE, rhs)).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] != b[bi + k]) ? 1.0 : 0.0;
+               return out;
+       }
+
+
+       /**
+        * Scalar loop variant of the less-than indicator accumulation: for every k in [0,len),
+        * c[ci+k] is incremented by 1 if a[ai+k] &lt; bval and by 0 otherwise.
+        */
+       public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final int end = ai + len;
+               for( int src = ai, dst = ci; src < end; src++, dst++ )
+                       c[dst] += (a[src] < bval) ? 1 : 0;
+       }
+       /**
+        * Vector API variant of the less-than indicator accumulation: for every k in [0,len),
+        * c[ci+k] += (a[ai+k] &lt; bval) ? 1.0 : 0.0.
+        */
+       public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector rhs = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + k).compare(VectorOperators.LT, rhs);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       DoubleVector delta = zero.blend(one, hit);
+                       DoubleVector.fromArray(SPECIES, c, ci + k).add(delta).intoArray(c, ci + k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       c[ci + k] += (a[ai + k] < bval) ? 1.0 : 0.0;
+       }
+
+
+       /**
+        * Scalar loop variant of the less-than indicator against a scalar: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1 if a[ai+k] &lt; bval, else 0.
+        */
+       public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] < bval) ? 1 : 0;
+               return out;
+       }
+       
+       /**
+        * Vector API variant of the less-than indicator against a scalar: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1.0 if a[ai+k] &lt; bval, else 0.0.
+        */
+       public static double[] vectLessWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector rhs = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + k).compare(VectorOperators.LT, rhs);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, hit).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] < bval) ? 1.0 : 0.0;
+               return out;
+       }
+
+       /**
+        * Scalar loop variant of the element-wise less-than indicator: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1 if a[ai+k] &lt; b[bi+k], else 0.
+        */
+       public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] < b[bi + k]) ? 1 : 0;
+               return out;
+       }
+
+       /**
+        * Vector API variant of the element-wise less-than indicator: returns the array obtained from
+        * allocVector(len, false) with entry k in [0,len) set to 1.0 if a[ai+k] &lt; b[bi+k], else 0.0.
+        */
+       public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, lhs.compare(VectorOperators.LT, rhs)).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] < b[bi + k]) ? 1.0 : 0.0;
+               return out;
+       }
+       /**
+        * Scalar loop variant of the less-or-equal indicator accumulation: for every k in [0,len),
+        * c[ci+k] is incremented by 1 if a[ai+k] &lt;= bval and by 0 otherwise.
+        */
+       public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final int end = ai + len;
+               for( int src = ai, dst = ci; src < end; src++, dst++ )
+                       c[dst] += (a[src] <= bval) ? 1 : 0;
+       }
+
+       /**
+        * Vector API variant of the less-or-equal indicator accumulation: for every k in [0,len),
+        * c[ci+k] += (a[ai+k] &lt;= bval) ? 1.0 : 0.0.
+        */
+       public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector rhs = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + k).compare(VectorOperators.LE, rhs);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       DoubleVector delta = zero.blend(one, hit);
+                       DoubleVector.fromArray(SPECIES, c, ci + k).add(delta).intoArray(c, ci + k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       c[ci + k] += (a[ai + k] <= bval) ? 1.0 : 0.0;
+       }
+       /**
+        * Scalar loop variant of the less-or-equal indicator against a scalar: returns the array
+        * obtained from allocVector(len, false) with entry k in [0,len) set to 1 if
+        * a[ai+k] &lt;= bval, else 0.
+        */
+       public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] <= bval) ? 1 : 0;
+               return out;
+       }
+       /**
+        * Vector API variant of the less-or-equal indicator against a scalar: returns the array
+        * obtained from allocVector(len, false) with entry k in [0,len) set to 1.0 if
+        * a[ai+k] &lt;= bval, else 0.0.
+        */
+       public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector rhs = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + k).compare(VectorOperators.LE, rhs);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, hit).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] <= bval) ? 1.0 : 0.0;
+               return out;
+       }
+       /**
+        * Scalar loop variant of the element-wise less-or-equal indicator: returns the array
+        * obtained from allocVector(len, false) with entry k in [0,len) set to 1 if
+        * a[ai+k] &lt;= b[bi+k], else 0.
+        */
+       public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] <= b[bi + k]) ? 1 : 0;
+               return out;
+       }
+
+       /**
+        * Vector API variant of the element-wise less-or-equal indicator: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1.0 if
+        * a[ai+k] &lt;= b[bi+k], else 0.0.
+        */
+       public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, lhs.compare(VectorOperators.LE, rhs)).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] <= b[bi + k]) ? 1.0 : 0.0;
+               return out;
+       }
+       /**
+        * Scalar loop variant of the greater-than indicator accumulation: for every k in [0,len),
+        * c[ci+k] is incremented by 1 if a[ai+k] &gt; bval and by 0 otherwise.
+        */
+       public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final int end = ai + len;
+               for( int src = ai, dst = ci; src < end; src++, dst++ )
+                       c[dst] += (a[src] > bval) ? 1 : 0;
+       }
+
+       /**
+        * Vector API variant of the greater-than indicator accumulation: for every k in [0,len),
+        * c[ci+k] += (a[ai+k] &gt; bval) ? 1.0 : 0.0.
+        */
+       public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector rhs = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + k).compare(VectorOperators.GT, rhs);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       DoubleVector delta = zero.blend(one, hit);
+                       DoubleVector.fromArray(SPECIES, c, ci + k).add(delta).intoArray(c, ci + k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       c[ci + k] += (a[ai + k] > bval) ? 1.0 : 0.0;
+       }
+       /**
+        * Scalar loop variant of the greater-than indicator against a scalar: returns the array
+        * obtained from allocVector(len, false) with entry k in [0,len) set to 1 if
+        * a[ai+k] &gt; bval, else 0.
+        */
+       public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] > bval) ? 1 : 0;
+               return out;
+       }
+       /**
+        * Vector API variant of the greater-than indicator against a scalar: returns the array
+        * obtained from allocVector(len, false) with entry k in [0,len) set to 1.0 if
+        * a[ai+k] &gt; bval, else 0.0.
+        */
+       public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final DoubleVector rhs = DoubleVector.broadcast(SPECIES, bval);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + k).compare(VectorOperators.GT, rhs);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, hit).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] > bval) ? 1.0 : 0.0;
+               return out;
+       }
+       /**
+        * Scalar loop variant of the times-two accumulation: for every k in [0,len),
+        * c[ci+k] is incremented by a[ai+k] + a[ai+k].
+        */
+       public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
+               final int end = ai + len;
+               for( int src = ai, dst = ci; src < end; src++, dst++ )
+                       c[dst] += a[src] + a[src];
+       }
+
+       /**
+        * Times-two accumulation, delegated to LibMatrixMult.vectMultiplyAdd with a constant
+        * multiplier of 2.0 and the given arrays, offsets and length.
+        */
+       public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
+               final double factor = 2.0;
+               LibMatrixMult.vectMultiplyAdd(factor, a, c, ai, ci, len);
+       }
+
+       /**
+        * Scalar loop variant of the element-wise greater-than indicator: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1 if a[ai+k] &gt; b[bi+k], else 0.
+        */
+       public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               for( int k = 0; k < len; k++ )
+                       out[k] = (a[ai + k] > b[bi + k]) ? 1 : 0;
+               return out;
+       }
+
+       /**
+        * Vector API variant of the element-wise greater-than indicator: returns the array obtained
+        * from allocVector(len, false) with entry k in [0,len) set to 1.0 if a[ai+k] &gt; b[bi+k], else 0.0.
+        * NOTE(review): this method has been commented as "not in use: vector api implementation
+        * slower than scalar loop version", yet it carries the plain vectGreaterWrite name rather
+        * than a _vector_api suffix -- confirm whether it is actually unused.
+        */
+       public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+               final double[] out = allocVector(len, false);
+               final DoubleVector zero = DoubleVector.zero(SPECIES);
+               final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+               final int vecEnd = SPECIES.loopBound(len);
+               int k = 0;
+
+               //vectorized main loop, advancing by vLen elements per iteration
+               while( k < vecEnd ) {
+                       DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+                       DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+                       //blend yields 1.0 in lanes where the mask is set and 0.0 elsewhere
+                       zero.blend(one, lhs.compare(VectorOperators.GT, rhs)).intoArray(out, k);
+                       k += vLen;
+               }
+
+               //scalar tail for the elements at and beyond SPECIES.loopBound(len)
+               for( ; k < len; k++ )
+                       out[k] = (a[ai + k] > b[bi + k]) ? 1.0 : 0.0;
+               return out;
+       }
+}

(systemds) branch main updated: [SYSTEMDS-3920] Vector API in more codegen primitives

Reply via email to