This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 422fba7050 [SYSTEMDS-3920] Vector API in more codegen primitives
422fba7050 is described below
commit 422fba7050e2326fc3aac2fb9dd42ca8e3ac21b5
Author: JulianJuelg <[email protected]>
AuthorDate: Sat Mar 28 15:57:07 2026 +0100
[SYSTEMDS-3920] Vector API in more codegen primitives
Closes #2428.
---
.../sysds/runtime/codegen/LibSpoofPrimitives.java | 733 ++++++++++++++++--
.../sysds/runtime/matrix/data/LibMatrixMult.java | 39 +
.../primitives_vector_api/BenchCase.java | 345 +++++++++
.../primitives_vector_api/BenchUtil.java | 84 ++
.../performance/primitives_vector_api/Ctx.java | 78 ++
.../primitives_vector_api/PrimitivePerfSuite.java | 63 ++
.../backup_primitives_for_benchmark.java | 862 +++++++++++++++++++++
7 files changed, 2125 insertions(+), 79 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
index ebb42676f0..86f2284c50 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
@@ -28,10 +28,16 @@ import org.apache.sysds.runtime.functionobjects.BitwAnd;
import org.apache.sysds.runtime.functionobjects.IntegerDivide;
import org.apache.sysds.runtime.functionobjects.Modulus;
import org.apache.sysds.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col;
import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling;
import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
-import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorMask;
/**
* This library contains all vector primitives that are used in
@@ -45,6 +51,13 @@ public class LibSpoofPrimitives
private static IntegerDivide intDiv = IntegerDivide.getFnObject();
private static Modulus mod = Modulus.getFnObject();
private static BitwAnd bwAnd = BitwAnd.getBitwAndFnObject();
+
+ // Vector API initializations
+ private static final VectorSpecies<Double> SPECIES =
DoubleVector.SPECIES_PREFERRED;
+ @SuppressWarnings("unused")
+ private static final VectorSpecies<Float> FSPECIES =
FloatVector.SPECIES_PREFERRED;
+ private static final int vLen = SPECIES.length();
+
//global pool of reusable vectors, individual operations set up their
own thread-local
//ring buffers of reusable vectors with specific number of vectors and
vector sizes
@@ -57,13 +70,32 @@ public class LibSpoofPrimitives
};
public static double rowMaxsVectMult(double[] a, double[] b, int ai,
int bi, int len) {
- double val = Double.NEGATIVE_INFINITY;
- int j=0;
- for( int i = ai; i < ai+len; i++ )
- val = Math.max(a[i]*b[j++], val);
- return val;
+ double maxVal = Double.NEGATIVE_INFINITY;
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ DoubleVector vmax = DoubleVector.broadcast(SPECIES,
Double.NEGATIVE_INFINITY);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi
+ i);
+ DoubleVector prod = va.mul(vb);
+ vmax = vmax.max(prod);
+ }
+
+ maxVal = vmax.reduceLanes(VectorOperators.MAX);
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
+ }
+
+ return maxVal;
}
+ // note: parameter bi unused
public static double rowMaxsVectMult(double[] a, double[] b, int[] aix,
int ai, int bi, int len) {
double val = Double.NEGATIVE_INFINITY;
for( int i = ai; i < ai+len; i++ )
@@ -71,6 +103,32 @@ public class LibSpoofPrimitives
return val;
}
+ /**
+  * Vector API variant of the indexed rowMaxsVectMult: returns the maximum of
+  * a[ai+i] * b[aix[ai+i]] over i in [0,len), or negative infinity if no
+  * element is processed.
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  *
+  * @param a   first input array, read from position ai
+  * @param b   second input array, read through the indexes in aix
+  * @param aix index array, read from position ai
+  * @param ai  start position in a and aix
+  * @param bi  unused
+  * @param len number of processed elements
+  * @return maximum product (NaN if any product is NaN)
+  */
+ public static double rowMaxsVectMult_vector_api(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+     final int upperBound = SPECIES.loopBound(len);
+     DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+     int i = 0;
+
+     //vectorized main loop over full vLen-blocks
+     for (; i < upperBound; i += vLen) {
+         DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+         //gather: lane k reads b[aix[ai + i + k]]
+         DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + i);
+         vmax = vmax.max(va.mul(vb));
+     }
+     double maxVal = vmax.reduceLanes(VectorOperators.MAX);
+
+     //rest, not aligned to vLen-blocks; Math.max (instead of a '>' test)
+     //propagates NaN products exactly like the vector lanes above
+     for (; i < len; i++)
+         maxVal = Math.max(maxVal, a[ai + i] * b[aix[ai + i]]);
+     return maxVal;
+ }
+
// forwarded calls to LibMatrixMult
public static double dotProduct(double[] a, double[] b, int ai, int bi,
int len) {
if( a == null || b == null ) return 0;
@@ -295,6 +353,7 @@ public class LibSpoofPrimitives
* @param len number of processed elements
* @return sum value
*/
+
public static double vectSum(double[] a, int ai, int len) {
double val = 0;
final int bn = len%8;
@@ -313,6 +372,27 @@ public class LibSpoofPrimitives
//scalar result
return val;
}
+ /**
+  * Vector API variant of vectSum: adds up a[ai], ..., a[ai+len-1].
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  *
+  * @param a   input array
+  * @param ai  start position in a
+  * @param len number of processed elements
+  * @return sum value
+  */
+ public static double vectSum_vector_api(double[] a, int ai, int len) {
+     final int blockEnd = SPECIES.loopBound(len);
+     DoubleVector partial = DoubleVector.zero(SPECIES);
+     int pos = 0;
+
+     //full blocks: lane-wise accumulation
+     while (pos < blockEnd) {
+         partial = partial.add(DoubleVector.fromArray(SPECIES, a, ai + pos));
+         pos += SPECIES.length();
+     }
+
+     //collapse the lanes into one scalar
+     double total = 0d;
+     total += partial.reduceLanes(VectorOperators.ADD);
+
+     //remaining elements that do not fill a full block
+     while (pos < len) {
+         total += a[ai + pos];
+         pos++;
+     }
+     return total;
+ }
public static double vectSum(double[] avals, int[] aix, int ai, int
alen, int len) {
//forward to dense as column indexes not required here
@@ -327,36 +407,82 @@ public class LibSpoofPrimitives
return LibMatrixMult.dotProduct(avals, avals, ai, ai, alen);
}
- public static double vectMin(double[] a, int ai, int len) {
+ public static double scalarvectMin(double[] a, int ai, int len) {
double val = Double.POSITIVE_INFINITY;
for( int i = ai; i < ai+len; i++ )
val = Math.min(a[i], val);
return val;
}
+
+ /**
+  * Computes the minimum of a[ai], ..., a[ai+len-1] with the Vector API;
+  * yields positive infinity if no element is processed.
+  *
+  * @param a   input array
+  * @param ai  start position in a
+  * @param len number of processed elements
+  * @return minimum value
+  */
+ public static double vectMin(double[] a, int ai, int len) {
+     final int blockEnd = SPECIES.loopBound(len);
+     DoubleVector laneMins = DoubleVector.broadcast(SPECIES, Double.POSITIVE_INFINITY);
+     int pos = 0;
+
+     //full blocks: lane-wise running minimum
+     while (pos < blockEnd) {
+         laneMins = laneMins.min(DoubleVector.fromArray(SPECIES, a, ai + pos));
+         pos += vLen;
+     }
+     double result = laneMins.reduceLanes(VectorOperators.MIN);
+
+     //remaining elements that do not fill a full block
+     while (pos < len) {
+         result = Math.min(result, a[ai + pos]);
+         pos++;
+     }
+     return result;
+ }
public static double vectMin(double[] avals, int[] aix, int ai, int
alen, int len) {
double val = vectMin(avals, ai, alen);
return (alen<len) ? Math.min(val, 0) : val;
}
- public static double vectMax(double[] a, int ai, int len) {
- double val = Double.NEGATIVE_INFINITY;
- for( int i = ai; i < ai+len; i++ )
- val = Math.max(a[i], val);
- return val;
- }
+
+ /**
+  * Computes the maximum of a[ai], ..., a[ai+len-1] with the Vector API;
+  * yields negative infinity if no element is processed.
+  *
+  * @param a   input array
+  * @param ai  start position in a
+  * @param len number of processed elements
+  * @return maximum value
+  */
+ public static double vectMax(double[] a, int ai, int len) {
+     final int blockEnd = SPECIES.loopBound(len);
+     DoubleVector laneMaxs = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+     int pos = 0;
+
+     //full blocks: lane-wise running maximum
+     while (pos < blockEnd) {
+         laneMaxs = laneMaxs.max(DoubleVector.fromArray(SPECIES, a, ai + pos));
+         pos += vLen;
+     }
+     double result = laneMaxs.reduceLanes(VectorOperators.MAX);
+
+     //remaining elements that do not fill a full block
+     while (pos < len) {
+         result = Math.max(a[ai + pos], result);
+         pos++;
+     }
+     return result;
+ }
public static double vectMax(double[] avals, int[] aix, int ai, int
alen, int len) {
double val = vectMax(avals, ai, alen);
return (alen<len) ? Math.max(val, 0) : val;
}
- public static double vectCountnnz(double[] a, int ai, int len) {
+
+ /**
+  * Counts the non-zero entries among a[ai], ..., a[ai+len-1].
+  *
+  * @param a   input array
+  * @param ai  start position in a
+  * @param len number of processed elements
+  * @return number of non-zero values
+  */
+ public static double vectCountnnz(double[] a, int ai, int len) {
int count = 0;
- for( int i = ai; i < ai+len; i++ )
+ int i = 0;
+ int upperBound = SPECIES.loopBound(len);
+ DoubleVector vzero = DoubleVector.zero(SPECIES);
+
+ //vectorized main loop over full vLen-blocks
+ for (; i < upperBound; i += vLen) {
+ DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+ VectorMask<Double> nz = v.compare(VectorOperators.NE, vzero);
+ count += nz.trueCount();
+ }
+
+ //rest, not aligned to vLen-blocks; i is relative to ai here, so the
+ //offset must be added just like in the vectorized loop above
+ for(;i<len;i++){
+ count += (a[ai + i] != 0) ? 1 : 0;
+ }
return count;
- }
+ }
public static double vectCountnnz(double[] avals, int[] aix, int ai,
int alen, int len) {
//pure meta data operation
@@ -372,26 +498,106 @@ public class LibSpoofPrimitives
}
//custom vector div
-
- public static void vectDivAdd(double[] a, double bval, double[] c, int
ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += a[j] / bval;
+
+ /**
+  * Computes c[ci+i] += a[ai+i] / bval for i in [0,len).
+  * <p>
+  * A true division is used (not a multiplication with 1/bval): the
+  * reciprocal form rounds twice and overflows to infinity for subnormal
+  * bval, which would make the results differ from the index-based
+  * vectDivAdd and from vectDivWrite, both of which divide.
+  *
+  * @param a    dividend array
+  * @param bval scalar divisor
+  * @param c    output array, updated in place
+  * @param ai   start position in a
+  * @param ci   start position in c
+  * @param len  number of processed elements
+  */
+ public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+     final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+     final int upperBound = SPECIES.loopBound(len);
+     int i = 0;
+
+     //vectorized main loop over full vLen-blocks
+     for (; i < upperBound; i += vLen) {
+         DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+         DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i);
+         vc.add(va.div(vb)).intoArray(c, ci + i);
+     }
+
+     //rest, not aligned to vLen-blocks
+     for (; i < len; i++)
+         c[ci + i] += a[ai + i] / bval;
+ }
+
public static void vectDivAdd(double bval, double[] a, double[] c, int
ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += bval / a[j];
+ int i = 0;
+ int upperBound = SPECIES.loopBound(len);
+ DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upperBound; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci
+ i);
+ vc = vc.add(vb.div(va));
+ vc.intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (;i<len;i++){
+ c[ci+i] += bval/a[ai+i];
+ }
}
+
public static void vectDivAdd(double[] a, double bval, double[] c,
int[] aix, int ai, int ci, int alen, int len) {
for( int j = ai; j < ai+alen; j++ )
c[ci + aix[j]] += a[j] / bval;
}
+
+ /**
+  * Vector API variant of the index-based vectDivAdd: computes
+  * c[ci+aix[ai+i]] += a[ai+i] / bval for i in [0,alen).
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  * A true division is used so that results match the scalar version, which
+  * divides rather than multiplying with a precomputed reciprocal.
+  *
+  * @param a    dividend array
+  * @param bval scalar divisor
+  * @param c    output array, updated in place
+  * @param aix  index array providing the output positions
+  * @param ai   start position in a and aix
+  * @param ci   start position in c
+  * @param alen number of processed elements
+  * @param len  unused
+  */
+ public static void vectDivAdd_vector_api(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+     final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+     final int upperBound = SPECIES.loopBound(alen);
+     int i = 0;
+
+     //vectorized division over full vLen-blocks
+     for (; i < upperBound; i += vLen) {
+         DoubleVector vcontrib = DoubleVector.fromArray(SPECIES, a, ai + i).div(vb);
+
+         // scatter-add lane-by-lane
+         for (int lane = 0; lane < vLen; lane++)
+             c[ci + aix[ai + i + lane]] += vcontrib.lane(lane);
+     }
+
+     //rest, not aligned to vLen-blocks
+     for (; i < alen; i++)
+         c[ci + aix[ai + i]] += a[ai + i] / bval;
+ }
+
public static void vectDivAdd(double bval, double[] a, double[] c,
int[] aix, int ai, int ci, int alen, int len) {
for( int j = ai; j < ai+alen; j++ )
c[ci + aix[j]] += bval / a[j];
}
+
+ /**
+  * Vector API variant of the index-based vectDivAdd with scalar dividend:
+  * computes c[ci+aix[ai+i]] += bval / a[ai+i] for i in [0,alen).
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  *
+  * @param bval scalar dividend
+  * @param a    divisor array
+  * @param c    output array, updated in place
+  * @param aix  index array providing the output positions
+  * @param ai   start position in a and aix
+  * @param ci   start position in c
+  * @param alen number of processed elements
+  * @param len  unused
+  */
+ public static void vectDivAdd_vector_api(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+     final DoubleVector dividend = DoubleVector.broadcast(SPECIES, bval);
+     final int blockEnd = SPECIES.loopBound(alen);
+     int pos = 0;
+
+     //full blocks: vectorized division, then per-lane scatter-add
+     while (pos < blockEnd) {
+         DoubleVector quotients = dividend.div(DoubleVector.fromArray(SPECIES, a, ai + pos));
+         for (int k = 0; k < vLen; k++) {
+             int target = ci + aix[ai + pos + k];
+             c[target] += quotients.lane(k);
+         }
+         pos += vLen;
+     }
+
+     //remaining elements that do not fill a full block
+     while (pos < alen) {
+         c[ci + aix[ai + pos]] += bval / a[ai + pos];
+         pos++;
+     }
+ }
+
public static double[] vectDivWrite(double[] a, double bval, int ai,
int len) {
double[] c = allocVector(len, false);
@@ -399,6 +605,28 @@ public class LibSpoofPrimitives
c[j] = a[ai+j] / bval;
return c;
}
+
+ /**
+  * Vector API variant of vectDivWrite: writes a[ai+i] / bval for i in
+  * [0,len) into a vector obtained via allocVector(len, false).
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  * A true division is used so that results match the scalar vectDivWrite,
+  * which divides rather than multiplying with a precomputed reciprocal.
+  *
+  * @param a    dividend array
+  * @param bval scalar divisor
+  * @param ai   start position in a
+  * @param len  number of processed elements
+  * @return output vector with the len quotients at positions [0,len)
+  */
+ public static double[] vectDivWrite_vector_api(double[] a, double bval, int ai, int len) {
+     double[] c = allocVector(len, false);
+     final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+     final int upper = SPECIES.loopBound(len);
+     int i = 0;
+
+     //vectorized main loop over full vLen-blocks
+     for (; i < upper; i += vLen)
+         DoubleVector.fromArray(SPECIES, a, ai + i).div(vb).intoArray(c, i);
+
+     //rest, not aligned to vLen-blocks
+     for (; i < len; i++)
+         c[i] = a[ai + i] / bval;
+     return c;
+ }
+
public static double[] vectDivWrite(double bval, double[] a, int ai,
int len) {
double[] c = allocVector(len, false);
@@ -406,6 +634,26 @@ public class LibSpoofPrimitives
c[j] = bval / a[ai + j];
return c;
}
+
+ /**
+  * Vector API variant of vectDivWrite with scalar dividend: writes
+  * bval / a[ai+i] for i in [0,len) into a vector obtained via
+  * allocVector(len, false).
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  *
+  * @param bval scalar dividend
+  * @param a    divisor array
+  * @param ai   start position in a
+  * @param len  number of processed elements
+  * @return output vector with the len quotients at positions [0,len)
+  */
+ public static double[] vectDivWrite_vector_api(double bval, double[] a, int ai, int len) {
+     final double[] out = allocVector(len, false);
+     final DoubleVector dividend = DoubleVector.broadcast(SPECIES, bval);
+     final int blockEnd = SPECIES.loopBound(len);
+     int pos = 0;
+
+     //full blocks: vectorized division
+     while (pos < blockEnd) {
+         DoubleVector divisors = DoubleVector.fromArray(SPECIES, a, ai + pos);
+         DoubleVector quotients = dividend.div(divisors);
+         quotients.intoArray(out, pos);
+         pos += vLen;
+     }
+
+     //remaining elements that do not fill a full block
+     while (pos < len) {
+         out[pos] = bval / a[ai + pos];
+         pos++;
+     }
+     return out;
+ }
public static double[] vectDivWrite(double[] a, double[] b, int ai, int
bi, int len) {
double[] c = allocVector(len, false);
@@ -414,6 +662,26 @@ public class LibSpoofPrimitives
return c;
}
+ /**
+  * Vector API variant of the element-wise vectDivWrite: writes
+  * a[ai+i] / b[bi+i] for i in [0,len) into a vector obtained via
+  * allocVector(len, false).
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  *
+  * @param a   dividend array
+  * @param b   divisor array
+  * @param ai  start position in a
+  * @param bi  start position in b
+  * @param len number of processed elements
+  * @return output vector with the len quotients at positions [0,len)
+  */
+ public static double[] vectDivWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
+     final double[] out = allocVector(len, false);
+     final int blockEnd = SPECIES.loopBound(len);
+     int pos = 0;
+
+     //full blocks: vectorized division
+     while (pos < blockEnd) {
+         DoubleVector dividends = DoubleVector.fromArray(SPECIES, a, ai + pos);
+         DoubleVector divisors = DoubleVector.fromArray(SPECIES, b, bi + pos);
+         DoubleVector quotients = dividends.div(divisors);
+         quotients.intoArray(out, pos);
+         pos += vLen;
+     }
+
+     //remaining elements that do not fill a full block
+     while (pos < len) {
+         out[pos] = a[ai + pos] / b[bi + pos];
+         pos++;
+     }
+     return out;
+ }
+
public static double[] vectDivWrite(double[] a, double bval, int[] aix,
int ai, int alen, int len) {
double init = (bval != 0) ? 0 : Double.NaN;
double[] c = allocVector(len, true, init);
@@ -1480,10 +1748,9 @@ public class LibSpoofPrimitives
}
//custom mult2
-
+
public static void vectMult2Add(double[] a, double[] c, int ai, int ci,
int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += a[j] + a[j];
+ LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len);
}
public static void vectMult2Add(double[] a, double[] c, int[] aix, int
ai, int ci, int alen, int len) {
@@ -1493,10 +1760,10 @@ public class LibSpoofPrimitives
public static double[] vectMult2Write(double[] a, int ai, int len) {
double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++)
- c[j] = a[ai] + a[ai];
+ LibMatrixMult.vectMultiplyWrite(2.0,a,c,ai,0,len);
return c;
}
+
public static double[] vectMult2Write(double[] a, int[] aix, int ai,
int alen, int len) {
double[] c = allocVector(len, true);
@@ -1586,9 +1853,30 @@ public class LibSpoofPrimitives
//custom vector equal
public static void vectEqualAdd(double[] a, double bval, double[] c,
int ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += (a[j] == bval) ? 1 : 0;
- }
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES,
bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES,
1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a,
ai + i);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c,
ci + i);
+
+ VectorMask<Double> eq =
aVec.compare(VectorOperators.EQ, bVec);
+
+ DoubleVector inc = zeros.blend(ones, eq);
+
+ cVec.add(inc).intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[ci + i] += (a[ai + i] == bval) ? 1.0 : 0.0;
+ }
+ }
+
public static void vectEqualAdd(double bval, double[] a, double[] c,
int ai, int ci, int len) {
vectEqualAdd(a, bval, c, ai, ci, len);
@@ -1609,21 +1897,56 @@ public class LibSpoofPrimitives
public static double[] vectEqualWrite(double[] a, double bval, int ai,
int len) {
double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++)
- c[j] = (a[ai] == bval) ? 1 : 0;
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+ DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+ DoubleVector zeros = DoubleVector.zero(SPECIES);
+ DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ var mask = va.compare(VectorOperators.EQ, vb);
+ DoubleVector out = zeros.blend(ones, mask);
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] == bval) ? 1 : 0;
+ }
return c;
}
+
public static double[] vectEqualWrite(double bval, double[] a, int ai,
int len) {
return vectEqualWrite(a, bval, ai, len);
}
+
public static double[] vectEqualWrite(double[] a, double[] b, int ai,
int bi, int len) {
- double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++, bi++)
- c[j] = (a[ai] == b[bi]) ? 1 : 0;
- return c;
- }
+ double[] c = allocVector(len, false);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+ VectorMask<Double> eq = aVec.compare(VectorOperators.EQ, bVec);
+ DoubleVector out = zeros.blend(ones, eq);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] == b[bi + i]) ? 1.0 : 0.0;
+ }
+ return c;
+ }
public static double[] vectEqualWrite(double[] a, double bval, int[]
aix, int ai, int alen, int len) {
double init = (bval == 0) ? 1 : 0;
@@ -1655,8 +1978,27 @@ public class LibSpoofPrimitives
//custom vector not equal
public static void vectNotequalAdd(double[] a, double bval, double[] c,
int ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += (a[j] != bval) ? 1 : 0;
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES,
bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i);
+
+ VectorMask<Double> ne = aVec.compare(VectorOperators.NE, bVec);
+ DoubleVector inc = zeros.blend(ones, ne);
+
+ cVec.add(inc).intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[ci + i] += (a[ai + i] != bval) ? 1.0 : 0.0;
+ }
}
public static void vectNotequalAdd(double bval, double[] a, double[] c,
int ai, int ci, int len) {
@@ -1675,13 +2017,31 @@ public class LibSpoofPrimitives
public static void vectNotequalAdd(double bval, double[] a, double[] c,
int[] aix, int ai, int ci, int alen, int len) {
vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len);
}
-
+
public static double[] vectNotequalWrite(double[] a, double bval, int
ai, int len) {
- double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++)
- c[j] = (a[ai] != bval) ? 1 : 0;
- return c;
- }
+ double[] c = allocVector(len, false);
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+ VectorMask<Double> ne = aVec.compare(VectorOperators.NE, bVec);
+ DoubleVector out = zeros.blend(ones, ne);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] != bval) ? 1.0 : 0.0;
+ }
+ return c;
+ }
public static double[] vectNotequalWrite(double bval, double[] a, int
ai, int len) {
return vectNotequalWrite(a, bval, ai, len);
@@ -1694,6 +2054,32 @@ public class LibSpoofPrimitives
return c;
}
+ /**
+  * Vector API variant of the element-wise vectNotequalWrite: writes 1.0
+  * where a[ai+i] != b[bi+i] and 0.0 otherwise, for i in [0,len), into a
+  * vector obtained via allocVector(len, false).
+  * <p>
+  * Not in use: vector api implementation slower than scalar loop version.
+  *
+  * @param a   left-hand input array
+  * @param b   right-hand input array
+  * @param ai  start position in a
+  * @param bi  start position in b
+  * @param len number of processed elements
+  * @return output vector with the len indicator values at positions [0,len)
+  */
+ public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
+     final double[] out = allocVector(len, false);
+     final DoubleVector allZeros = DoubleVector.zero(SPECIES);
+     final DoubleVector allOnes = DoubleVector.broadcast(SPECIES, 1.0);
+     final int blockEnd = SPECIES.loopBound(len);
+     int pos = 0;
+
+     //full blocks: compare lane-wise, then select 1.0 for differing lanes
+     while (pos < blockEnd) {
+         DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+         DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+         VectorMask<Double> differs = lhs.compare(VectorOperators.NE, rhs);
+         allZeros.blend(allOnes, differs).intoArray(out, pos);
+         pos += vLen;
+     }
+
+     //remaining elements that do not fill a full block
+     while (pos < len) {
+         if (a[ai + pos] != b[bi + pos])
+             out[pos] = 1.0;
+         else
+             out[pos] = 0.0;
+         pos++;
+     }
+     return out;
+ }
+
public static double[] vectNotequalWrite(double[] a, double bval, int[]
aix, int ai, int alen, int len) {
double init = (bval != 0) ? 1 : 0;
double[] c = allocVector(len, true, init);
@@ -1723,9 +2109,29 @@ public class LibSpoofPrimitives
//custom vector less
public static void vectLessAdd(double[] a, double bval, double[] c, int
ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += (a[j] < bval) ? 1 : 0;
- }
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES,
bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a,
ai + i);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c,
ci + i);
+
+ VectorMask<Double> lt =
aVec.compare(VectorOperators.LT, bVec);
+ DoubleVector inc = zeros.blend(ones, lt);
+
+ cVec.add(inc).intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[ci + i] += (a[ai + i] < bval) ? 1.0 : 0.0;
+ }
+ }
public static void vectLessAdd(double bval, double[] a, double[] c, int
ai, int ci, int len) {
vectGreaterequalAdd(a, bval, c, ai, ci, len);
@@ -1743,24 +2149,66 @@ public class LibSpoofPrimitives
public static void vectLessAdd(double bval, double[] a, double[] c,
int[] aix, int ai, int ci, int alen, int len) {
vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len);
}
-
+
public static double[] vectLessWrite(double[] a, double bval, int ai,
int len) {
- double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++)
- c[j] = (a[ai] < bval) ? 1 : 0;
- return c;
- }
+ double[] c = allocVector(len, false);
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+
+ VectorMask<Double> lt = aVec.compare(VectorOperators.LT, bVec);
+ DoubleVector out = zeros.blend(ones, lt);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] < bval) ? 1.0 : 0.0;
+ }
+
+ return c;
+ }
+
public static double[] vectLessWrite(double bval, double[] a, int ai,
int len) {
return vectGreaterequalWrite(a, bval, ai, len);
}
-
+
public static double[] vectLessWrite(double[] a, double[] b, int ai,
int bi, int len) {
double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++, bi++)
- c[j] = (a[ai] < b[bi]) ? 1 : 0;
+
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a,
ai + i);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b,
bi + i);
+
+ VectorMask<Double> lt =
aVec.compare(VectorOperators.LT, bVec);
+ DoubleVector out = zeros.blend(ones, lt);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0;
+ }
+
return c;
- }
+ }
public static double[] vectLessWrite(double[] a, double bval, int[]
aix, int ai, int alen, int len) {
double init = (bval > 0) ? 1 : 0;
@@ -1789,11 +2237,31 @@ public class LibSpoofPrimitives
}
//custom vector less equal
-
+
public static void vectLessequalAdd(double[] a, double bval, double[]
c, int ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += (a[j] <= bval) ? 1 : 0;
- }
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES,
bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a,
ai + i);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c,
ci + i);
+
+ VectorMask<Double> le =
aVec.compare(VectorOperators.LE, bVec);
+ DoubleVector inc = zeros.blend(ones, le);
+
+ cVec.add(inc).intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0;
+ }
+ }
public static void vectLessequalAdd(double bval, double[] a, double[]
c, int ai, int ci, int len) {
vectGreaterAdd(a, bval, c, ai, ci, len);
@@ -1813,22 +2281,63 @@ public class LibSpoofPrimitives
}
public static double[] vectLessequalWrite(double[] a, double bval, int
ai, int len) {
- double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++)
- c[j] = (a[ai] <= bval) ? 1 : 0;
- return c;
- }
+ double[] c = allocVector(len, false);
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+
+ VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+ DoubleVector out = zeros.blend(ones, le);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] <= bval) ? 1.0 : 0.0;
+ }
+
+ return c;
+ }
public static double[] vectLessequalWrite(double bval, double[] a, int
ai, int len) {
return vectGreaterWrite(a, bval, ai, len);
}
-
+
public static double[] vectLessequalWrite(double[] a, double[] b, int
ai, int bi, int len) {
double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++, bi++)
- c[j] = (a[ai] <= b[bi]) ? 1 : 0;
+
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+
+ VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+ DoubleVector out = zeros.blend(ones, le);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0;
+ }
+
return c;
- }
+ }
public static double[] vectLessequalWrite(double[] a, double bval,
int[] aix, int ai, int alen, int len) {
double init = (bval >= 0) ? 1 : 0;
@@ -1859,9 +2368,29 @@ public class LibSpoofPrimitives
//custom vector greater
public static void vectGreaterAdd(double[] a, double bval, double[] c,
int ai, int ci, int len) {
- for( int j = ai; j < ai+len; j++, ci++)
- c[ci] += (a[j] > bval) ? 1 : 0;
- }
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES,
bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a,
ai + i);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c,
ci + i);
+
+ VectorMask<Double> gt =
aVec.compare(VectorOperators.GT, bVec);
+ DoubleVector inc = zeros.blend(ones, gt);
+
+ cVec.add(inc).intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0;
+ }
+ }
public static void vectGreaterAdd(double bval, double[] a, double[] c,
int ai, int ci, int len) {
vectLessequalAdd(a, bval, c, ai, ci, len);
@@ -1881,11 +2410,30 @@ public class LibSpoofPrimitives
}
public static double[] vectGreaterWrite(double[] a, double bval, int
ai, int len) {
- double[] c = allocVector(len, false);
- for( int j = 0; j < len; j++, ai++)
- c[j] = (a[ai] > bval) ? 1 : 0;
- return c;
- }
+ double[] c = allocVector(len, false);
+ final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ //unrolled vLen-block (for better instruction-level
parallelism)
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+
+ VectorMask<Double> gt = aVec.compare(VectorOperators.GT, bVec);
+ DoubleVector out = zeros.blend(ones, gt);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] > bval) ? 1.0 : 0.0;
+ }
+ return c;
+ }
public static double[] vectGreaterWrite(double bval, double[] a, int
ai, int len) {
return vectLessWrite(a, bval, ai, len);
@@ -1898,6 +2446,33 @@ public class LibSpoofPrimitives
return c;
}
+ // not in use: vector api implementation slower than scalar loop version
+ // Dense vector-vector comparison into a fresh output vector:
+ // c[i] = (a[ai+i] > b[bi+i]) ? 1.0 : 0.0 for i in [0, len).
+ public static double[] vectGreaterWrite_vector_api(double[] a, double[]
b, int ai, int bi, int len) {
+ double[] c = allocVector(len, false);
+ final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+ final DoubleVector zeros = DoubleVector.zero(SPECIES);
+
+ int i = 0;
+ // end of the part that can be processed in whole vectors
+ int upper = SPECIES.loopBound(len);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upper; i += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a,
ai + i);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b,
bi + i);
+
+ VectorMask<Double> gt =
aVec.compare(VectorOperators.GT, bVec);
+ // 1.0 in lanes where a > b, 0.0 elsewhere
+ DoubleVector out = zeros.blend(ones, gt);
+
+ out.intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0;
+ }
+ return c;
+ }
+
public static double[] vectGreaterWrite(double[] a, double bval, int[]
aix, int ai, int alen, int len) {
double init = (bval < 0) ? 1 : 0;
double[] c = allocVector(len, true, init);
diff --git
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index cfdf21255e..9417e5134e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -4019,6 +4019,45 @@ public class LibMatrixMult
c[ ci+bix[j+7] ] = a[ ai+bix[j+7] ] * b[ j+7 ];
}
}
+ // test variant (Vector API): writes a[ai+i] + a[ai+i] into c[i] for
+ // i in [0, len) and returns the caller-provided array c.
+ // NOTE(review): marked "test" by the author; confirm whether it is meant
+ // to stay in this class.
+ public static double[] vectMult2Write(double[] a,double[] c, int ai,
int len) {
+
+ int i = 0;
+ // end of the part that can be processed in whole vectors
+ int upper = SPECIES.loopBound(len);
+
+ // vectorized main loop; x + x is used in place of 2 * x
+ for (; i < upper; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ va.add(va).intoArray(c, i);
+ }
+
+ // scalar tail for the remaining len - upper elements
+ for (; i < len; i++) {
+ double x = a[ai + i];
+ c[i] = x + x;
+ }
+
+ return c;
+ }
+ // Same result as vectMult2Write (c[i] = a[ai+i] + a[ai+i]), but the
+ // non-vector remainder is handled first so the vector loop needs no tail.
+ public static double[] vectMult2Write_dedicated_2(double[] a, double[]
c, int ai, int len) {
+
+ // number of leading elements handled by the scalar loop
+ final int bn = len % vLen;
+
+ // scalar prefix so the vector loop is an exact multiple of vLen
+ for (int j = 0; j < bn; j++) {
+ double x = a[ai + j];
+ c[j] = x + x;
+ }
+
+ // vector loop: j starts at bn and advances by vLen; len - bn is a
+ // multiple of vLen, so no tail remains afterwards
+ for (int j = bn; j < len; j += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ j);
+ va.add(va).intoArray(c, j);
+ // or: va.mul(2.0) via broadcast if you prefer
+ }
+
+ return c;
+ }
+
+
public static void vectMultiply(double[] a, double[] c, int ai, int ci,
final int len){
diff --git
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
new file mode 100644
index 0000000000..e8ac3f79f3
--- /dev/null
+++
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+
+// Benchmark cases that pair a scalar reference implementation with a
+// Vector API implementation of the same primitive. Each constant carries a
+// display name, the kind of output it produces, and four Consumer<Ctx>
+// steps: setup (prepare inputs), scalar, vector, and verify (sets ctx.ok).
+public enum BenchCase {
+
+ // Aggregations
+
+ VECT_SUM(
+ "vectSum dense",
+ OutKind.SCALAR_DOUBLE,
+ ctx -> ctx.initDenseA(),
+ ctx -> {ctx.scalarRes =
backup_primitives_for_benchmark.scalarvectSum(ctx.a, 0, ctx.len);
+ BenchUtil.blackhole = ctx.scalarRes;
+ },
+ ctx -> {ctx.vectorRes =
backup_primitives_for_benchmark.vectSum(ctx.a, 0, ctx.len);
+ BenchUtil.blackhole = ctx.vectorRes;},
+ ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+ ),
+
+
+ // NOTE(review): unlike the other scalar-result cases, this one does not
+ // store its result in BenchUtil.blackhole.
+ ROWS_MAXS_VECT_MULT(
+ "rowMaxsVectMult dense",
+ OutKind.SCALAR_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+ ctx -> ctx.scalarRes =
backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0,
ctx.len),
+ ctx -> ctx.vectorRes =
backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len),
+ ctx -> {
+ ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
+ }
+ ),
+
+ ROWS_MAXS_VECT_MULT_AIX(
+ "rowMaxsVectMult_aix dense",
+ OutKind.SCALAR_DOUBLE,
+ ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();},
+ ctx -> {ctx.scalarRes =
backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b,
ctx.a_int,0,0,ctx.len);
+ BenchUtil.blackhole = ctx.scalarRes;
+ },
+ ctx -> {
+ ctx.vectorRes =
backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b,
ctx.a_int,0,0,ctx.len);
+ BenchUtil.blackhole = ctx.vectorRes;
+ },
+ ctx -> {
+ ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
+ }
+ ),
+
+ VECT_MAX(
+ "vectMax dense",
+ OutKind.SCALAR_DOUBLE,
+ ctx -> ctx.initDenseA(),
+ ctx -> {ctx.scalarRes =
backup_primitives_for_benchmark.scalarvectMax(ctx.a, 0, ctx.len);
+ BenchUtil.blackhole = ctx.scalarRes;
+ },
+ ctx -> {ctx.vectorRes =
backup_primitives_for_benchmark.vectMax(ctx.a, 0, ctx.len);
+ BenchUtil.blackhole = ctx.vectorRes;},
+ ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+ ),
+ VECT_COUNTNNZ(
+ "vectCountnnz dense",
+ OutKind.SCALAR_DOUBLE,
+ ctx -> ctx.initDenseA(),
+ ctx -> {ctx.scalarRes =
backup_primitives_for_benchmark.scalarvectCountnnz(ctx.a, 0, ctx.len);
+ BenchUtil.blackhole = ctx.scalarRes;
+ },
+ ctx -> {ctx.vectorRes =
backup_primitives_for_benchmark.vectCountnnz(ctx.a, 0, ctx.len);
+ BenchUtil.blackhole = ctx.vectorRes;},
+ ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+ ),
+
+ // Divisions
+
+ // NOTE(review): initDenseAandC_mutable() already calls initDenseADiv(),
+ // so the trailing initDenseADiv() in setup re-creates the same array.
+ VECT_DIV_ADD(
+ "vectDivAdd dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();
ctx.initDenseADiv();},
+ ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a,
ctx.bval, ctx.cScalar, 0, 0, ctx.len),
+ ctx -> backup_primitives_for_benchmark.vectDivAdd(ctx.a, ctx.bval,
ctx.cVector, 0, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+ VECT_DIV_ADD_2(
+ "vectDivAdd2 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval,
ctx.a, ctx.cScalar, 0, 0, ctx.len),
+ ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0,
0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+ VECT_DIV_ADD_SPARSE(
+ "vectDivAdd sparse",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt();
ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a,
ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+ ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector,
ctx.a_int, 0, 0,ctx.len, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+
+ VECT_DIV_ADD_SPARSE2(
+ "vectDivAdd2 sparse",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt();
ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval,
ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+ ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector,
ctx.a_int, 0, 0,ctx.len, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+ VECT_DIV_WRITE(
+ "vectDivWrite dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval,
0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_DIV_WRITE2(
+ "vectDivWrite2 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a,
0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_DIV_WRITE3(
+ "vectDivWrite3 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();
ctx.initDenseBDiv();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0,
0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+ // Comparisons
+
+ VECT_EQUAL_WRITE(
+ "vectEqualWrite dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval,
0,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a,
ctx.bval, 0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_EQUAL_ADD(
+ "vectEqualAdd dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectEqualAdd(ctx.a,
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+ ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector,
0, 0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ // NOTE(review): calls the same scalar and vector methods with the same
+ // arguments as VECT_EQUAL_WRITE; only the setup differs. Confirm whether
+ // the vector-vector overload was intended here.
+ VECT_EQUAL_WRITE2(
+ "vectEqualWrite2 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval,
0,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a,
ctx.bval, 0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_LESS_ADD(
+ "vectLessAdd dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectLessAdd(ctx.a,
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+ ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0,
0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_LESS_WRITE(
+ "vectLessWrite dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.bval, 0
,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a,
ctx.bval, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_LESS_WRITE2(
+ "vectLessWrite2 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.b, 0, 0
,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b,
0, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_LESSEQUAL_ADD(
+ "vectLessequalAdd dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectLessequalAdd(ctx.a,
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+ ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a,
ctx.bval,ctx.cVector, 0, 0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_LESSEQUAL_WRITE(
+ "vectLessequalWrite dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.bval, 0
,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a,
ctx.bval, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_LESSEQUAL_WRITE2(
+ "vectLessequalWrite2 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0
,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a,
ctx.b, 0, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+ VECT_GREATER_ADD(
+ "vectGreaterAdd dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+ ctx -> backup_primitives_for_benchmark.scalarvectGreaterAdd(ctx.a,
ctx.bval, ctx.cScalar,0, 0,ctx.len),
+ ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector,
0, 0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_GREATER_WRITE(
+ "vectGreaterWrite dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initbval();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.bval, 0
,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a,
ctx.bval, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+ VECT_GREATER_WRITE2(
+ "vectGreaterWrite2 dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+ ctx -> ctx.cScalar =
backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0
,ctx.len),
+ ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a,
ctx.b, 0, 0, ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ ),
+
+ // vectMult2
+
+ VECT_Mult2_ADD(
+ "vectMult2Add dense",
+ OutKind.ARRAY_DOUBLE,
+ ctx -> {ctx.initDenseAandC_mutable(); },
+ ctx -> backup_primitives_for_benchmark.scalarvectMult2Add(ctx.a,
ctx.cScalar,0, 0,ctx.len),
+ ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0,
0,ctx.len),
+ ctx -> {
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+ ctx.ok = ctx.maxDiff <= 1e-9;
+ }
+ );
+
+ // SCALAR_DOUBLE: results are in ctx.scalarRes / ctx.vectorRes;
+ // ARRAY_DOUBLE: results are in ctx.cScalar / ctx.cVector.
+ public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE }
+ // display name, also matched against the --filter argument
+ public final String name;
+ public final java.util.function.Consumer<Ctx> setup;
+ public final java.util.function.Consumer<Ctx> scalar;
+ public final java.util.function.Consumer<Ctx> vector;
+ public final java.util.function.Consumer<Ctx> verify;
+ public final OutKind outKind;
+
+
+ // Stores the display name, output kind and the four benchmark steps.
+ BenchCase(String name,
+ OutKind outKind,
+ java.util.function.Consumer<Ctx> setup,
+ java.util.function.Consumer<Ctx> scalar,
+ java.util.function.Consumer<Ctx> vector,
+ java.util.function.Consumer<Ctx> verify) {
+ this.name = name; this.outKind = outKind; this.setup = setup;
this.scalar = scalar; this.vector = vector; this.verify = verify;
+ }
+}
+
diff --git
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
new file mode 100644
index 0000000000..bbf0f5031f
--- /dev/null
+++
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+
+// Small helpers for the primitive benchmarks: warm-up and timing loops,
+// command-line argument lookup, result comparison, and result printing.
+public class BenchUtil {
+ // sink for benchmark results; written by the benchmark cases
+ public static volatile double blackhole;
+
+ // Runs r iters times without timing it.
+ public static void warmup(Runnable r,int iters ) {
+ for (int i = 0; i < iters; i++) r.run();
+ }
+
+ // Runs r iters times and returns the average wall-clock time per run in
+ // nanoseconds. A System.gc() call is issued before timing starts.
+ public static double measure(Runnable r,int iters) {
+ System.gc();
+ long t0 = System.nanoTime();
+ for (int i = 0; i < iters; i++)
+ r.run();
+ long t1 = System.nanoTime();
+ return (t1 - t0) / (double) iters;
+ }
+
+ // ---- args helpers ----
+ // Returns the int following the first occurrence of key in args, or def
+ // if key is absent or is the last argument.
+ public static int argInt(String[] args, String key, int def) {
+ for (int i = 0; i < args.length - 1; i++)
+ if (args[i].equals(key))
+ return Integer.parseInt(args[i + 1]);
+ return def;
+ }
+
+ // Returns the string following the first occurrence of key in args, or
+ // def if key is absent or is the last argument.
+ public static String argStr(String[] args, String key, String def) {
+ for (int i = 0; i < args.length - 1; i++)
+ if (args[i].equals(key))
+ return args[i + 1];
+ return def;
+ }
+
+ // Returns the largest |a[i] - b[i]| over i in [0, a.length).
+ // Iterates over a.length only, so b must have at least that many entries.
+ public static double maxAbsDiff(double[] a, double[] b) {
+ double m = 0;
+ for (int i = 0; i < a.length; i++)
+ m = Math.max(m, Math.abs(a[i] - b[i]));
+ return m;
+ }
+
+ // Prints one result line for a case with scalar output: timings,
+ // speedup (nsScalar / nsVector), both results, and OK/FAIL.
+ public static void printScalarDouble(String name,
+ double nsScalar, double nsVector,
+ double scalarRes, double vectorRes,
+ boolean ok) {
+
+ double speedup = nsScalar / nsVector;
+ System.out.printf("%s | scalar %.1f ns | vector %.1f ns |
speedup %.3fx | " +
+ "s=%.6g v=%.6g | %s%n",
+ name, nsScalar, nsVector, speedup, scalarRes, vectorRes, ok ?
"OK" : "FAIL");
+ }
+
+ // Prints one result line for a case with array output: timings,
+ // speedup (nsScalar / nsVector), the maximum difference, and OK/FAIL.
+ public static void printArrayDiff(String name,
+ double nsScalar, double nsVector,
+ double maxDiff,
+ boolean ok) {
+
+ double speedup = nsScalar / nsVector;
+ System.out.printf("%s | scalar %.1f ns | vector %.1f ns |
speedup %.3fx | " +
+ "maxDiff=%.6g | %s%n",
+ name, nsScalar, nsVector, speedup, maxDiff, ok ? "OK" : "FAIL");
+ }
+ }
+
\ No newline at end of file
diff --git
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
new file mode 100644
index 0000000000..4fc1a15de0
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+// Mutable state shared by the setup, scalar, vector and verify steps of one
+// benchmark case: input arrays, output arrays, and verification results.
+public class Ctx {
+ // number of elements in every array created by the init methods
+ public int len;
+ // a, b: inputs; cInit: pristine copy of the accumulator;
+ // cScalar / cVector: outputs of the scalar and vector implementations
+ public double[] a, cInit,b,c, cScalar, cVector;
+ public double bval;
+
+ public double scalarRes, vectorRes;
+ public double maxDiff;
+ public boolean ok;
+ // index array, filled with the identity mapping by initDenseAInt()
+ public int[] a_int;
+
+ // Fills a with the repeating pattern -5..4 (contains zeros).
+ void initDenseA() {
+ a = new double[len];
+ for (int i = 0; i < len; i++) a[i] = (i % 10) - 5;
+ }
+ // Fills b with the repeating pattern -5..4 (contains zeros).
+ void initDenseB() {
+ b = new double[len];
+ for (int i = 0; i < len; i++) b[i] = (i % 10) - 5;
+ }
+ // Fills c with the repeating pattern -5..4.
+ void initDenseC() {
+ c = new double[len];
+ for (int i = 0; i < len; i++) c[i] = (i % 10) - 5;
+ }
+ // Fills a_int with 0..len-1.
+ void initDenseAInt() {
+ a_int = new int[len];
+ for (int i = 0; i < len; i++) a_int[i] = i;
+ }
+
+ void initbval(){
+ bval = 1.234567;
+ }
+ // Fills a with the repeating pattern 1..10 so it is safe as a divisor.
+ void initDenseADiv() {
+ a = new double[len];
+ for (int i = 0; i < len; i++) {
+ a[i] = ((i % 10) + 1); // Range: 1 to 10 (no zeros)
+ }
+ }
+ // Fills b with the repeating pattern 1..10 so it is safe as a divisor.
+ void initDenseBDiv() {
+ b = new double[len];
+ for (int i = 0; i < len; i++) b[i] = ((i % 10) + 1);
+ }
+
+ // Prepares a (via initDenseADiv) and an accumulator template cInit, and
+ // gives the scalar and vector runs their own copies of it.
+ void initDenseAandC_mutable() {
+ initDenseADiv();
+ cInit = new double[len];
+ for (int i = 0; i < len; i++) cInit[i] = (i % 10) - 5;
+ cScalar = java.util.Arrays.copyOf(cInit, len);
+ cVector = java.util.Arrays.copyOf(cInit, len);
+ }
+
+ // Restores cScalar and cVector from cInit; no-op when cInit was never set.
+ void resetC() {
+ if (cInit != null) {
+ System.arraycopy(cInit, 0, cScalar, 0, len);
+ System.arraycopy(cInit, 0, cVector, 0, len);
+ }
+ }
+}
diff --git
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
new file mode 100644
index 0000000000..340b50ba8f
--- /dev/null
+++
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+
+// Command-line driver that runs every BenchCase (optionally filtered by
+// name) and prints scalar vs. vector timings plus a verification result.
+// Arguments: --len, --warmup, --iters, --filter.
+public class PrimitivePerfSuite {
+ public static void main(String[] args) {
+ //int len = BenchUtil.argInt(args, "--len", 262_144);
+ int len = BenchUtil.argInt(args, "--len", 1_000_000);
+ int warmup = BenchUtil.argInt(args, "--warmup", 10_000);
+ int iters = BenchUtil.argInt(args, "--iters", 100);
+ String filter = BenchUtil.argStr(args, "--filter", "");
+
+ for (BenchCase bc : BenchCase.values()) {
+ // an empty filter selects every case; otherwise substring match on the display name
+ if (!filter.isEmpty() && !bc.name.contains(filter))
continue;
+
+ Ctx ctx = new Ctx();
+ ctx.len = len;
+ bc.setup.accept(ctx);
+
+ // warm scalar
+ ctx.resetC();
+ BenchUtil.warmup(() -> {bc.scalar.accept(ctx);
},warmup);
+ ctx.resetC();
+ double nsScalar = BenchUtil.measure(() -> {
bc.scalar.accept(ctx); }, iters);
+
+ // warm vector
+ ctx.resetC();
+ BenchUtil.warmup(() -> {bc.vector.accept(ctx); },
warmup);
+ ctx.resetC();
+ double nsVector = BenchUtil.measure(() ->
{bc.vector.accept(ctx); }, iters);
+
+ // verify once, starting from freshly reset accumulators
+ ctx.resetC(); bc.scalar.accept(ctx);
+ bc.vector.accept(ctx);
+ bc.verify.accept(ctx);
+
+ if (bc.outKind == BenchCase.OutKind.SCALAR_DOUBLE) {
+ BenchUtil.printScalarDouble(bc.name, nsScalar,
nsVector, ctx.scalarRes, ctx.vectorRes, ctx.ok);
+ } else {
+ BenchUtil.printArrayDiff(bc.name, nsScalar,
nsVector, ctx.maxDiff, ctx.ok);
+ }
+ }
+ }
+}
diff --git
a/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
new file mode 100644
index 0000000000..17cb093f97
--- /dev/null
+++
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
@@ -0,0 +1,862 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.performance.primitives_vector_api;
+
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+
+
+
+import java.util.Arrays;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorMask;
+
+
+public class backup_primitives_for_benchmark {
+
+ // Vector API initializations
+ private static final VectorSpecies<Double> SPECIES =
DoubleVector.SPECIES_PREFERRED;
+ private static final int vLen = SPECIES.length();
+
+ // Convenience overload: allocates a vector of length len and, if reset
+ // is true, fills it with 0.
+ public static double[] allocVector(int len, boolean reset) {
+ return allocVector(len, reset, 0);
+ }
+
+ // Returns a vector of length len, taken from the calling thread's
+ // VectorBuffer when it holds one of that length and freshly allocated
+ // otherwise. If reset is true the vector is filled with resetVal; if
+ // false, a vector taken from the buffer keeps whatever it held before.
+ protected static double[] allocVector(int len, boolean reset, double
resetVal) {
+ VectorBuffer buff = memPool.get();
+
+ //find next matching vector in ring buffer or
+ //allocate new vector if required
+ double[] vect = buff.next(len);
+ if( vect == null )
+ vect = new double[len];
+
+ //reset vector if required
+ if( reset )
+ Arrays.fill(vect, resetVal);
+ return vect;
+ }
+ // Ring buffer of pre-allocated vectors of up to two distinct lengths
+ // (len1 and, if positive and different, len2), handed out round-robin.
+ private static class VectorBuffer {
+ private static final int MAX_SIZE = 512*1024; //4MB
+ private final double[][] _data;
+ private int _pos;
+ private int _len1;
+ private int _len2;
+
+ public VectorBuffer(int num, int len1, int len2) {
+ //best effort size restriction since large intermediates
+ //not necessarily used (num refers to the total number)
+ len1 = Math.min(len1, MAX_SIZE);
+ len2 = Math.min(len2, MAX_SIZE);
+ //pre-allocate ring buffer
+ int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+ _data = new double[lnum][];
+ for( int i=0; i<num; i++ ) {
+ if( lnum > num ) {
+ _data[2*i] = new double[len1];
+ _data[2*i+1] = new double[len2];
+ }
+ else {
+ _data[i] = new double[len1];
+ }
+ }
+ _pos = -1;
+ _len1 = len1;
+ _len2 = len2;
+ }
+ // Returns the next buffered vector of exactly the given length, or null
+ // if this buffer holds no vector of that length.
+ public double[] next(int len) {
+ if( _len1!=len && _len2!=len )
+ return null;
+ //probe every slot at most once: a length can match _len1/_len2
+ //without any backing array (empty ring, or len==0 when no second
+ //length is configured), which must yield null instead of an
+ //out-of-bounds access or an endless search
+ for( int k=0; k<_data.length; k++ ) {
+ _pos = (_pos+1>=_data.length) ? 0 : _pos+1;
+ if( _data[_pos].length==len )
+ return _data[_pos];
+ }
+ return null;
+ }
+ // Checks whether this buffer was created for the given configuration.
+ // The lengths are compared as passed in, i.e. without the MAX_SIZE cap.
+ @SuppressWarnings("unused")
+ public boolean isReusable(int num, int len1, int len2) {
+ int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+ return (_len1 == len1 && _len2 == len2
+ && _data.length == lnum);
+ }
+ }
+ // Per-thread vector pool used by allocVector; each thread starts with a
+ // VectorBuffer created as (0,0,0), i.e. one without pre-allocated vectors.
+ private static ThreadLocal<VectorBuffer> memPool = new ThreadLocal<>() {
+ @Override protected VectorBuffer initialValue() { return new
VectorBuffer(0,0,0); }
+ };
+
+ // Scalar reference: c[ci+i] += a[ai+i] / bval for i in [0, len).
+ public static void scalarvectDivAdd(double[] a, double bval, double[]
c, int ai, int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += a[j] / bval;
+ }
+
+ // Vector API variant of c[ci+i] += a[ai+i] / bval for i in [0, len).
+ // a is read from offset ai, c is updated in place from offset ci.
+ public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+ // divide by bval instead of multiplying by 1/bval: the reciprocal form
+ // rounds twice and turns into infinity for subnormal bval, so it does
+ // not reproduce the scalar reference a[j] / bval
+ final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+ int i = 0; final int upperBound = SPECIES.loopBound(len);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upperBound; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+ DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i);
+ vc.add(va.div(vb)).intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[ci + i] += a[ai + i] / bval;
+ }
+ }
+
+ // Scalar reference: c[ci+i] += bval / a[ai+i] for i in [0, len).
+ public static void scalarvectDivAdd(double bval, double[] a,
double[] c, int ai, int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += bval / a[j];
+ }
+
+ // Vector API variant of c[ci+i] += bval / a[ai+i] for i in [0, len).
+ public static void vectDivAdd(double bval, double[] a, double[] c, int
ai, int ci, int len) {
+ int i = 0;
+ // end of the part that can be processed in whole vectors
+ int upperBound = SPECIES.loopBound(len);
+ DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upperBound; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci
+ i);
+ vc = vc.add(vb.div(va));
+ vc.intoArray(c, ci + i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (;i<len;i++){
+ c[ci+i] += bval/a[ai+i];
+ }
+ }
+
+ // Scalar reference with index array:
+ // c[ci + aix[j]] += a[j] / bval for j in [ai, ai+alen). len is not used.
+ public static void scalarvectDivAdd(double[] a, double bval, double[]
c, int[] aix, int ai, int ci, int alen, int len) {
+ for( int j = ai; j < ai+alen; j++ )
+ c[ci + aix[j]] += a[j] / bval;
+ }
+
+ // not in use: vector api implementation slower than scalar loop version
+ // Vector API variant of c[ci + aix[j]] += a[j] / bval for j in
+ // [ai, ai+alen). The quotients are computed per vector and then added to
+ // c one lane at a time. len is not used.
+ public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+
+ int i = 0;
+ int upperBound = SPECIES.loopBound(alen);
+ // divide by bval instead of multiplying by 1/bval: the reciprocal form
+ // rounds twice and turns into infinity for subnormal bval, so it does
+ // not reproduce the scalar reference a[j] / bval
+ DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upperBound; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+ DoubleVector vcontrib = va.div(vb);
+
+ // scatter-add lane-by-lane
+ for (int lane = 0; lane < vLen; lane++) {
+ int idx = ci + aix[ai + i + lane];
+ c[idx] += vcontrib.lane(lane);
+ }
+ }
+
+ //rest, not aligned to vLen-blocks
+ for(; i<alen; i++){
+ c[ci + aix[ai + i]] += a[ai + i] / bval;
+ }
+ }
+
+ // Scalar reference with index array:
+ // c[ci + aix[j]] += bval / a[j] for j in [ai, ai+alen). len is not used.
+ public static void scalarvectDivAdd(double bval, double[] a, double[]
c, int[] aix, int ai, int ci, int alen, int len) {
+ for( int j = ai; j < ai+alen; j++ )
+ c[ci + aix[j]] += bval / a[j];
+ }
+
+ // not in use: vector api implementation slower than scalar loop version
+ // Vector API variant of c[ci + aix[j]] += bval / a[j] for j in
+ // [ai, ai+alen). The quotients are computed per vector and then added to
+ // c one lane at a time. len is not used.
+ public static void vectDivAdd(double bval, double[] a, double[] c,
int[] aix, int ai, int ci, int alen, int len) {
+ int i = 0;
+ int upperBound = SPECIES.loopBound(alen);
+ DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upperBound; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ DoubleVector vcontrib = vb.div(va);
+
+ // scatter-add lane-by-lane
+ for (int lane = 0; lane < vLen; lane++) {
+ int idx = ci + aix[ai + i + lane];
+ c[idx] += vcontrib.lane(lane);
+ }
+ }
+ //rest, not aligned to vLen-blocks
+ for (; i<alen; i++){
+ c[ci + aix[ai + i]] += bval / a[ai +i];
+ }
+ }
+ // Scalar reference: returns c with c[j] = a[ai+j] / bval for j in [0, len).
+ public static double[] scalarvectDivWrite(double[] a, double bval, int
ai, int len) {
+ double[] c = allocVector(len, false);
+ for( int j = 0; j < len; j++)
+ c[j] = a[ai+j] / bval;
+ return c;
+ }
+
+ // not in use: vector api implementation slower than scalar loop version
+ // Vector API variant: returns c with c[i] = a[ai+i] / bval for i in [0, len).
+ public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
+ double[] c = allocVector(len, false);
+ // divide by bval instead of multiplying by 1/bval: the reciprocal form
+ // rounds twice and turns into infinity for subnormal bval, so it does
+ // not reproduce the scalar reference a[ai+j] / bval
+ final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upper; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+ va.div(vb).intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i < len; i++) {
+ c[i] = a[ai + i] / bval;
+ }
+ return c;
+ }
+ // Scalar reference: returns c with c[j] = bval / a[ai+j] for j in [0, len).
+ public static double[] scalarvectDivWrite(double bval, double[] a, int
ai, int len) {
+ double[] c = allocVector(len, false);
+ for( int j = 0; j < len; j++)
+ c[j] = bval / a[ai + j];
+ return c;
+ }
+
+ // not in use: vector api implementation slower than scalar loop version
+ // Vector API variant: returns c with c[i] = bval / a[ai+i] for i in [0, len).
+ public static double[] vectDivWrite(double bval, double[] a, int ai,
int len) {
+ double[] c = allocVector(len, false);
+ final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+ int i = 0;
+ int upper = SPECIES.loopBound(len);
+
+ // vectorized main loop: one SPECIES-wide block per iteration, advancing by vLen
+ for (; i < upper; i += vLen) {
+ DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai
+ i);
+ vb.div(va).intoArray(c, i);
+ }
+
+ //rest, not aligned to vLen-blocks
+ for (; i<len; i++){
+ c[i] = bval / a[ai + i];
+ }
+ return c;
+ }
+	/**
+	 * Scalar loop for element-wise vector division: {@code out[k] = a[ai+k] / b[bi+k]}.
+	 *
+	 * @param a dividends
+	 * @param b divisors
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len quotients
+	 */
+	public static double[] scalarvectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		int k = 0;
+		while( k < len ) {
+			out[k] = a[ai + k] / b[bi + k];
+			k++;
+		}
+		return out;
+	}
+
+	/**
+	 * Vector API variant of element-wise vector division: {@code out[k] = a[ai+k] / b[bi+k]}.
+	 * Note: not in use, vector api implementation slower than scalar loop version.
+	 *
+	 * @param a dividends
+	 * @param b divisors
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len quotients
+	 */
+	public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part, one vLen-block per iteration
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vNum = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vDen = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			vNum.div(vDen).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = a[ai + pos] / b[bi + pos];
+		return out;
+	}
+	/**
+	 * Scalar loop for the maximum of element-wise products:
+	 * max over k in [0, len) of {@code a[ai+k] * b[k]}, or negative infinity for len = 0.
+	 *
+	 * NOTE(review): bi is not read, b is always consumed from index 0, whereas the
+	 * vectorized rowMaxsVectMult reads b from bi - confirm which behavior is intended.
+	 *
+	 * @param a first input
+	 * @param b second input, read from index 0
+	 * @param ai start offset into a
+	 * @param bi not read by this method
+	 * @param len number of values to process
+	 * @return maximum product
+	 */
+	public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+		double best = Double.NEGATIVE_INFINITY;
+		final int end = ai + len;
+		for( int ia = ai, ib = 0; ia < end; ia++, ib++ ) {
+			final double prod = a[ia] * b[ib];
+			best = Math.max(prod, best);
+		}
+		return best;
+	}
+
+	/**
+	 * Vector API variant for the maximum of element-wise products:
+	 * max over k in [0, len) of {@code a[ai+k] * b[bi+k]}, or negative infinity for len = 0.
+	 *
+	 * NOTE(review): this variant reads b from bi, whereas scalarrowMaxsVectMult reads b
+	 * from index 0 - the two only agree for bi = 0; confirm which behavior is intended.
+	 *
+	 * @param a first input
+	 * @param b second input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return maximum product
+	 */
+	public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vBest = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+
+		//vectorized part: lane-wise running maximum of the products
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vLhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vRhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			vBest = vBest.max(vLhs.mul(vRhs));
+			pos += vLen;
+		}
+
+		//collapse the lanes into a single value
+		double best = vBest.reduceLanes(VectorOperators.MAX);
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			best = Math.max(best, a[ai + pos] * b[bi + pos]);
+		return best;
+	}
+	/**
+	 * Scalar loop for the maximum of sparse-dense products:
+	 * max over i in [ai, ai+len) of {@code a[i] * b[aix[i]]}, or negative infinity for len = 0.
+	 * Note: parameter bi unused.
+	 *
+	 * @param a non-zero values of the sparse row
+	 * @param b dense input, addressed via the column indexes in aix
+	 * @param aix column indexes belonging to the values in a
+	 * @param ai start offset into a and aix
+	 * @param bi not read by this method
+	 * @param len number of values to process
+	 * @return maximum product
+	 */
+	public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+		double best = Double.NEGATIVE_INFINITY;
+		final int end = ai + len;
+		int pos = ai;
+		while( pos < end ) {
+			final double prod = a[pos] * b[aix[pos]];
+			best = Math.max(prod, best);
+			pos++;
+		}
+		return best;
+	}
+
+	/**
+	 * Vector API variant for the maximum of sparse-dense products:
+	 * max over k in [0, len) of {@code a[ai+k] * b[aix[ai+k]]}, or negative infinity for len = 0.
+	 * Note: not in use, vector api implementation slower than scalar loop version.
+	 *
+	 * The rest loop uses Math.max instead of a '>' comparison so that NaN products
+	 * (and signed zeros) are treated the same way in the rest as in the vectorized
+	 * part and in scalarrowMaxsVectMult; with '>' a NaN product in the rest was dropped.
+	 *
+	 * @param a non-zero values of the sparse row
+	 * @param b dense input, gathered via the column indexes in aix
+	 * @param aix column indexes belonging to the values in a
+	 * @param ai start offset into a and aix
+	 * @param bi not read by this method (as in the scalar variant)
+	 * @param len number of values to process
+	 * @return maximum product
+	 */
+	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+		int i = 0;
+		final int upperBound = SPECIES.loopBound(len);
+		DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+
+		//vectorized part: contiguous load of a, gather of b via aix (base offset 0)
+		for (; i < upperBound; i += SPECIES.length()) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + i);
+			vmax = vmax.max(va.mul(vb));
+		}
+		double scalarMax = vmax.reduceLanes(VectorOperators.MAX);
+
+		//rest, not aligned to vLen-blocks
+		for (; i < len; i++) {
+			scalarMax = Math.max(scalarMax, a[ai + i] * b[aix[ai + i]]);
+		}
+		return scalarMax;
+	}
+
+
+	/**
+	 * Scalar loop summing {@code a[ai .. ai+len)}: first the len%8 leading values
+	 * one by one, then the remainder in blocks of eight.
+	 *
+	 * @param a input values
+	 * @param ai start offset into a
+	 * @param len number of values to sum
+	 * @return sum of the len values (0 for len = 0)
+	 */
+	public static double scalarvectSum(double[] a, int ai, int len) {
+		final int rest = len % 8;
+		final int blockStart = ai + rest;
+		final int end = ai + len;
+		double total = 0;
+
+		//leading rest, so that the remaining length is a multiple of 8
+		for( int pos = ai; pos < blockStart; pos++ )
+			total += a[pos];
+
+		//8-blocks: sum the block first, then add it to the running total
+		for( int pos = blockStart; pos < end; pos += 8 ) {
+			final double block = a[pos] + a[pos + 1] + a[pos + 2] + a[pos + 3]
+				+ a[pos + 4] + a[pos + 5] + a[pos + 6] + a[pos + 7];
+			total += block;
+		}
+
+		return total;
+	}
+
+	/**
+	 * Vector API variant summing {@code a[ai .. ai+len)}: lane-wise accumulation over
+	 * the vectorizable prefix, a lane reduction, and a scalar loop for the rest.
+	 *
+	 * @param a input values
+	 * @param ai start offset into a
+	 * @param len number of values to sum
+	 * @return sum of the len values (0 for len = 0)
+	 */
+	public static double vectSum(double[] a, int ai, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vAcc = DoubleVector.zero(SPECIES);
+
+		//vectorized part, one species-length block per iteration
+		int pos = 0;
+		while( pos < vecEnd ) {
+			vAcc = vAcc.add(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			pos += SPECIES.length();
+		}
+
+		//collapse the lanes into the scalar total
+		double total = 0d;
+		total += vAcc.reduceLanes(VectorOperators.ADD);
+
+		//scalar rest, not aligned to the block size
+		for( ; pos < len; pos++ )
+			total += a[ai + pos];
+		return total;
+	}
+	/**
+	 * Scalar loop for the maximum of {@code a[ai .. ai+len)}.
+	 *
+	 * @param a input values
+	 * @param ai start offset into a
+	 * @param len number of values to inspect
+	 * @return maximum value, or negative infinity for len = 0
+	 */
+	public static double scalarvectMax(double[] a, int ai, int len) {
+		double best = Double.NEGATIVE_INFINITY;
+		final int end = ai + len;
+		int pos = ai;
+		while( pos < end ) {
+			best = Math.max(a[pos], best);
+			pos++;
+		}
+		return best;
+	}
+
+	/**
+	 * Vector API variant for the maximum of {@code a[ai .. ai+len)}: lane-wise running
+	 * maximum over the vectorizable prefix, a lane reduction, and a scalar loop for the rest.
+	 *
+	 * @param a input values
+	 * @param ai start offset into a
+	 * @param len number of values to inspect
+	 * @return maximum value, or negative infinity for len = 0
+	 */
+	public static double vectMax(double[] a, int ai, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vBest = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+
+		//vectorized part, one vLen-block per iteration
+		int pos = 0;
+		while( pos < vecEnd ) {
+			vBest = vBest.max(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			pos += vLen;
+		}
+		double best = vBest.reduceLanes(VectorOperators.MAX);
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			best = Math.max(a[ai + pos], best);
+		return best;
+	}
+	/**
+	 * Scalar loop counting the values in {@code a[ai .. ai+len)} that are {@code != 0}.
+	 *
+	 * @param a input values
+	 * @param ai start offset into a
+	 * @param len number of values to inspect
+	 * @return the count, accumulated as int and returned as double
+	 */
+	public static double scalarvectCountnnz(double[] a, int ai, int len) {
+		int nnz = 0;
+		final int end = ai + len;
+		for( int pos = ai; pos < end; pos++ ) {
+			if( a[pos] != 0 )
+				nnz++;
+		}
+		return nnz;
+	}
+	/**
+	 * Vector API variant counting the values in {@code a[ai .. ai+len)} that are {@code != 0}.
+	 *
+	 * The rest loop addresses a with the offset ai (a[ai + i]); reading a[i] there
+	 * inspected the wrong elements whenever ai != 0 and len was not a multiple of vLen.
+	 *
+	 * @param a input values
+	 * @param ai start offset into a
+	 * @param len number of values to inspect
+	 * @return the count, accumulated as int and returned as double
+	 */
+	public static double vectCountnnz(double[] a, int ai, int len) {
+		int count = 0;
+		int i = 0;
+		final int upperBound = SPECIES.loopBound(len);
+		final DoubleVector vzero = DoubleVector.zero(SPECIES);
+
+		//vectorized part: compare a vLen-block against zero and count the set mask lanes
+		for (; i < upperBound; i += vLen) {
+			DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+			VectorMask<Double> nz = v.compare(VectorOperators.NE, vzero);
+			count += nz.trueCount();
+		}
+
+		//rest, not aligned to vLen-blocks
+		for (; i < len; i++) {
+			count += (a[ai + i] != 0) ? 1 : 0;
+		}
+		return count;
+	}
+ public static void scalarvectEqualAdd(double[] a, double bval,
double[] c, int ai, int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += (a[j] == bval) ? 1 : 0;
+ }
+	/**
+	 * Vector API variant adding an equality indicator to c:
+	 * {@code c[ci+k] += (a[ai+k] == bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param c output array, updated in place
+	 * @param ai start offset into a
+	 * @param ci start offset into c
+	 * @param len number of values to process
+	 */
+	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		//vectorized part: turn the comparison mask into 0/1 and add it to c
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vIn = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vAcc = DoubleVector.fromArray(SPECIES, c, ci + pos);
+			VectorMask<Double> hit = vIn.compare(VectorOperators.EQ, vRhs);
+			vAcc.add(vZero.blend(vOne, hit)).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			c[ci + pos] += (a[ai + pos] == bval) ? 1.0 : 0.0;
+	}
+	/**
+	 * Scalar loop writing an equality indicator:
+	 * {@code out[k] = (a[ai+k] == bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] == bval) ? 1.0 : 0.0;
+		return out;
+	}
+	/**
+	 * Vector API variant writing an equality indicator:
+	 * {@code out[k] = (a[ai+k] == bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final int vecEnd = SPECIES.loopBound(len);
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + pos)
+				.compare(VectorOperators.EQ, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] == bval) ? 1.0 : 0.0;
+		return out;
+	}
+	/**
+	 * Scalar loop writing an element-wise equality indicator:
+	 * {@code out[k] = (a[ai+k] == b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] == b[bi + k]) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Vector API variant writing an element-wise equality indicator:
+	 * {@code out[k] = (a[ai+k] == b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vLhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vRhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> hit = vLhs.compare(VectorOperators.EQ, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] == b[bi + pos]) ? 1.0 : 0.0;
+		return out;
+	}
+	/**
+	 * Scalar loop writing an element-wise inequality indicator:
+	 * {@code out[k] = (a[ai+k] != b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] != b[bi + k]) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Vector API variant writing an element-wise inequality indicator:
+	 * {@code out[k] = (a[ai+k] != b[bi+k]) ? 1 : 0} for k in [0, len).
+	 * Note: not in use, vector api implementation slower than scalar loop version.
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vLhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vRhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> hit = vLhs.compare(VectorOperators.NE, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] != b[bi + pos]) ? 1.0 : 0.0;
+		return out;
+	}
+
+
+ public static void scalarvectLessAdd(double[] a, double bval, double[]
c, int ai, int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += (a[j] < bval) ? 1 : 0;
+ }
+	/**
+	 * Vector API variant adding a less-than indicator to c:
+	 * {@code c[ci+k] += (a[ai+k] < bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param c output array, updated in place
+	 * @param ai start offset into a
+	 * @param ci start offset into c
+	 * @param len number of values to process
+	 */
+	public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and add it to c
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vIn = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vAcc = DoubleVector.fromArray(SPECIES, c, ci + pos);
+			VectorMask<Double> hit = vIn.compare(VectorOperators.LT, vRhs);
+			vAcc.add(vZero.blend(vOne, hit)).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			c[ci + pos] += (a[ai + pos] < bval) ? 1.0 : 0.0;
+	}
+
+
+	/**
+	 * Scalar loop writing a less-than indicator:
+	 * {@code out[k] = (a[ai+k] < bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] < bval) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Vector API variant writing a less-than indicator:
+	 * {@code out[k] = (a[ai+k] < bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectLessWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + pos)
+				.compare(VectorOperators.LT, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] < bval) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Scalar loop writing an element-wise less-than indicator:
+	 * {@code out[k] = (a[ai+k] < b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] < b[bi + k]) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Vector API variant writing an element-wise less-than indicator:
+	 * {@code out[k] = (a[ai+k] < b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vLhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vRhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> hit = vLhs.compare(VectorOperators.LT, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] < b[bi + pos]) ? 1.0 : 0.0;
+		return out;
+	}
+ public static void scalarvectLessequalAdd(double[] a, double bval,
double[] c, int ai, int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += (a[j] <= bval) ? 1 : 0;
+ }
+
+	/**
+	 * Vector API variant adding a less-or-equal indicator to c:
+	 * {@code c[ci+k] += (a[ai+k] <= bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param c output array, updated in place
+	 * @param ai start offset into a
+	 * @param ci start offset into c
+	 * @param len number of values to process
+	 */
+	public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and add it to c
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vIn = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vAcc = DoubleVector.fromArray(SPECIES, c, ci + pos);
+			VectorMask<Double> hit = vIn.compare(VectorOperators.LE, vRhs);
+			vAcc.add(vZero.blend(vOne, hit)).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			c[ci + pos] += (a[ai + pos] <= bval) ? 1.0 : 0.0;
+	}
+	/**
+	 * Scalar loop writing a less-or-equal indicator:
+	 * {@code out[k] = (a[ai+k] <= bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] <= bval) ? 1.0 : 0.0;
+		return out;
+	}
+	/**
+	 * Vector API variant writing a less-or-equal indicator:
+	 * {@code out[k] = (a[ai+k] <= bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + pos)
+				.compare(VectorOperators.LE, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] <= bval) ? 1.0 : 0.0;
+		return out;
+	}
+	/**
+	 * Scalar loop writing an element-wise less-or-equal indicator:
+	 * {@code out[k] = (a[ai+k] <= b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] <= b[bi + k]) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Vector API variant writing an element-wise less-or-equal indicator:
+	 * {@code out[k] = (a[ai+k] <= b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vLhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vRhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> hit = vLhs.compare(VectorOperators.LE, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] <= b[bi + pos]) ? 1.0 : 0.0;
+		return out;
+	}
+ public static void scalarvectGreaterAdd(double[] a, double bval,
double[] c, int ai, int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += (a[j] > bval) ? 1 : 0;
+ }
+
+	/**
+	 * Vector API variant adding a greater-than indicator to c:
+	 * {@code c[ci+k] += (a[ai+k] > bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param c output array, updated in place
+	 * @param ai start offset into a
+	 * @param ci start offset into c
+	 * @param len number of values to process
+	 */
+	public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and add it to c
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vIn = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vAcc = DoubleVector.fromArray(SPECIES, c, ci + pos);
+			VectorMask<Double> hit = vIn.compare(VectorOperators.GT, vRhs);
+			vAcc.add(vZero.blend(vOne, hit)).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			c[ci + pos] += (a[ai + pos] > bval) ? 1.0 : 0.0;
+	}
+	/**
+	 * Scalar loop writing a greater-than indicator:
+	 * {@code out[k] = (a[ai+k] > bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] > bval) ? 1.0 : 0.0;
+		return out;
+	}
+	/**
+	 * Vector API variant writing a greater-than indicator:
+	 * {@code out[k] = (a[ai+k] > bval) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a input values
+	 * @param bval scalar to compare against
+	 * @param ai start offset into a
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vRhs = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + pos)
+				.compare(VectorOperators.GT, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] > bval) ? 1.0 : 0.0;
+		return out;
+	}
+ public static void scalarvectMult2Add(double[] a, double[] c, int ai,
int ci, int len) {
+ for( int j = ai; j < ai+len; j++, ci++)
+ c[ci] += a[j] + a[j];
+ }
+
+	/**
+	 * Delegates to LibMatrixMult.vectMultiplyAdd with a constant factor of 2.0.
+	 * Presumably this computes {@code c[ci+k] += 2 * a[ai+k]} like scalarvectMult2Add;
+	 * the helper is not visible here - verify against LibMatrixMult.
+	 *
+	 * @param a input values
+	 * @param c output array, passed through to the helper
+	 * @param ai start offset into a
+	 * @param ci start offset into c
+	 * @param len number of values to process
+	 */
+	public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
+		final double factor = 2.0;
+		LibMatrixMult.vectMultiplyAdd(factor, a, c, ai, ci, len);
+	}
+
+	/**
+	 * Scalar loop writing an element-wise greater-than indicator:
+	 * {@code out[k] = (a[ai+k] > b[bi+k]) ? 1 : 0} for k in [0, len).
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] > b[bi + k]) ? 1.0 : 0.0;
+		return out;
+	}
+
+	/**
+	 * Vector API variant writing an element-wise greater-than indicator:
+	 * {@code out[k] = (a[ai+k] > b[bi+k]) ? 1 : 0} for k in [0, len).
+	 * Note: not in use, vector api implementation slower than scalar loop version.
+	 *
+	 * @param a left-hand input
+	 * @param b right-hand input
+	 * @param ai start offset into a
+	 * @param bi start offset into b
+	 * @param len number of values to process
+	 * @return vector obtained from allocVector(len, false), holding the len indicators
+	 */
+	public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//vectorized part: turn the comparison mask into 0/1 and store it
+		int pos = 0;
+		while( pos < vecEnd ) {
+			DoubleVector vLhs = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector vRhs = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> hit = vLhs.compare(VectorOperators.GT, vRhs);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//scalar rest, not aligned to vLen-blocks
+		for( ; pos < len; pos++ )
+			out[pos] = (a[ai + pos] > b[bi + pos]) ? 1.0 : 0.0;
+		return out;
+	}
+}