This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 1c26e2d299 [SYSTEMDS-3636] Improved ultra-sparse TSMM left w/ sparse
output
1c26e2d299 is described below
commit 1c26e2d299ace9f0b3b4974c9d8bac665fd9692e
Author: Christina Dionysio <[email protected]>
AuthorDate: Thu Dec 7 09:29:06 2023 +0100
[SYSTEMDS-3636] Improved ultra-sparse TSMM left w/ sparse output
This patch provides support for left-transposed ultra-sparse tsmm.
Similar to the implementation of the right-transpose ultra-sparse tsmm,
binary search is used to populate the upper triangular part of a sparse
output matrix.
Operation:
t(X) %*% X
Tests show an improvement of 17 to 30x, and this patch adds support for some
new cases that were not able to run before.
Closes #1955
---
.../sysds/runtime/matrix/data/LibMatrixMult.java | 117 +++++++++++++--------
.../FullMatrixMultiplicationTransposeSelfTest.java | 27 ++++-
2 files changed, 94 insertions(+), 50 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 0f96a30dad..80d0230da9 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -442,9 +442,9 @@ public class LibMatrixMult
//Timing time = new Timing(true);
//pre-processing
- ret.sparse = isSparseOutputTSMM(m1, leftTranspose);
+ ret.sparse = isSparseOutputTSMM(m1);
ret.allocateBlock();
- MatrixBlock m1t = isSparseOutputTSMM(m1, leftTranspose, true) ?
+ MatrixBlock m1t = isSparseOutputTSMM(m1, true) ?
LibMatrixReorg.transpose(m1) : null;
//core tsmm operation
@@ -484,9 +484,9 @@ public class LibMatrixMult
//Timing time = new Timing(true);
//pre-processing (no need to check isThreadSafe)
- ret.sparse = isSparseOutputTSMM(m1, leftTranspose);
+ ret.sparse = isSparseOutputTSMM(m1);
ret.allocateBlock();
- MatrixBlock m1t = isSparseOutputTSMM(m1, leftTranspose, true) ?
+ MatrixBlock m1t = isSparseOutputTSMM(m1, true) ?
LibMatrixReorg.transpose(m1, k) : null;
//core multi-threaded matrix mult computation
@@ -2506,39 +2506,60 @@ public class LibMatrixMult
}
private static void matrixMultTransposeSelfUltraSparse( MatrixBlock m1,
MatrixBlock ret, boolean leftTranspose, int rl, int ru ) {
- if( leftTranspose )
- throw new DMLRuntimeException("Left tsmm with sparse
output not supported");
-
- // Operation X%*%t(X), sparse input and output
- SparseBlock a = m1.sparseBlock;
- SparseBlock c = ret.sparseBlock;
+ SparseBlock a = m1.sparseBlock;
+ SparseBlock c = ret.sparseBlock;
int m = m1.rlen;
-
- final int blocksize = 256;
- for(int bi=rl; bi<ru; bi+=blocksize) { //blocking rows in X
- int bimin = Math.min(bi+blocksize, ru);
- for(int i=bi; i<bimin; i++) //preallocation
- if( !a.isEmpty(i) )
- c.allocate(i,
8*SparseRowVector.initialCapacity); //heuristic
- for(int bj=bi; bj<m; bj+=blocksize ) { //blocking cols
in t(X)
- int bjmin = Math.min(bj+blocksize, m);
- for(int i=bi; i<bimin; i++) { //rows in X
- if( a.isEmpty(i) ) continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- for(int j=Math.max(bj,i); j<bjmin; j++)
{ //cols in t(X)
- if( a.isEmpty(j) ) continue;
- int bpos = a.pos(j);
- int blen = a.size(j);
- int[] bix = a.indexes(j);
- double[] bvals = a.values(j);
-
- //compute sparse dot product
and append
- double v = dotProduct(avals,
aix, apos, alen, bvals, bix, bpos, blen);
- if( v != 0 )
- c.append(i, j, v);
+
+ if(leftTranspose) {
+ // Operation t(X)%*%X, sparse input and output
+ for(int i=0; i<m; i++)
+ c.allocate(i,
8*SparseRowVector.initialCapacity);
+ SparseRow[] sr = ((SparseBlockMCSR) c).getRows();
+ for( int r=0; r<a.numRows(); r++ ) {
+ if( a.isEmpty(r) ) continue;
+ final int alen = a.size(r);
+ final double[] avals = a.values(r);
+ final int apos = a.pos(r);
+ int[] aix = a.indexes(r);
+ int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl);
+ if(rlix>=0) {
+ int len = apos + alen;
+ for(int i = rlix; i < len && aix[i] <
ru; i++) {
+ for (int k = a.posFIndexGTE(r,
aix[i]); k < len; k++) {
+ sr[aix[i]].add(c.pos(k)
+ aix[k], avals[i] * avals[k]);
+ }
+ }
+ }
+ }
+ }
+ else {
+ // Operation X%*%t(X), sparse input and output
+ final int blocksize = 256;
+ for(int bi=rl; bi<ru; bi+=blocksize) { //blocking rows
in X
+ int bimin = Math.min(bi+blocksize, ru);
+ for(int i=bi; i<bimin; i++) //preallocation
+ if( !a.isEmpty(i) )
+ c.allocate(i,
8*SparseRowVector.initialCapacity); //heuristic
+ for(int bj=bi; bj<m; bj+=blocksize ) {
//blocking cols in t(X)
+ int bjmin = Math.min(bj+blocksize, m);
+ for(int i=bi; i<bimin; i++) { //rows in
X
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int j=Math.max(bj,i);
j<bjmin; j++) { //cols in t(X)
+ if( a.isEmpty(j) )
continue;
+ int bpos = a.pos(j);
+ int blen = a.size(j);
+ int[] bix =
a.indexes(j);
+ double[] bvals =
a.values(j);
+
+ //compute sparse dot
product and append
+ double v =
dotProduct(avals, aix, apos, alen, bvals, bix, bpos, blen);
+ if( v != 0 )
+ c.append(i, j,
v);
+ }
}
}
}
@@ -2547,12 +2568,18 @@ public class LibMatrixMult
//alternative matrixMultTransposeSelfUltraSparse2 w/ IKJ iteration
order and sparse updates
private static void matrixMultTransposeSelfUltraSparse2( MatrixBlock
m1, MatrixBlock m1t, MatrixBlock ret, boolean leftTranspose, int rl, int ru ) {
- if( leftTranspose )
- throw new DMLRuntimeException("Left tsmm with sparse
output not supported");
+ SparseBlock a;
+ SparseBlock b;
+ if( leftTranspose ) {
+ a = m1t.sparseBlock;
+ b = m1.sparseBlock;
+ }
+ else {
+ a = m1.sparseBlock;
+ b = m1t.sparseBlock;
+ }
// Operation X%*%t(X), sparse input and output
- SparseBlock a = m1.sparseBlock;
- SparseBlock b = m1t.sparseBlock;
SparseBlock c = ret.sparseBlock;
for(int i=rl; i<ru; i++) { //rows in X
if( a.isEmpty(i) ) continue;
@@ -4370,7 +4397,7 @@ public class LibMatrixMult
MatrixBlock ret = m1;
final int rlen = m1.rlen;
final int clen = m1.clen;
- boolean retSparse = isSparseOutputTSMM(m1, leftTranspose);
+ boolean retSparse = isSparseOutputTSMM(m1);
if( !leftTranspose && !retSparse && m1.sparse && rlen > 1) {
//X%*%t(X) SPARSE MATRIX
//directly via LibMatrixReorg in order to prevent
sparsity change
@@ -4489,16 +4516,16 @@ public class LibMatrixMult
return m2.clen < 4*1024 && sparseOut;
}
- public static boolean isSparseOutputTSMM(MatrixBlock m1, boolean
leftTranspose) {
- return isSparseOutputTSMM(m1, leftTranspose, false);
+ public static boolean isSparseOutputTSMM(MatrixBlock m1) {
+ return isSparseOutputTSMM(m1, false);
}
- public static boolean isSparseOutputTSMM(MatrixBlock m1, boolean
leftTranspose, boolean ultraSparse) {
+ public static boolean isSparseOutputTSMM(MatrixBlock m1, boolean
ultraSparse) {
double sp = m1.getSparsity();
double osp = OptimizerUtils.getMatMultSparsity(sp, sp, m1.rlen,
m1.clen, m1.rlen, false);
double sp_threshold = ultraSparse ?
MatrixBlock.ULTRA_SPARSITY_TURN_POINT :
MatrixBlock.ULTRA_SPARSITY_TURN_POINT2;
- return !leftTranspose && m1.sparse && osp < sp_threshold;
+ return m1.sparse && osp < sp_threshold;
}
public static boolean isOuterProductTSMM(int rlen, int clen, boolean
left) {
diff --git
a/src/test/java/org/apache/sysds/test/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
b/src/test/java/org/apache/sysds/test/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
index 3d1b4c239f..3261f10385 100644
---
a/src/test/java/org/apache/sysds/test/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
@@ -123,7 +123,12 @@ public class FullMatrixMultiplicationTransposeSelfTest
extends AutomatedTestBase
public void testVVRightSparseCP() {
runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT,
ExecType.CP, true);
}
-
+
+ @Test
+ public void testLeftUltraSparseCP() {
+ runTransposeSelfUltraSparseTest(MMTSJType.LEFT);
+ }
+
@Test
public void testRightUltraSparseCP() {
runTransposeSelfUltraSparseTest(MMTSJType.RIGHT);
@@ -282,15 +287,27 @@ public class FullMatrixMultiplicationTransposeSelfTest
extends AutomatedTestBase
TestUtils.compareMatrices(G, Gtt, 1e-16);
//single-threaded core operations
- MatrixBlock R11 = G.transposeSelfMatrixMultOperations(new
MatrixBlock(), MMTSJType.RIGHT);
- MatrixBlock R12 = LibMatrixMult.matrixMult(G, Gt);
+ MatrixBlock R11 = G.transposeSelfMatrixMultOperations(new
MatrixBlock(), type);
+ MatrixBlock R12;
+ if (type.isLeft()) {
+ R12 = LibMatrixMult.matrixMult(Gt, G);
+ }
+ else {
+ R12 = LibMatrixMult.matrixMult(G, Gt);
+ }
Assert.assertEquals(R11.getNonZeros(), R12.getNonZeros());
TestUtils.compareMatrices(R11, R12, 1e-8);
//multi-threaded core operations
int k = InfrastructureAnalyzer.getLocalParallelism();
- MatrixBlock R21 = G.transposeSelfMatrixMultOperations(new
MatrixBlock(), MMTSJType.RIGHT, k);
- MatrixBlock R22 = LibMatrixMult.matrixMult(G, Gt, k);
+ MatrixBlock R21 = G.transposeSelfMatrixMultOperations(new
MatrixBlock(), type, k);
+ MatrixBlock R22;
+ if (type.isLeft()) {
+ R22 = LibMatrixMult.matrixMult(Gt, G, k);
+ }
+ else {
+ R22 = LibMatrixMult.matrixMult(G, Gt, k);
+ }
Assert.assertEquals(R21.getNonZeros(), R22.getNonZeros());
TestUtils.compareMatrices(R21, R22, 1e-8);
}