This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
     new c940502658 [SYSTEMDS-3773] Fix two matmult kernels w/ parallelization over rhs
c940502658 is described below

commit c94050265827deab65200c1b2ba4bad60f6d6f96
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Sun Sep 22 15:50:25 2024 +0200

    [SYSTEMDS-3773] Fix two matmult kernels w/ parallelization over rhs
    
    The matmult kernel library parallelizes by default over rows in the
    left-hand-side matrix, but for specific size regimes, switches to a
    parallelization over the rows or columns of the right-hand-side.
    The recently added full-coverage tests found two bugs, which this
    patch fixes:
    a) dense-dense matrix-vector multiplication w/ large vectors
       --> extended implementation to support the other parallelization
    b) sparse-dense vector-vector dot product
       --> disable parallelization for this specific case to use the
           existing kernel without binary searches
---
 .../sysds/runtime/matrix/data/LibMatrixMult.java   | 21 +++++++++++++--------
 .../component/matrix/MatrixMultiplyKernelTest.java | 21 +++++++++++++--------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 328b43d6ca..c4eddd90fa 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -279,7 +279,7 @@ public class LibMatrixMult
 		boolean pm2r = !ultraSparse && !sparse && checkParMatrixMultRightInputRows(m1, m2, k);
 		boolean pm2c = !ultraSparse && checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
 		int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen;
-		
+
 		// core multi-threaded matrix mult computation
 		// (currently: always parallelization over number of rows)
 		final ExecutorService pool = CommonThreadPool.get(k);
@@ -1052,7 +1052,7 @@ public class LibMatrixMult
 			matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
 		}
 		else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
-			matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
+			matrixMultDenseDenseMVTallRHS(a, b, c, pm2, cd, rl, ru);
 		}
 		else if( pm2 && m==1 ) { //VECTOR-MATRIX
 			matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
@@ -1075,15 +1075,20 @@ public class LibMatrixMult
 			cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, cd);
 	}
 	
-	private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
+	private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, boolean pm2, int cd, int rl, int ru) {
 		final int blocksizeI = 32;
 		final int blocksizeK = 2*1024; //16KB vector blocks (L1)
 		double[] bvals = b.valuesAt(0);
 		double[] cvals = c.valuesAt(0);
-		for( int bi=rl; bi<ru; bi+=blocksizeI ) {
-			int bimin = Math.min(bi+blocksizeI, ru);
-			for( int bk=0; bk<cd; bk+=blocksizeK ) {
-				int bkmin = Math.min(bk+blocksizeK, cd);
+		// setup bounds according to parallelization strategy
+		// (default: rows in lhs, pm2: rows in rhs)
+		int cl = pm2 ? rl : 0, cu = pm2 ? ru : cd;
+		int rl2 = pm2 ? 0 : rl, ru2 = pm2 ? a.numRows() : ru;
+		// matrix-vector multiplication with cache blocking of vector
+		for( int bi=rl2; bi<ru2; bi+=blocksizeI ) {
+			int bimin = Math.min(bi+blocksizeI, ru2);
+			for( int bk=cl; bk<cu; bk+=blocksizeK ) {
+				int bkmin = Math.min(bk+blocksizeK, cu);
 				for( int i=bi; i<bimin; i++)
 					cvals[i] += dotProduct(a.values(i), bvals, a.pos(i,bk), bk, bkmin-bk);
 			}
@@ -4349,7 +4354,7 @@ public class LibMatrixMult
 	private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
 		//parallelize over rows in rhs matrix if number of rows in lhs/output is very small
 		double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
-		return (m1.rlen==1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
+		return (m1.rlen==1 && !(m1.sparse && m2.clen==1) && !(m1.isUltraSparse()||m2.isUltraSparse()))
 			|| (m1.rlen<=16 && m2.rlen > m1.rlen && (!m1.sparse | m2.clen > 1)
 			&& ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse
 			&& (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java
index 9975213762..1d51e6fd8e 100644
--- a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java
+++ b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java
@@ -51,10 +51,15 @@ public class MatrixMultiplyKernelTest {
 		testMatrixMultiply(MIN_PAR, 16, 1, 1, 1);
 	}
 	
-//	@Test //FIXME
-//	public void testDenseDenseMatrixLargeVector() {
-//		testMatrixMultiply(16, MIN_PAR, 1, 1, 1);
-//	}
+	@Test //parallelization over rows in lhs
+	public void testDenseDenseMatrixLargeVector() {
+		testMatrixMultiply(4000, 3000, 1, 1, 1);
+	}
+	
+	@Test //parallelization over rows in rhs
+	public void testDenseDenseMatrixLargeVectorPm2() {
+		testMatrixMultiply(16, MIN_PAR, 1, 1, 1);
+	}
 	
 	@Test
 	public void testDenseDenseVectorMatrix() {
@@ -90,10 +95,10 @@ public class MatrixMultiplyKernelTest {
 	
 	// sparse-dense kernels
 	
-//	@Test FIXME
-//	public void testSparseDenseDotProduct() {
-//		testMatrixMultiply(1, MIN_PAR, 1, 0.1, 1);
-//	}
+	@Test
+	public void testSparseDenseDotProduct() {
+		testMatrixMultiply(1, MIN_PAR, 1, 0.1, 1);
+	}
 	
 	@Test
 	public void testSparseDenseMatrixSmallVector() {
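
For readers less familiar with the kernel library, the following is a minimal, self-contained
sketch of the two points above. It uses plain double[] arrays and a hypothetical class name,
not the SystemDS DenseBlock/SparseBlock API, and is only meant to illustrate the index
arithmetic of fix a) and the reasoning behind fix b); it is not the patched implementation.

// Minimal sketches illustrating the two fixes; plain arrays and a
// hypothetical class name, not the SystemDS DenseBlock/SparseBlock API.
public class MatMultKernelSketches {

	// Fix a): cache-blocked matrix-vector multiply c = A %*% b, where the pm2
	// flag decides what the task range [rl,ru) partitions: the rows of the
	// lhs A (default) or the shared dimension cd, i.e., the rows of the rhs
	// vector. A is row-major with m rows and cd columns.
	public static void matrixVectorMult(double[] a, double[] b, double[] c,
		int m, int cd, boolean pm2, int rl, int ru)
	{
		final int blocksizeI = 32;      //row blocks
		final int blocksizeK = 2*1024;  //16KB vector blocks (L1)
		//bounds per parallelization strategy (default: rows in lhs, pm2: rows in rhs)
		int cl = pm2 ? rl : 0, cu = pm2 ? ru : cd;
		int rl2 = pm2 ? 0 : rl, ru2 = pm2 ? m : ru;
		for( int bi=rl2; bi<ru2; bi+=blocksizeI ) {
			int bimin = Math.min(bi+blocksizeI, ru2);
			for( int bk=cl; bk<cu; bk+=blocksizeK ) {
				int bkmin = Math.min(bk+blocksizeK, cu);
				for( int i=bi; i<bimin; i++ )
					for( int k=bk; k<bkmin; k++ )
						c[i] += a[i*cd+k] * b[k];
			}
		}
	}

	// Fix b): a sparse-dense dot product over one sparse row (given here as
	// column-index/value arrays with annz non-zeros) and a dense vector can
	// iterate the non-zeros directly; restricting it to a column sub-range,
	// as parallelization over the rhs would require, needs a binary search
	// for the first qualifying non-zero, hence the single-threaded path.
	public static double dotProductSparseDense(int[] aix, double[] avals,
		int annz, double[] b)
	{
		double sum = 0;
		for( int j=0; j<annz; j++ )
			sum += avals[j] * b[aix[j]];
		return sum;
	}
}

Note that in the pm2 case each task computes partial dot products for all output rows over
its slice of the shared dimension, so the multi-threaded driver still has to sum the
per-task partial results; the sketch above only shows the single-threaded index logic.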