This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 045f9e0f62f5f9df0d997d8bbd8496eea5d141f4 Author: Matthias Boehm <[email protected]> AuthorDate: Wed Apr 3 19:50:35 2024 +0200 [SYSTEMDS-3691] Multi-threaded dot-product matrix multiplication So far, dense dot-products were always executed in a single-threaded manner despite going through the multi-threaded code path because only a single task was created (single row in lhs matrix). We now use the existing logic for parallelizing over the common dimension instead. For pageRank on the europe_osm road network graph, the involved dot product (1x50912018 mmult 50912018x1) improved from ~50ms to 7ms on a machine with 24 pcores / 48 vcores. --- .../java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 780afdad67..d71b6d479f 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -1019,7 +1019,10 @@ public class LibMatrixMult if( m==1 && n==1 ) { //DOT PRODUCT double[] avals = a.valuesAt(0); double[] bvals = b.valuesAt(0); - c.set(0, 0, dotProduct(avals, bvals, cd)); + if( ru > m ) //pm2r - parallelize over common dim + c.set(0, 0, dotProduct(avals, bvals, rl, rl, ru-rl)); + else + c.set(0, 0, dotProduct(avals, bvals, cd)); } else if( n>1 && cd == 1 ) { //OUTER PRODUCT double[] avals = a.valuesAt(0); @@ -4460,8 +4463,8 @@ public class LibMatrixMult private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) { //parallelize over rows in rhs matrix if number of rows in lhs/output is very small double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory(); - return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && !(m1.isUltraSparse()||m2.isUltraSparse())) - || (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && 
m2.clen>1 && m2.rlen > m1.rlen + return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && !(m1.isUltraSparse()||m2.isUltraSparse())) + || (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.rlen > m1.rlen && ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse && (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) ); }
