[SYSTEMML-641] Performance core block matrix mult (cache/task config) This patch makes two configuration changes to our core block matrix multiplications (mostly applicable for dense): (1) Cache-conscious selection of case dense-dense "skinny rhs matrix" (for common L2 cache size) (2) Balanced task size configurations (better cache-blocking, number of tasks as multiples of degree of parallelism k, balanced distribution of iterations over tasks)
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bc223198 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bc223198 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bc223198 Branch: refs/heads/master Commit: bc22319820f8e8f00770bad054475cc797781bed Parents: 72e2166 Author: Matthias Boehm <mbo...@us.ibm.com> Authored: Thu Apr 21 00:37:42 2016 -0700 Committer: Matthias Boehm <mbo...@us.ibm.com> Committed: Thu Apr 21 10:34:05 2016 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixMult.java | 29 ++++++++++++++++---- .../sysml/runtime/util/UtilFunctions.java | 6 ++++ 2 files changed, 30 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index 488e95a..3142b2c 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -184,10 +184,10 @@ public class LibMatrixMult try { ExecutorService pool = Executors.newFixedThreadPool( k ); ArrayList<MatrixMultTask> tasks = new ArrayList<MatrixMultTask>(); - int nk = pm2 ? k : Math.max(Math.min(8*k,ru/8), k); - int blklen = (int)(Math.ceil((double)ru/nk)); - for( int i=0; i<nk & i*blklen<ru; i++ ) - tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2, i*blklen, Math.min((i+1)*blklen, ru))); + int nk = pm2 ? k : UtilFunctions.roundToNext(Math.min(8*k,ru/32), k); + ArrayList<Integer> blklens = getBalancedBlockSizes(ru, nk); + for( int i=0, rl=0; i<blklens.size(); rl+=blklens.get(i), i++ ) + tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2, rl, rl+blklens.get(i))); //execute tasks List<Future<Object>> taskret = pool.invokeAll(tasks); pool.shutdown(); @@ -3945,7 +3945,8 @@ public class LibMatrixMult { //transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse - && m1.rlen>m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64); + && m1.rlen > m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64 + && 8*m2.rlen*m2.clen < 256*1024 ); //rhs fits in L2 cache } /** @@ -4079,6 +4080,24 @@ public class LibMatrixMult } + /** + * + * @param len + * @param k + * @return + */ + private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) { + ArrayList<Integer> ret = new ArrayList<Integer>(); + int base = len / k; + int rest = len % k; + for( int i=0; i<k; i++ ) { + int val = base + (i<rest?1:0); + if( val > 0 ) + ret.add(val); + } + return ret; + } + ///////////////////////////////////////////////////////// // Task Implementations for Multi-Threaded Operations // ///////////////////////////////////////////////////////// http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java index 21078c4..241b178 100644 --- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java @@ -254,6 +254,12 @@ public class UtilFunctions else return ((Integer)obj).intValue(); } + + public static int roundToNext(int val, int factor) { + //round up to next non-zero multiple of factor + int pval = Math.max(val, factor); + return ((pval + factor-1) / factor) * factor; + } /** *