[SYSTEMML-641] Performance core block matrix mult (cache/task config)

This patch makes two configuration changes to our core block matrix
multiplications (mostly applicable for dense):
(1) Cache-conscious selection of case dense-dense "skinny rhs matrix"
(for common L2 cache size)
(2) Balanced task size configurations (better cache-blocking, number of
tasks as multiples of degree of parallelism k, balanced distribution of
iterations over tasks) 

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bc223198
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bc223198
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bc223198

Branch: refs/heads/master
Commit: bc22319820f8e8f00770bad054475cc797781bed
Parents: 72e2166
Author: Matthias Boehm <mbo...@us.ibm.com>
Authored: Thu Apr 21 00:37:42 2016 -0700
Committer: Matthias Boehm <mbo...@us.ibm.com>
Committed: Thu Apr 21 10:34:05 2016 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 29 ++++++++++++++++----
 .../sysml/runtime/util/UtilFunctions.java       |  6 ++++
 2 files changed, 30 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 488e95a..3142b2c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -184,10 +184,10 @@ public class LibMatrixMult
                try {
                        ExecutorService pool = Executors.newFixedThreadPool( k 
);
                        ArrayList<MatrixMultTask> tasks = new 
ArrayList<MatrixMultTask>();
-                       int nk = pm2 ? k : Math.max(Math.min(8*k,ru/8), k);
-                       int blklen = (int)(Math.ceil((double)ru/nk));
-                       for( int i=0; i<nk & i*blklen<ru; i++ )
-                               tasks.add(new MatrixMultTask(m1, m2, ret, tm2, 
pm2, i*blklen, Math.min((i+1)*blklen, ru)));
+                       int nk = pm2 ? k : 
UtilFunctions.roundToNext(Math.min(8*k,ru/32), k);
+                       ArrayList<Integer> blklens = getBalancedBlockSizes(ru, 
nk);
+                       for( int i=0, rl=0; i<blklens.size(); 
rl+=blklens.get(i), i++ )
+                               tasks.add(new MatrixMultTask(m1, m2, ret, tm2, 
pm2, rl, rl+blklens.get(i)));
                        //execute tasks
                        List<Future<Object>> taskret = pool.invokeAll(tasks);   
                        pool.shutdown();
@@ -3945,7 +3945,8 @@ public class LibMatrixMult
        {
                //transpose if dense-dense, skinny rhs matrix (not vector), and 
memory guarded by output 
                return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-                               && m1.rlen>m2.clen && m2.rlen > 64 && m2.clen > 
1 && m2.clen < 64);
+                               && m1.rlen > m2.clen && m2.rlen > 64 && m2.clen 
> 1 && m2.clen < 64
+                               && 8*m2.rlen*m2.clen < 256*1024 ); //rhs fits 
in L2 cache
        }
        
        /**
@@ -4079,6 +4080,24 @@ public class LibMatrixMult
                
        }
        
+       /**
+        * 
+        * @param len
+        * @param k
+        * @return
+        */
+       private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) 
{
+               ArrayList<Integer> ret = new ArrayList<Integer>();
+               int base = len / k;
+               int rest = len % k;
+               for( int i=0; i<k; i++ ) {
+                       int val = base + (i<rest?1:0);
+                       if( val > 0 )
+                               ret.add(val);
+               }       
+               return ret; 
+       }
+       
        /////////////////////////////////////////////////////////
        // Task Implementations for Multi-Threaded Operations  //
        /////////////////////////////////////////////////////////

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java 
b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index 21078c4..241b178 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -254,6 +254,12 @@ public class UtilFunctions
                else
                        return ((Integer)obj).intValue();
        }
+       
+       public static int roundToNext(int val, int factor) {
+               //round up to next non-zero multiple of factor
+               int pval = Math.max(val, factor);
+               return ((pval + factor-1) / factor) * factor;
+       }
 
        /**
         * 

Reply via email to