Repository: systemml
Updated Branches:
  refs/heads/master f485ab292 -> b84a4933c


[SYSTEMML-1752] Cache-conscious mmchain block matrix multiplication

The fused mmchain matrix multiply for patterns such as t(X) %*% (w * (X
%*% v)) uses row-wise dotProduct and vectMultAdd operations, which works
very well for the common case of tall&skinny matrices where individual
rows fit into L1 cache. However, for graph and text scenarios with wide
matrices this leads to cache thrashing on the input and output vectors.

This patch introduces cache-conscious dense mmchain operations by
working over fragments of input and output vectors. On a scenario with
dense matrices of 2M features, this improved the multi-threaded
performance (in a parfor context) by almost 2x due to less cache
thrashing. At the same time, this patch does not negatively affect the
performance of the common case with tall and skinny matrices.

Furthermore, for sparse, we no longer use row blocks but single rows
(which reduces cache memory requirements) because these sparse row
blocks can anyway not be used for vectMultAdd4 operations. However, a
full support for cache-conscious sparse mmchain operations is still
pending, but initial experiments with sparsity-dependent block sizes were
inconclusive from a performance perspective and showed issues of
numerical stability.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/8b83ab5f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/8b83ab5f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/8b83ab5f

Branch: refs/heads/master
Commit: 8b83ab5f4611c27b605af45ee40e6e0b5cd801f9
Parents: 4a6165b
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Sat Jul 8 02:23:03 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Sat Jul 8 22:32:03 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 86 +++++++++-----------
 1 file changed, 39 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/8b83ab5f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index b5429c3..5996a51 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -1565,26 +1565,35 @@ public class LibMatrixMult
                
                //temporary array for cache blocking
                //(blocksize chosen to fit b+v in L2 (256KB) for default 1k 
blocks)
-               final int blocksize = 24; // constraint: factor of 4
-               double[] tmp = new double[blocksize];
-                       
+               final int blocksizeI = 24; // constraint: factor of 4
+               final int blocksizeJ = 1024;
+               double[] tmp = new double[blocksizeI];
+               
                //blockwise mmchain computation
-               final int bn = ru - ru % blocksize; //rl blocksize aligned
-               for( int bi=rl; bi < bn; bi+=blocksize ) 
+               final int bn = ru - ru % blocksizeI; //rl blocksize aligned
+               for( int bi=rl; bi < bn; bi+=blocksizeI ) 
                {
                        //compute 1st matrix-vector for row block
-                       for( int j=0, aix=bi*cd; j < blocksize; j++, aix+=cd)
-                               tmp[j] = dotProduct(a, b, aix, 0, cd);
+                       Arrays.fill(tmp, 0);
+                       for( int bj=0; bj<cd; bj+=blocksizeJ ) {
+                               int bjmin = Math.min(cd-bj, blocksizeJ);
+                               for( int i=0, aix=bi*cd+bj; i < blocksizeI; 
i++, aix+=cd)
+                                       tmp[i] += dotProduct(a, b, aix, bj, 
bjmin);
+                       }
                        
                        //multiply/subtract weights (in-place), if required
                        if( weights ) 
-                               vectMultiply(w, tmp, bi, 0, blocksize); 
+                               vectMultiply(w, tmp, bi, 0, blocksizeI);        
                        else if( weights2 )
-                               vectSubtract(w, tmp, bi, 0, blocksize);
+                               vectSubtract(w, tmp, bi, 0, blocksizeI);
                                
                        //compute 2nd matrix vector for row block and aggregate
-                       for (int j=0, aix=bi*cd; j < blocksize; j+=4, aix+=4*cd)
-                               vectMultiplyAdd4(tmp[j], tmp[j+1], tmp[j+2], 
tmp[j+3], a, c, aix, aix+cd, aix+2*cd, aix+3*cd, 0, cd);
+                       for( int bj = 0; bj<cd; bj+=blocksizeJ ) {
+                               int bjmin = Math.min(cd-bj, blocksizeJ);
+                               for (int i=0, aix=bi*cd+bj; i<blocksizeI; i+=4, 
aix+=4*cd)
+                                       vectMultiplyAdd4(tmp[i], tmp[i+1], 
tmp[i+2], tmp[i+3],
+                                               a, c, aix, aix+cd, aix+2*cd, 
aix+3*cd, bj, bjmin);
+                       }
                }
                
                //compute rest (not aligned to blocksize)
@@ -1592,7 +1601,7 @@ public class LibMatrixMult
                        double val = dotProduct(a, b, aix, 0, cd);
                        val *= (weights) ? w[i] : 1;
                        val -= (weights2) ? w[i] : 0;
-                       vectMultiplyAdd(val, a, c, aix, 0, cd);                 
        
+                       vectMultiplyAdd(val, a, c, aix, 0, cd);
                }
        }
 
@@ -1605,42 +1614,25 @@ public class LibMatrixMult
                boolean weights = (ct == ChainType.XtwXv);
                boolean weights2 = (ct == ChainType.XtXvy);
                
-               //temporary array for cache blocking
-               //(blocksize chosen to fit b+v in L2 (256KB) for default 1k 
blocks)
-               final int blocksize = 24;
-               double[] tmp = new double[blocksize];
-               
-               //blockwise mmchain computation
-               for( int bi=rl; bi < ru; bi+=blocksize ) 
-               {
-                       //reset row block intermediate
-                       int tmplen = Math.min(blocksize, ru-bi);
-
-                       //compute 1st matrix-vector for row block
-                       for( int j=0; j < tmplen; j++) {
-                               if( !a.isEmpty(bi+j) ) {
-                                       int apos = a.pos(bi+j);
-                                       int alen = a.size(bi+j);                
                
-                                       tmp[j] = dotProduct(a.values(bi+j), b, 
a.indexes(bi+j), apos, 0, alen);                                                
 
-                               }
-                       }
+               //row-wise mmchain computation
+               for( int i=rl; i < ru; i++ ) {
+                       if( a.isEmpty(i) || (weights && w[i]==0) )
+                               continue;
+                       int apos = a.pos(i);
+                       int alen = a.size(i);
+                       int[] aix = a.indexes(i);
+                       double[] avals = a.values(i);
                        
-                       //multiply weights (in-place), if required
-                       if( weights ) 
-                               vectMultiply(w, tmp, bi, 0, tmplen);    
-                       else if( weights2 )
-                               vectSubtract(w, tmp, bi, 0, tmplen);
-               
-                       //compute 2nd matrix vector for row block and aggregate
-                       for( int j=0; j < tmplen; j++) {
-                               if( !a.isEmpty(bi+j) && tmp[j] != 0 ) {
-                                       int apos = a.pos(bi+j);
-                                       int alen = a.size(bi+j);
-                                       int[] aix = a.indexes(bi+j);
-                                       double[] avals = a.values(bi+j);        
        
-                                       vectMultiplyAdd(tmp[j], avals, c, aix, 
apos, 0, alen);                                                  
-                               }
-                       }
+                       //compute 1st matrix-vector dot product
+                       double val = dotProduct(avals, b, aix, apos, 0, alen);
+                       
+                       //multiply/subtract weights, if required
+                       val *= (weights) ? w[i] : 1;
+                       val -= (weights2) ? w[i] : 0;
+                       
+                       //compute 2nd matrix vector and aggregate
+                       if( val != 0 )
+                               vectMultiplyAdd(val, avals, c, aix, apos, 0, 
alen);
                }
        }
 

Reply via email to