Repository: systemml
Updated Branches:
  refs/heads/master 8df0697e0 -> 65e2a46d2


[SYSTEMML-1857] Performance codegen outer operators (degree of par)

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06fa73ac
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06fa73ac
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06fa73ac

Branch: refs/heads/master
Commit: 06fa73acc4639dd446c2f36c62a956803c247753
Parents: 8df0697
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Tue Aug 22 19:26:00 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Wed Aug 23 12:41:45 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/codegen/SpoofCellwise.java    |  1 -
 .../runtime/codegen/SpoofMultiAggregate.java    |  1 -
 .../sysml/runtime/codegen/SpoofOperator.java    |  4 ++
 .../runtime/codegen/SpoofOuterProduct.java      | 47 ++++++++++++++------
 .../sysml/runtime/codegen/SpoofRowwise.java     |  1 -
 5 files changed, 38 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index 575043b..5e22406 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -51,7 +51,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public abstract class SpoofCellwise extends SpoofOperator implements 
Serializable
 {
        private static final long serialVersionUID = 3442528770573293590L;
-       private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M 
elements
        
        public enum CellType {
                NO_AGG,

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index ae3c353..4aa91bb 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -48,7 +48,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public abstract class SpoofMultiAggregate extends SpoofOperator implements 
Serializable
 {
        private static final long serialVersionUID = -6164871955591089349L;
-       private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M 
elements
        
        private final AggOp[] _aggOps;
        private final boolean _sparseSafe;

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index 3ea9246..fea51e3 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -37,6 +37,10 @@ public abstract class SpoofOperator implements Serializable
        private static final long serialVersionUID = 3834006998853573319L;
        private static final Log LOG = 
LogFactory.getLog(SpoofOperator.class.getName());
        
+       protected static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 
1M elements
+       protected static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 
2 MFLOP
+       
+       
        public abstract MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalars, MatrixBlock out)
                throws DMLRuntimeException;
        

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index bc99859..1ec873f 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -112,6 +112,9 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                if( inputs.get(0).isEmptyBlock(false) )
                        return new DoubleObject(0);
                
+               if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() 
< PAR_MINFLOP_THRESHOLD )
+                       return execute(inputs, scalarObjects); //sequential
+               
                //input preparation
                double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 
2, true, false));
                double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, 
true));
@@ -121,15 +124,14 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                final int m = inputs.get(0).getNumRows();
                final int n = inputs.get(0).getNumColumns();
                final int k = inputs.get(1).getNumColumns(); // rank
+               final long nnz = inputs.get(0).getNonZeros();
                double sum = 0;
                
                try 
                {
                        ExecutorService pool = Executors.newFixedThreadPool(k);
                        ArrayList<ParOuterProdAggTask> tasks = new 
ArrayList<ParOuterProdAggTask>();
-                       //create tasks (for wdivmm-left, parallelization over 
columns;
-                       //for wdivmm-right, parallelization over rows; both 
ensure disjoint results)
-                       int numThreads2 = 
UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+                       int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, 
k, numThreads);
                        int blklen = (int)(Math.ceil((double)m/numThreads2));
                        for( int i=0; i<numThreads2 & i*blklen<m; i++ )
                                tasks.add(new 
ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, 
@@ -259,6 +261,9 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        out.allocateDenseBlock();
                }       
                
+               if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() 
< PAR_MINFLOP_THRESHOLD )
+                       return execute(inputs, scalarObjects, out); //sequential
+               
                //input preparation
                double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 
2, true, false));
                double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, 
true));
@@ -268,6 +273,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                final int m = inputs.get(0).getNumRows();
                final int n = inputs.get(0).getNumColumns();
                final int k = inputs.get(1).getNumColumns(); // rank
+               final long nnz = inputs.get(0).getNonZeros();
                
                MatrixBlock a = inputs.get(0);
                
@@ -284,21 +290,24 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                                        int numCG = 
((CompressedMatrixBlock)a).getNumColGroups();
                                        int blklen = 
(int)(Math.ceil((double)numCG/numThreads));
                                        for( int j=0; j<numThreads & 
j*blklen<numCG; j++ )
-                                               tasks.add(new ParExecTask(a, 
ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType,  0, m, j*blklen, 
Math.min((j+1)*blklen, numCG)));
+                                               tasks.add(new ParExecTask(a, 
ab[0], ab[1], b, scalars, out, m, n, k,
+                                                       _outerProductType,  0, 
m, j*blklen, Math.min((j+1)*blklen, numCG)));
                                }
                                else {
                                        //parallelize over column partitions
                                        int blklen = 
(int)(Math.ceil((double)n/numThreads));
                                        for( int j=0; j<numThreads & 
j*blklen<n; j++ )
-                                               tasks.add(new ParExecTask(a, 
ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType,  0, m, j*blklen, 
Math.min((j+1)*blklen, n)));
+                                               tasks.add(new ParExecTask(a, 
ab[0], ab[1], b, scalars, out, m, n, k,
+                                                       _outerProductType,  0, 
m, j*blklen, Math.min((j+1)*blklen, n)));
                                }
                        }
                        else { //right or cell-wise
                                //parallelize over row partitions
-                               int numThreads2 = 
UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+                               int numThreads2 = getPreferredNumberOfTasks(m, 
n, nnz, k, numThreads);
                                int blklen = 
(int)(Math.ceil((double)m/numThreads2));
                                for( int i=0; i<numThreads2 & i*blklen<m; i++ )
-                                       tasks.add(new ParExecTask(a, ab[0], 
ab[1], b, scalars, out, m, n, k, _outerProductType, i*blklen, 
Math.min((i+1)*blklen,m), 0, n));
+                                       tasks.add(new ParExecTask(a, ab[0], 
ab[1], b, scalars, out, m, n, k,
+                                               _outerProductType, i*blklen, 
Math.min((i+1)*blklen,m), 0, n));
                        }
                        List<Future<Long>> taskret = pool.invokeAll(tasks);
                        pool.shutdown();
@@ -320,6 +329,13 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                return out;
        }
        
+       private static int getPreferredNumberOfTasks(int m, int n, long nnz, 
int rank, int k) {
+               //compute number of tasks nk in range [k, 8k]
+               int base = (int) Math.min(Math.min(8*k, m/32),
+                       Math.ceil((double)2*nnz*rank/PAR_MINFLOP_THRESHOLD));
+               return UtilFunctions.roundToNext(base, k);
+       }
+       
        private void executeDense(double[] a, double[] u, double[] v, 
double[][] b, double[] scalars,
                double[] c, int m, int n, int k, OutProdType type, int rl, int 
ru, int cl, int cu )
        {
@@ -427,6 +443,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                if( !out.isInSparseFormat() ) //DENSE
                {
                        double[] c = out.getDenseBlock();
+                       double tmp = 0;
                        for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
                                int bimin = Math.min(ru, bi+blocksizeIJ);
                                //prepare starting indexes for block row
@@ -441,16 +458,20 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                                                int[] wix = sblock.indexes(i);
                                                double[] wval = 
sblock.values(i);
                                                int index = wpos + curk[i-bi];
-                                               for( ; index<wpos+wlen && 
wix[index]<bjmin; index++ ) {
-                                                       if(type == 
OutProdType.CELLWISE_OUTER_PRODUCT)
-                                                               c[wix[index]] = 
genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, 
wix[index] );
-                                                       else
-                                                               c[0] += 
genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, 
wix[index]);
-                                               }
+                                               if( type == 
OutProdType.CELLWISE_OUTER_PRODUCT )
+                                                       for( ; index<wpos+wlen 
&& wix[index]<bjmin; index++ )
+                                                               c[wix[index]] = 
genexecCellwise( wval[index], 
+                                                                       u, uix, 
v, wix[index]*k, b, scalars, m, n, k, i, wix[index] );
+                                               else
+                                                       for( ; index<wpos+wlen 
&& wix[index]<bjmin; index++ )
+                                                               tmp += 
genexecCellwise( wval[index], 
+                                                                       u, uix, 
v, wix[index]*k, b, scalars, m, n, k, i, wix[index]);
                                                curk[i-bi] = index - wpos;
                                        }
                                }
                        }
+                       if( type != OutProdType.CELLWISE_OUTER_PRODUCT )
+                               c[0] = tmp;
                }
                else //SPARSE
                {

http://git-wip-us.apache.org/repos/asf/systemml/blob/06fa73ac/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index e2d9f41..f1cef34 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -44,7 +44,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public abstract class SpoofRowwise extends SpoofOperator
 {
        private static final long serialVersionUID = 6242910797139642998L;
-       private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M 
elements
        
        public enum RowType {
                NO_AGG,    //no aggregation

Reply via email to