Repository: systemml
Updated Branches:
  refs/heads/master 129710a01 -> 0325da7de


[SYSTEMML-1857] Performance codegen operator skeletons (all templates)

This patch makes the following performance improvements to the codegen
operator skeletons (cell, row, outer, magg):

(1) Improved decision on multi-threading (all templates): Instead of
considering the number of cells in the main input matrix, we now
consider the total number of non-zeros of all inputs. In small-data
scenarios, this achieved performance improvements of >2x.

(2) Row outer products (row): For special row outer products as used in
Kmeans and multi-class MLogreg, we now flip the outer product
computation from a-b to b-a if a is substantially longer than b (more
than 64x, see isFlipOuter) to exploit vectorized primitives. We
compensate for this flipped representation via a single transpose after
aggregation. This improved the performance of t(X)%*%X%*%v over
X:=1Mx1K, v:=1Kx2 from 470ms to 340ms.

(3) Sparse driver selection (magg): For multi-aggregates, the sparse
driver selection is especially important, because a wrong ordering can
render the entire operator sparse-unsafe. For patterns such as
sum(X*Y)+sum(X*Z), we now systematically select the shared sparse input
X as the sparse driver. This led to performance improvements of >6x.

Finally, this patch also fixes special cases of row aggregate
compilation (e.g., equal number of rows and columns), where we
previously generated invalid code.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0325da7d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0325da7d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0325da7d

Branch: refs/heads/master
Commit: 0325da7de21bbe8ff40e4258b8d8a50497902174
Parents: 129710a
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Mon Aug 21 22:27:07 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Mon Aug 21 22:27:07 2017 -0700

----------------------------------------------------------------------
 .../hops/codegen/template/TemplateCell.java     | 14 ++++-
 .../hops/codegen/template/TemplateMultiAgg.java | 40 +++++++++++++-
 .../hops/codegen/template/TemplateUtils.java    |  5 +-
 .../runtime/codegen/LibSpoofPrimitives.java     | 57 +++++++++++++-------
 .../sysml/runtime/codegen/SpoofCellwise.java    | 11 ++--
 .../runtime/codegen/SpoofMultiAggregate.java    | 11 ++--
 .../sysml/runtime/codegen/SpoofOperator.java    | 16 +++---
 .../runtime/codegen/SpoofOuterProduct.java      | 12 +++--
 .../sysml/runtime/codegen/SpoofRowwise.java     | 48 ++++++++++++-----
 .../instructions/cp/SpoofCPInstruction.java     |  3 +-
 .../instructions/spark/SpoofSPInstruction.java  |  8 +--
 11 files changed, 163 insertions(+), 62 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
index ad4589b..b7e2a2d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
@@ -339,15 +339,25 @@ public class TemplateCell extends TemplateBase
         */
        public static class HopInputComparator implements Comparator<Hop> 
        {
+               private final Hop _driver;
+               
+               public HopInputComparator() {
+                       this(null);
+               }
+               
+               public HopInputComparator(Hop driver) {
+                       _driver = driver;
+               }
+               
                @Override
                public int compare(Hop h1, Hop h2) {
                        long ncells1 = h1.getDataType()==DataType.SCALAR ? 
Long.MIN_VALUE : 
                                h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : 
Long.MAX_VALUE;
                        long ncells2 = h2.getDataType()==DataType.SCALAR ? 
Long.MIN_VALUE :
                                h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : 
Long.MAX_VALUE;
-                       if( ncells1 > ncells2 ) 
+                       if( ncells1 > ncells2 || h1 == _driver )
                                return -1;
-                       else if( ncells1 < ncells2) 
+                       else if( ncells1 < ncells2 || h2 == _driver)
                                return 1;
                        return Long.compare(
                                h1.dimsKnown(true) ? h1.getNnz() : ncells1, 

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
index 0c2886e..e9f4cd2 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
@@ -22,16 +22,21 @@ package org.apache.sysml.hops.codegen.template;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Collectors;
 
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.codegen.cplan.CNode;
 import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.codegen.cplan.CNodeMultiAgg;
 import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.Pair;
 
 public class TemplateMultiAgg extends TemplateCell 
@@ -86,9 +91,10 @@ public class TemplateMultiAgg extends TemplateCell
                //reorder inputs (ensure matrices/vectors come first) and prune 
literals
                //note: we order by number of cells and subsequently sparsity 
to ensure
                //that sparse inputs are used as the main input w/o unnecessary 
conversion
+               Hop shared = getSparseSafeSharedInput(roots, inHops);
                Hop[] sinHops = inHops.stream()
                        .filter(h -> !(h.getDataType().isScalar() && 
tmp.get(h.getHopID()).isLiteral()))
-                       .sorted(new HopInputComparator()).toArray(Hop[]::new);
+                       .sorted(new 
HopInputComparator(shared)).toArray(Hop[]::new);
                
                //construct template node
                ArrayList<CNode> inputs = new ArrayList<CNode>();
@@ -115,4 +121,36 @@ public class TemplateMultiAgg extends TemplateCell
                // return cplan instance
                return new Pair<Hop[],CNodeTpl>(sinHops, tpl);
        }
+       
+       private Hop getSparseSafeSharedInput(ArrayList<Hop> roots, HashSet<Hop> 
inHops) {
+               Set<Hop> tmp = inHops.stream()
+                       .filter(h -> h.getDataType().isMatrix())
+                       .collect(Collectors.toSet());
+               for( Hop root : roots ) {
+                       root.resetVisitStatus();
+                       HashSet<Hop> inputs = new HashSet<>();
+                       rCollectSparseSafeInputs(root, inHops, inputs);
+                       tmp.removeIf(h -> !inputs.contains(h));
+               }
+               Hop.resetVisitStatus(roots);
+               return tmp.isEmpty() ? null : 
+                       tmp.toArray(new Hop[0])[0];
+       }
+       
+       private void rCollectSparseSafeInputs(Hop current, HashSet<Hop> inHops, 
HashSet<Hop> sparseInputs) {
+               if( current.isVisited() || !(HopRewriteUtils.isBinary(current, 
OpOp2.MULT)
+                       || HopRewriteUtils.isAggUnaryOp(current, AggOp.SUM, 
AggOp.SUM_SQ))) {
+                       return;
+               }
+               
+               for( Hop c : current.getInput() ) {
+                       if( !inHops.contains(c) )
+                               rCollectSparseSafeInputs(c, inHops, 
sparseInputs);
+                       else if( c.dimsKnown(true) && 
MatrixBlock.evalSparseFormatInMemory(
+                               c.getDim1(), c.getDim2(), c.getNnz()) )
+                               sparseInputs.add(c);
+               }
+               
+               current.setVisited();
+       }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index 55f6fee..6c07e6e 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -185,11 +185,12 @@ public class TemplateUtils
                Hop B1 = (inputs.length>1) ? inputs[1] : null;
                if( (X!=null && HopRewriteUtils.isEqualSize(output, X)) || 
X==null )
                        return RowType.NO_AGG;
-               else if( (B1!=null && output.getDim1()==X.getDim1() && 
output.getDim2()==B1.getDim2())
+               else if( ((B1!=null && output.getDim1()==X.getDim1() && 
output.getDim2()==B1.getDim2())
                        || (output instanceof IndexingOp && 
HopRewriteUtils.isColumnRangeIndexing((IndexingOp)output)))
+                       && !(output instanceof AggBinaryOp && 
HopRewriteUtils.isTransposeOfItself(output.getInput().get(0),X)) )
                        return RowType.NO_AGG_B1;
                else if( output.getDim1()==X.getDim1() && (output.getDim2()==1 
-                               || HopRewriteUtils.isBinary(output, 
OpOp2.CBIND)) 
+                               || HopRewriteUtils.isBinary(output, 
OpOp2.CBIND))
                        && !(output instanceof AggBinaryOp && HopRewriteUtils
                                
.isTransposeOfItself(output.getInput().get(0),X)))
                        return RowType.ROW_AGG;

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java 
b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 80586b1..eed8cb3 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -75,29 +75,47 @@ public class LibSpoofPrimitives
        }
        
        public static void vectOuterMultAdd(double[] a, double[] b, double[] c, 
int ai, int bi, int ci, int len1, int len2) {
-               //rest, not aligned to 4-blocks
-               final int bn = len1%4;
-               for( int i=0, cix=ci; i < bn; i++, cix+=len2 )
-                       if( a[ai+i] != 0 )
-                               LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, 
bi, cix, len2);
-               
-               //unrolled 4-block (for fewer L1-dcache loads)
-               for( int i=bn, cix=ci+bn*len2; i < len1; i+=4, cix+=4*len2 ) {
-                       final int cix1=cix, cix2=cix+len2, cix3=cix+2*len2, 
cix4=cix+3*len2;
-                       final double aval1=a[ai+i], aval2=a[ai+i+1], 
aval3=a[ai+i+2], aval4=a[ai+i+3];
-                       for( int j=0; j<len2; j++ ) {
-                               final double bval = b[bi+j];
-                               c[cix1 + j] += aval1 * bval;
-                               c[cix2 + j] += aval2 * bval;
-                               c[cix3 + j] += aval3 * bval;
-                               c[cix4 + j] += aval4 * bval;
+               if( isFlipOuter(len1, len2) ) {
+                       for( int i=0, cix=ci; i < len2; i++, cix+=len1 ) {
+                               final double val = b[bi+i];
+                               if( val != 0 )
+                                       LibMatrixMult.vectMultiplyAdd(val, a, 
c, ai, cix, len1);
                        }
+               }
+               else {
+                       //rest, not aligned to 4-blocks
+                       final int bn = len1%4;
+                       for( int i=0, cix=ci; i < bn; i++, cix+=len2 )
+                               if( a[ai+i] != 0 )
+                                       LibMatrixMult.vectMultiplyAdd(a[ai+i], 
b, c, bi, cix, len2);
+                       
+                       //unrolled 4-block (for fewer L1-dcache loads)
+                       for( int i=bn, cix=ci+bn*len2; i < len1; i+=4, 
cix+=4*len2 ) {
+                               final int cix1=cix, cix2=cix+len2, 
cix3=cix+2*len2, cix4=cix+3*len2;
+                               final double aval1=a[ai+i], aval2=a[ai+i+1], 
aval3=a[ai+i+2], aval4=a[ai+i+3];
+                               for( int j=0; j<len2; j++ ) {
+                                       final double bval = b[bi+j];
+                                       c[cix1 + j] += aval1 * bval;
+                                       c[cix2 + j] += aval2 * bval;
+                                       c[cix3 + j] += aval3 * bval;
+                                       c[cix4 + j] += aval4 * bval;
+                               }
+                       }       
                }       
        }
        
        public static void vectOuterMultAdd(double[] a, double[] b, double[] c, 
int[] aix, int ai, int bi, int ci, int alen, int len1, int len2) {
-               for( int i=0; i < alen; i++ )
-                       LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, 
ci+aix[ai+i]*len2, len2);
+               if( isFlipOuter(len1, len2) ) {
+                       for( int i=0, cix=ci; i < len2; i++, cix+=len1 ) {
+                               final double val = b[bi+i];
+                               if( val != 0 )
+                                       LibMatrixMult.vectMultiplyAdd(val, a, 
c, aix, ai, cix, alen);
+                       }
+               }
+               else {
+                       for( int i=0; i < alen; i++ )
+                               LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, 
bi, ci+aix[ai+i]*len2, len2);
+               }
        }
        
        public static void vectMultAdd(double[] a, double bval, double[] c, int 
bi, int ci, int len) {
@@ -1434,6 +1452,9 @@ public class LibSpoofPrimitives
                return mod.execute(in1, in2);
        }
        
+       public static boolean isFlipOuter(int len1, int len2) {
+               return (len1 > 64 * len2);
+       }
        
        //dynamic memory management
        

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index c35695f..575043b 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -115,7 +115,7 @@ public abstract class SpoofCellwise extends SpoofOperator 
implements Serializabl
                if( inputs==null || inputs.size() < 1  )
                        throw new RuntimeException("Invalid input arguments.");
                
-               if( 
inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD 
) {
+               if( getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) {
                        k = 1; //serial execution
                }
                
@@ -180,21 +180,21 @@ public abstract class SpoofCellwise extends SpoofOperator 
implements Serializabl
        }
 
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)
                throws DMLRuntimeException
        {
-               execute(inputs, scalarObjects, out, 1);
+               return execute(inputs, scalarObjects, out, 1);
        }
        
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)
                throws DMLRuntimeException
        {
                //sanity check
                if( inputs==null || inputs.size() < 1 || out==null )
                        throw new RuntimeException("Invalid input arguments.");
                
-               if( 
inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD 
) {
+               if( getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) {
                        k = 1; //serial execution
                }
                
@@ -276,6 +276,7 @@ public abstract class SpoofCellwise extends SpoofOperator 
implements Serializabl
                //post-processing
                out.setNonZeros(lnnz);
                out.examSparsity();
+               return out;
        }
        
        /////////

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index 43811f2..ae3c353 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -72,21 +72,21 @@ public abstract class SpoofMultiAggregate extends 
SpoofOperator implements Seria
        }
        
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out) 
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out) 
                throws DMLRuntimeException
        {
-               execute(inputs, scalarObjects, out, 1);
+               return execute(inputs, scalarObjects, out, 1);
        }
        
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)       
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)        
                throws DMLRuntimeException
        {
                //sanity check
                if( inputs==null || inputs.size() < 1  )
                        throw new RuntimeException("Invalid input arguments.");
                
-               if( 
inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD 
) {
+               if( getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) {
                        k = 1; //serial execution
                }
        
@@ -139,7 +139,8 @@ public abstract class SpoofMultiAggregate extends 
SpoofOperator implements Seria
        
                //post-processing
                out.recomputeNonZeros();
-               out.examSparsity();     
+               out.examSparsity();
+               return out;
        }
        
        private void executeDense(double[] a, SideInput[] b, double[] scalars, 
double[] c, int m, int n, int rl, int ru) throws DMLRuntimeException 

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index fe32839..3ea9246 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -37,14 +37,14 @@ public abstract class SpoofOperator implements Serializable
        private static final long serialVersionUID = 3834006998853573319L;
        private static final Log LOG = 
LogFactory.getLog(SpoofOperator.class.getName());
        
-       public abstract void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalars, MatrixBlock out)
+       public abstract MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalars, MatrixBlock out)
                throws DMLRuntimeException;
        
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalars, MatrixBlock out, int k)
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalars, MatrixBlock out, int k)
                throws DMLRuntimeException 
        {
                //default implementation serial execution
-               execute(inputs, scalars, out);
+               return execute(inputs, scalars, out);
        }
        
        public abstract String getSpoofType();
@@ -113,7 +113,7 @@ public abstract class SpoofOperator implements Serializable
                return b;
        }
        
-       protected SideInput[] createSparseSideInputs(SideInput[] input) {
+       protected static SideInput[] createSparseSideInputs(SideInput[] input) {
                //determine if there are sparse side inputs
                boolean containsSparse = false;
                for( int i=0; i<input.length; i++ ) {
@@ -133,20 +133,24 @@ public abstract class SpoofOperator implements 
Serializable
                return ret;
        }
        
-       public double[][] getDenseMatrices(SideInput[] inputs) {
+       public static double[][] getDenseMatrices(SideInput[] inputs) {
                double[][] ret = new double[inputs.length][];
                for( int i=0; i<inputs.length; i++ )
                        ret[i] = inputs[i].ddat;
                return ret;
        }
        
-       protected double[] prepInputScalars(ArrayList<ScalarObject> 
scalarObjects) {
+       protected static double[] prepInputScalars(ArrayList<ScalarObject> 
scalarObjects) {
                double[] scalars = new double[scalarObjects.size()];
                for(int i=0; i < scalarObjects.size(); i++)
                        scalars[i] = scalarObjects.get(i).getDoubleValue();
                return scalars;
        }
        
+       public static long getTotalInputNnz(ArrayList<MatrixBlock> inputs) {
+               return inputs.stream().mapToLong(in -> in.getNonZeros()).sum();
+       }
+       
        //abstraction for safely accessing sideways matrices without the need 
        //to allocate empty matrices as dense, see prepInputMatrices
        

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index 442755d..bc99859 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -147,7 +147,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                return new DoubleObject(sum);
        }
        
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)
                throws DMLRuntimeException
        {
                //sanity check
@@ -159,7 +159,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        || (_outerProductType == 
OutProdType.RIGHT_OUTER_PRODUCT &&  inputs.get(2).isEmptyBlock(false)) //V is 
empty
                        || inputs.get(0).isEmptyBlock(false) ) {  //X is empty
                        out.examSparsity(); //turn empty dense into sparse
-                       return; 
+                       return out;
                }
                
                //input preparation and result allocation (Allocate the output 
that is set by Sigma2CPInstruction) 
@@ -177,7 +177,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                
                //check for empty inputs; otherwise allocate result
                if( inputs.get(0).isEmptyBlock(false) )
-                       return;
+                       return out;
                out.allocateDenseOrSparseBlock();
                
                //input preparation
@@ -223,10 +223,11 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        out.sortSparseRows();
                out.recomputeNonZeros();
                out.examSparsity();
+               return out;
        }
        
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads)      
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads)       
                throws DMLRuntimeException
        {
                //sanity check
@@ -238,7 +239,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        || (_outerProductType == 
OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) //V is 
empty
                        || inputs.get(0).isEmptyBlock(false) ) {  //X is empty
                        out.examSparsity(); //turn empty dense into sparse
-                       return; 
+                       return out; 
                }
                
                //input preparation and result allocation (Allocate the output 
that is set by Sigma2CPInstruction) 
@@ -316,6 +317,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                                out.recomputeNonZeros();
                }
                out.examSparsity();
+               return out;
        }
        
        private void executeDense(double[] a, double[] u, double[] v, 
double[][] b, double[] scalars,

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index 9f47733..e2d9f41 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -33,6 +33,7 @@ import 
org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysml.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
 import org.apache.sysml.runtime.matrix.data.SparseRow;
@@ -61,6 +62,9 @@ public abstract class SpoofRowwise extends SpoofOperator
                }
                public boolean isRowTypeB1() {
                        return (this == NO_AGG_B1) || (this == COL_AGG_B1) || 
(this == COL_AGG_B1_T);
+               }
+               public boolean isRowTypeB1ColumnAgg() {
+                       return (this == COL_AGG_B1) || (this == COL_AGG_B1_T);
                } 
        }
        
@@ -97,22 +101,20 @@ public abstract class SpoofRowwise extends SpoofOperator
        public ScalarObject execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, int k) 
                throws DMLRuntimeException 
        {
-               MatrixBlock out = new MatrixBlock(1, 1, false);
-               if( k > 1 )
-                       execute(inputs, scalarObjects, out, k);
-               else
-                       execute(inputs, scalarObjects, out);
+               MatrixBlock out = ( k > 1 ) ?
+                       execute(inputs, scalarObjects, new 
MatrixBlock(1,1,false), k) :
+                       execute(inputs, scalarObjects, new 
MatrixBlock(1,1,false));
                return new DoubleObject(out.quickGetValue(0, 0));
        }
        
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)      
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)       
                throws DMLRuntimeException 
        {
-               execute(inputs, scalarObjects, out, true, false);
+               return execute(inputs, scalarObjects, out, true, false);
        }
        
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, boolean allocTmp, 
boolean aggIncr) 
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, boolean allocTmp, 
boolean aggIncr) 
                throws DMLRuntimeException      
        {
                //sanity check
@@ -127,6 +129,8 @@ public abstract class SpoofRowwise extends SpoofOperator
                if( !aggIncr || !out.isAllocated() )
                        allocateOutputMatrix(m, n, n2, out);
                double[] c = out.getDenseBlock();
+               final boolean flipOut = _type.isRowTypeB1ColumnAgg()
+                       && LibSpoofPrimitives.isFlipOuter(out.getNumRows(), 
out.getNumColumns());
                
                //input preparation
                SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, 
true, _tB1);
@@ -149,18 +153,23 @@ public abstract class SpoofRowwise extends SpoofOperator
                if( allocTmp &&_reqVectMem > 0 )
                        LibSpoofPrimitives.cleanupThreadLocalMemory();
                out.recomputeNonZeros();
+               if( flipOut ) {
+                       fixTransposeDimensions(out);
+                       out = LibMatrixReorg.transpose(out, new MatrixBlock(
+                               out.getNumColumns(), out.getNumRows(), false));
+               }
                out.examSparsity();
+               return out;
        }
        
        @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)       
+       public MatrixBlock execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)        
                throws DMLRuntimeException
        {
                //redirect to serial execution
                if( k <= 1 || (_type.isColumnAgg() && 
!LibMatrixMult.checkParColumnAgg(inputs.get(0), k, false))
-                       || 
(long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD
 ) {
-                       execute(inputs, scalarObjects, out);
-                       return;
+                       || getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) {
+                       return execute(inputs, scalarObjects, out);
                }
                
                //sanity check
@@ -173,6 +182,8 @@ public abstract class SpoofRowwise extends SpoofOperator
                final int n2 = _type.isRowTypeB1() || 
hasMatrixSideInput(inputs) ?
                        getMinColsMatrixSideInputs(inputs) : -1;
                allocateOutputMatrix(m, n, n2, out);
+               final boolean flipOut = _type.isRowTypeB1ColumnAgg()
+                       && LibSpoofPrimitives.isFlipOuter(out.getNumRows(), 
out.getNumColumns());
                
                //input preparation
                SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, 
true, _tB1);
@@ -210,11 +221,18 @@ public abstract class SpoofRowwise extends SpoofOperator
                        }
                        
                        pool.shutdown();
+                       if( flipOut ) {
+                               fixTransposeDimensions(out);
+                               out = LibMatrixReorg.transpose(out, new 
MatrixBlock(
+                                       out.getNumColumns(), out.getNumRows(), 
false));
+                       }
                        out.examSparsity();
                }
                catch(Exception ex) {
                        throw new DMLRuntimeException(ex);
                }
+               
+               return out;
        }
        
        public static boolean hasMatrixSideInput(ArrayList<MatrixBlock> inputs) 
{
@@ -246,6 +264,12 @@ public abstract class SpoofRowwise extends SpoofOperator
                out.allocateDenseBlock();
        }
        
+       private void fixTransposeDimensions(MatrixBlock out) {
+               int rlen = out.getNumRows();
+               out.setNumRows(out.getNumColumns());
+               out.setNumColumns(rlen);
+       }
+       
        private void executeDense(double[] a, SideInput[] b, double[] scalars, 
double[] c, int n, int rl, int ru) 
        {
                if( a == null )

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
index 7c16242..0e8489b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
@@ -84,8 +84,7 @@ public class SpoofCPInstruction extends ComputationCPInstruction
                
                // set the output dimensions to the hop node matrix dimensions
                if( output.getDataType() == DataType.MATRIX) {
-                       MatrixBlock out = new MatrixBlock();
-                       _op.execute(inputs, scalars, out, _numThreads);
+                       MatrixBlock out = _op.execute(inputs, scalars, new 
MatrixBlock(), _numThreads);
                        ec.setMatrixOutput(output.getName(), out, 
getExtendedOpcode());
                }
                else if (output.getDataType() == DataType.SCALAR) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
index 0af46df..34041cf 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
@@ -471,7 +471,7 @@ public class SpoofSPInstruction extends SPInstruction
                                //prepare output and execute single-threaded 
operator
                                ArrayList<MatrixBlock> inputs = 
getAllMatrixInputs(ixIn, blkIn);
                                blkOut = aggIncr ? blkOut : new MatrixBlock();
-                               _op.execute(inputs, _scalars, blkOut, false, 
aggIncr);
+                               blkOut = _op.execute(inputs, _scalars, blkOut, 
false, aggIncr);
                                if( !aggIncr ) {
                                        MatrixIndexes ixOut = new 
MatrixIndexes(ixIn.getRowIndex(),
                                                
_op.getRowType()!=RowType.NO_AGG ? 1 : ixIn.getColumnIndex());
@@ -532,7 +532,7 @@ public class SpoofSPInstruction extends SPInstruction
                                                ixOut = new 
MatrixIndexes(ixOut.getRowIndex(), 1);
                                        else 
if(((SpoofCellwise)_op).getCellType()==CellType.COL_AGG)
                                                ixOut = new MatrixIndexes(1, 
ixOut.getColumnIndex());
-                                       _op.execute(inputs, _scalars, blkOut);
+                                       blkOut = _op.execute(inputs, _scalars, 
blkOut);
                                }
                                ret.add(new 
Tuple2<MatrixIndexes,MatrixBlock>(ixOut, blkOut));
                        }
@@ -566,7 +566,7 @@ public class SpoofSPInstruction extends SPInstruction
                        //execute core operation
                        ArrayList<MatrixBlock> inputs = 
getAllMatrixInputs(arg._1(), arg._2());
                        MatrixBlock blkOut = new MatrixBlock();
-                       _op.execute(inputs, _scalars, blkOut);
+                       blkOut = _op.execute(inputs, _scalars, blkOut);
                        
                        return new Tuple2<MatrixIndexes,MatrixBlock>(arg._1(), 
blkOut);
                }
@@ -641,7 +641,7 @@ public class SpoofSPInstruction extends SPInstruction
                                        blkOut.quickSetValue(0, 0, 
obj.getDoubleValue());
                                }
                                else {
-                                       _op.execute(inputs, _scalars, blkOut);
+                                       blkOut = _op.execute(inputs, _scalars, 
blkOut);
                                }
                                
                                ret.add(new 
Tuple2<MatrixIndexes,MatrixBlock>(createOutputIndexes(ixIn,_op), blkOut));      
                    

Reply via email to