Repository: systemml Updated Branches: refs/heads/master 129710a01 -> 0325da7de
[SYSTEMML-1857] Performance codegen operator skeletons (all templates) This patch makes the following performance improvements to the codegen operator skeletons (cell, row, outer, magg): (1) Improved decision on multi-threading (all templates): Instead of considering the number of cells in the main input matrix, we now consider the total number of non-zeros of all inputs. In small-data scenarios, this achieved performance improvements of >2x. (2) Row outer products (row): For special row outer products as used in Kmeans and multi-class MLogreg, we now flip the outer product computation from a-b to b-a if a is substantially larger than b, in order to exploit vectorized primitives. We compensate for this flipped representation via a single transpose after aggregation. This improved the performance of t(X)%*%X%*%v over X:=1Mx1K, v:=1Kx2 from 470ms to 340ms. (3) Sparse driver selection (magg): For multi-aggregates, the sparse driver selection is especially important, because a wrong ordering can render the entire operator sparse-unsafe. For patterns such as sum(X*Y)+sum(X*Z), we now systematically select the shared sparse input X as the sparse driver. This led to performance improvements of >6x. Finally, this patch also fixes special cases of row aggregate compilation (e.g., equal number of rows and columns), where we previously generated invalid code. 
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0325da7d Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0325da7d Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0325da7d Branch: refs/heads/master Commit: 0325da7de21bbe8ff40e4258b8d8a50497902174 Parents: 129710a Author: Matthias Boehm <mboe...@gmail.com> Authored: Mon Aug 21 22:27:07 2017 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Mon Aug 21 22:27:07 2017 -0700 ---------------------------------------------------------------------- .../hops/codegen/template/TemplateCell.java | 14 ++++- .../hops/codegen/template/TemplateMultiAgg.java | 40 +++++++++++++- .../hops/codegen/template/TemplateUtils.java | 5 +- .../runtime/codegen/LibSpoofPrimitives.java | 57 +++++++++++++------- .../sysml/runtime/codegen/SpoofCellwise.java | 11 ++-- .../runtime/codegen/SpoofMultiAggregate.java | 11 ++-- .../sysml/runtime/codegen/SpoofOperator.java | 16 +++--- .../runtime/codegen/SpoofOuterProduct.java | 12 +++-- .../sysml/runtime/codegen/SpoofRowwise.java | 48 ++++++++++++----- .../instructions/cp/SpoofCPInstruction.java | 3 +- .../instructions/spark/SpoofSPInstruction.java | 8 +-- 11 files changed, 163 insertions(+), 62 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java index ad4589b..b7e2a2d 100644 --- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java +++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java @@ -339,15 +339,25 @@ public class TemplateCell extends TemplateBase */ public static 
class HopInputComparator implements Comparator<Hop> { + private final Hop _driver; + + public HopInputComparator() { + this(null); + } + + public HopInputComparator(Hop driver) { + _driver = driver; + } + @Override public int compare(Hop h1, Hop h2) { long ncells1 = h1.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE; long ncells2 = h2.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE; - if( ncells1 > ncells2 ) + if( ncells1 > ncells2 || h1 == _driver ) return -1; - else if( ncells1 < ncells2) + else if( ncells1 < ncells2 || h2 == _driver) return 1; return Long.compare( h1.dimsKnown(true) ? h1.getNnz() : ncells1, http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java index 0c2886e..e9f4cd2 100644 --- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java +++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java @@ -22,16 +22,21 @@ package org.apache.sysml.hops.codegen.template; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; import org.apache.sysml.hops.Hop; import org.apache.sysml.hops.Hop.AggOp; +import org.apache.sysml.hops.Hop.OpOp2; import org.apache.sysml.hops.codegen.cplan.CNode; import org.apache.sysml.hops.codegen.cplan.CNodeData; import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry; +import org.apache.sysml.hops.rewrite.HopRewriteUtils; import org.apache.sysml.hops.codegen.cplan.CNodeMultiAgg; import org.apache.sysml.hops.codegen.cplan.CNodeTpl; import 
org.apache.sysml.hops.codegen.cplan.CNodeUnary; import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.Pair; public class TemplateMultiAgg extends TemplateCell @@ -86,9 +91,10 @@ public class TemplateMultiAgg extends TemplateCell //reorder inputs (ensure matrices/vectors come first) and prune literals //note: we order by number of cells and subsequently sparsity to ensure //that sparse inputs are used as the main input w/o unnecessary conversion + Hop shared = getSparseSafeSharedInput(roots, inHops); Hop[] sinHops = inHops.stream() .filter(h -> !(h.getDataType().isScalar() && tmp.get(h.getHopID()).isLiteral())) - .sorted(new HopInputComparator()).toArray(Hop[]::new); + .sorted(new HopInputComparator(shared)).toArray(Hop[]::new); //construct template node ArrayList<CNode> inputs = new ArrayList<CNode>(); @@ -115,4 +121,36 @@ public class TemplateMultiAgg extends TemplateCell // return cplan instance return new Pair<Hop[],CNodeTpl>(sinHops, tpl); } + + private Hop getSparseSafeSharedInput(ArrayList<Hop> roots, HashSet<Hop> inHops) { + Set<Hop> tmp = inHops.stream() + .filter(h -> h.getDataType().isMatrix()) + .collect(Collectors.toSet()); + for( Hop root : roots ) { + root.resetVisitStatus(); + HashSet<Hop> inputs = new HashSet<>(); + rCollectSparseSafeInputs(root, inHops, inputs); + tmp.removeIf(h -> !inputs.contains(h)); + } + Hop.resetVisitStatus(roots); + return tmp.isEmpty() ? 
null : + tmp.toArray(new Hop[0])[0]; + } + + private void rCollectSparseSafeInputs(Hop current, HashSet<Hop> inHops, HashSet<Hop> sparseInputs) { + if( current.isVisited() || !(HopRewriteUtils.isBinary(current, OpOp2.MULT) + || HopRewriteUtils.isAggUnaryOp(current, AggOp.SUM, AggOp.SUM_SQ))) { + return; + } + + for( Hop c : current.getInput() ) { + if( !inHops.contains(c) ) + rCollectSparseSafeInputs(c, inHops, sparseInputs); + else if( c.dimsKnown(true) && MatrixBlock.evalSparseFormatInMemory( + c.getDim1(), c.getDim2(), c.getNnz()) ) + sparseInputs.add(c); + } + + current.setVisited(); + } } http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java index 55f6fee..6c07e6e 100644 --- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java +++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java @@ -185,11 +185,12 @@ public class TemplateUtils Hop B1 = (inputs.length>1) ? 
inputs[1] : null; if( (X!=null && HopRewriteUtils.isEqualSize(output, X)) || X==null ) return RowType.NO_AGG; - else if( (B1!=null && output.getDim1()==X.getDim1() && output.getDim2()==B1.getDim2()) + else if( ((B1!=null && output.getDim1()==X.getDim1() && output.getDim2()==B1.getDim2()) || (output instanceof IndexingOp && HopRewriteUtils.isColumnRangeIndexing((IndexingOp)output))) + && !(output instanceof AggBinaryOp && HopRewriteUtils.isTransposeOfItself(output.getInput().get(0),X)) ) return RowType.NO_AGG_B1; else if( output.getDim1()==X.getDim1() && (output.getDim2()==1 - || HopRewriteUtils.isBinary(output, OpOp2.CBIND)) + || HopRewriteUtils.isBinary(output, OpOp2.CBIND)) && !(output instanceof AggBinaryOp && HopRewriteUtils .isTransposeOfItself(output.getInput().get(0),X))) return RowType.ROW_AGG; http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java index 80586b1..eed8cb3 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java @@ -75,29 +75,47 @@ public class LibSpoofPrimitives } public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int ai, int bi, int ci, int len1, int len2) { - //rest, not aligned to 4-blocks - final int bn = len1%4; - for( int i=0, cix=ci; i < bn; i++, cix+=len2 ) - if( a[ai+i] != 0 ) - LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, cix, len2); - - //unrolled 4-block (for fewer L1-dcache loads) - for( int i=bn, cix=ci+bn*len2; i < len1; i+=4, cix+=4*len2 ) { - final int cix1=cix, cix2=cix+len2, cix3=cix+2*len2, cix4=cix+3*len2; - final double aval1=a[ai+i], aval2=a[ai+i+1], aval3=a[ai+i+2], aval4=a[ai+i+3]; - for( 
int j=0; j<len2; j++ ) { - final double bval = b[bi+j]; - c[cix1 + j] += aval1 * bval; - c[cix2 + j] += aval2 * bval; - c[cix3 + j] += aval3 * bval; - c[cix4 + j] += aval4 * bval; + if( isFlipOuter(len1, len2) ) { + for( int i=0, cix=ci; i < len2; i++, cix+=len1 ) { + final double val = b[bi+i]; + if( val != 0 ) + LibMatrixMult.vectMultiplyAdd(val, a, c, ai, cix, len1); } + } + else { + //rest, not aligned to 4-blocks + final int bn = len1%4; + for( int i=0, cix=ci; i < bn; i++, cix+=len2 ) + if( a[ai+i] != 0 ) + LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, cix, len2); + + //unrolled 4-block (for fewer L1-dcache loads) + for( int i=bn, cix=ci+bn*len2; i < len1; i+=4, cix+=4*len2 ) { + final int cix1=cix, cix2=cix+len2, cix3=cix+2*len2, cix4=cix+3*len2; + final double aval1=a[ai+i], aval2=a[ai+i+1], aval3=a[ai+i+2], aval4=a[ai+i+3]; + for( int j=0; j<len2; j++ ) { + final double bval = b[bi+j]; + c[cix1 + j] += aval1 * bval; + c[cix2 + j] += aval2 * bval; + c[cix3 + j] += aval3 * bval; + c[cix4 + j] += aval4 * bval; + } + } } } public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int[] aix, int ai, int bi, int ci, int alen, int len1, int len2) { - for( int i=0; i < alen; i++ ) - LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, ci+aix[ai+i]*len2, len2); + if( isFlipOuter(len1, len2) ) { + for( int i=0, cix=ci; i < len2; i++, cix+=len1 ) { + final double val = b[bi+i]; + if( val != 0 ) + LibMatrixMult.vectMultiplyAdd(val, a, c, aix, ai, cix, alen); + } + } + else { + for( int i=0; i < alen; i++ ) + LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, ci+aix[ai+i]*len2, len2); + } } public static void vectMultAdd(double[] a, double bval, double[] c, int bi, int ci, int len) { @@ -1434,6 +1452,9 @@ public class LibSpoofPrimitives return mod.execute(in1, in2); } + public static boolean isFlipOuter(int len1, int len2) { + return (len1 > 64 * len2); + } //dynamic memory management 
http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java index c35695f..575043b 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java @@ -115,7 +115,7 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl if( inputs==null || inputs.size() < 1 ) throw new RuntimeException("Invalid input arguments."); - if( inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) { + if( getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) { k = 1; //serial execution } @@ -180,21 +180,21 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) throws DMLRuntimeException { - execute(inputs, scalarObjects, out, 1); + return execute(inputs, scalarObjects, out, 1); } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) throws DMLRuntimeException { //sanity check if( inputs==null || inputs.size() < 1 || out==null ) throw new RuntimeException("Invalid input arguments."); - if( inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) { + if( getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) { k = 1; //serial execution } @@ -276,6 +276,7 @@ public abstract class SpoofCellwise extends 
SpoofOperator implements Serializabl //post-processing out.setNonZeros(lnnz); out.examSparsity(); + return out; } ///////// http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java index 43811f2..ae3c353 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java @@ -72,21 +72,21 @@ public abstract class SpoofMultiAggregate extends SpoofOperator implements Seria } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) throws DMLRuntimeException { - execute(inputs, scalarObjects, out, 1); + return execute(inputs, scalarObjects, out, 1); } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) throws DMLRuntimeException { //sanity check if( inputs==null || inputs.size() < 1 ) throw new RuntimeException("Invalid input arguments."); - if( inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) { + if( getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) { k = 1; //serial execution } @@ -139,7 +139,8 @@ public abstract class SpoofMultiAggregate extends SpoofOperator implements Seria //post-processing out.recomputeNonZeros(); - out.examSparsity(); + out.examSparsity(); + return out; } private void executeDense(double[] a, SideInput[] b, double[] scalars, double[] c, int m, int n, 
int rl, int ru) throws DMLRuntimeException http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java index fe32839..3ea9246 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java @@ -37,14 +37,14 @@ public abstract class SpoofOperator implements Serializable private static final long serialVersionUID = 3834006998853573319L; private static final Log LOG = LogFactory.getLog(SpoofOperator.class.getName()); - public abstract void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out) + public abstract MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out) throws DMLRuntimeException; - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out, int k) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out, int k) throws DMLRuntimeException { //default implementation serial execution - execute(inputs, scalars, out); + return execute(inputs, scalars, out); } public abstract String getSpoofType(); @@ -113,7 +113,7 @@ public abstract class SpoofOperator implements Serializable return b; } - protected SideInput[] createSparseSideInputs(SideInput[] input) { + protected static SideInput[] createSparseSideInputs(SideInput[] input) { //determine if there are sparse side inputs boolean containsSparse = false; for( int i=0; i<input.length; i++ ) { @@ -133,20 +133,24 @@ public abstract class SpoofOperator implements Serializable return ret; } - public double[][] getDenseMatrices(SideInput[] inputs) { + public static double[][] 
getDenseMatrices(SideInput[] inputs) { double[][] ret = new double[inputs.length][]; for( int i=0; i<inputs.length; i++ ) ret[i] = inputs[i].ddat; return ret; } - protected double[] prepInputScalars(ArrayList<ScalarObject> scalarObjects) { + protected static double[] prepInputScalars(ArrayList<ScalarObject> scalarObjects) { double[] scalars = new double[scalarObjects.size()]; for(int i=0; i < scalarObjects.size(); i++) scalars[i] = scalarObjects.get(i).getDoubleValue(); return scalars; } + public static long getTotalInputNnz(ArrayList<MatrixBlock> inputs) { + return inputs.stream().mapToLong(in -> in.getNonZeros()).sum(); + } + //abstraction for safely accessing sideways matrices without the need //to allocate empty matrices as dense, see prepInputMatrices http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java index 442755d..bc99859 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java @@ -147,7 +147,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator return new DoubleObject(sum); } - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) throws DMLRuntimeException { //sanity check @@ -159,7 +159,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator || (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) //V is empty || inputs.get(0).isEmptyBlock(false) ) { //X is empty out.examSparsity(); //turn empty dense into sparse - return; + 
return out; } //input preparation and result allocation (Allocate the output that is set by Sigma2CPInstruction) @@ -177,7 +177,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator //check for empty inputs; otherwise allocate result if( inputs.get(0).isEmptyBlock(false) ) - return; + return out; out.allocateDenseOrSparseBlock(); //input preparation @@ -223,10 +223,11 @@ public abstract class SpoofOuterProduct extends SpoofOperator out.sortSparseRows(); out.recomputeNonZeros(); out.examSparsity(); + return out; } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads) throws DMLRuntimeException { //sanity check @@ -238,7 +239,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator || (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) //V is empty || inputs.get(0).isEmptyBlock(false) ) { //X is empty out.examSparsity(); //turn empty dense into sparse - return; + return out; } //input preparation and result allocation (Allocate the output that is set by Sigma2CPInstruction) @@ -316,6 +317,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator out.recomputeNonZeros(); } out.examSparsity(); + return out; } private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars, http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java index 9f47733..e2d9f41 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java +++ 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java @@ -33,6 +33,7 @@ import org.apache.sysml.runtime.compress.CompressedMatrixBlock; import org.apache.sysml.runtime.instructions.cp.DoubleObject; import org.apache.sysml.runtime.instructions.cp.ScalarObject; import org.apache.sysml.runtime.matrix.data.LibMatrixMult; +import org.apache.sysml.runtime.matrix.data.LibMatrixReorg; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.SparseBlock; import org.apache.sysml.runtime.matrix.data.SparseRow; @@ -61,6 +62,9 @@ public abstract class SpoofRowwise extends SpoofOperator } public boolean isRowTypeB1() { return (this == NO_AGG_B1) || (this == COL_AGG_B1) || (this == COL_AGG_B1_T); + } + public boolean isRowTypeB1ColumnAgg() { + return (this == COL_AGG_B1) || (this == COL_AGG_B1_T); } } @@ -97,22 +101,20 @@ public abstract class SpoofRowwise extends SpoofOperator public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, int k) throws DMLRuntimeException { - MatrixBlock out = new MatrixBlock(1, 1, false); - if( k > 1 ) - execute(inputs, scalarObjects, out, k); - else - execute(inputs, scalarObjects, out); + MatrixBlock out = ( k > 1 ) ? 
+ execute(inputs, scalarObjects, new MatrixBlock(1,1,false), k) : + execute(inputs, scalarObjects, new MatrixBlock(1,1,false)); return new DoubleObject(out.quickGetValue(0, 0)); } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) throws DMLRuntimeException { - execute(inputs, scalarObjects, out, true, false); + return execute(inputs, scalarObjects, out, true, false); } - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, boolean allocTmp, boolean aggIncr) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, boolean allocTmp, boolean aggIncr) throws DMLRuntimeException { //sanity check @@ -127,6 +129,8 @@ public abstract class SpoofRowwise extends SpoofOperator if( !aggIncr || !out.isAllocated() ) allocateOutputMatrix(m, n, n2, out); double[] c = out.getDenseBlock(); + final boolean flipOut = _type.isRowTypeB1ColumnAgg() + && LibSpoofPrimitives.isFlipOuter(out.getNumRows(), out.getNumColumns()); //input preparation SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, true, _tB1); @@ -149,18 +153,23 @@ public abstract class SpoofRowwise extends SpoofOperator if( allocTmp &&_reqVectMem > 0 ) LibSpoofPrimitives.cleanupThreadLocalMemory(); out.recomputeNonZeros(); + if( flipOut ) { + fixTransposeDimensions(out); + out = LibMatrixReorg.transpose(out, new MatrixBlock( + out.getNumColumns(), out.getNumRows(), false)); + } out.examSparsity(); + return out; } @Override - public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) + public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k) throws DMLRuntimeException { //redirect to serial execution if( k 
<= 1 || (_type.isColumnAgg() && !LibMatrixMult.checkParColumnAgg(inputs.get(0), k, false)) - || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) { - execute(inputs, scalarObjects, out); - return; + || getTotalInputNnz(inputs) < PAR_NUMCELL_THRESHOLD ) { + return execute(inputs, scalarObjects, out); } //sanity check @@ -173,6 +182,8 @@ public abstract class SpoofRowwise extends SpoofOperator final int n2 = _type.isRowTypeB1() || hasMatrixSideInput(inputs) ? getMinColsMatrixSideInputs(inputs) : -1; allocateOutputMatrix(m, n, n2, out); + final boolean flipOut = _type.isRowTypeB1ColumnAgg() + && LibSpoofPrimitives.isFlipOuter(out.getNumRows(), out.getNumColumns()); //input preparation SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, true, _tB1); @@ -210,11 +221,18 @@ public abstract class SpoofRowwise extends SpoofOperator } pool.shutdown(); + if( flipOut ) { + fixTransposeDimensions(out); + out = LibMatrixReorg.transpose(out, new MatrixBlock( + out.getNumColumns(), out.getNumRows(), false)); + } out.examSparsity(); } catch(Exception ex) { throw new DMLRuntimeException(ex); } + + return out; } public static boolean hasMatrixSideInput(ArrayList<MatrixBlock> inputs) { @@ -246,6 +264,12 @@ public abstract class SpoofRowwise extends SpoofOperator out.allocateDenseBlock(); } + private void fixTransposeDimensions(MatrixBlock out) { + int rlen = out.getNumRows(); + out.setNumRows(out.getNumColumns()); + out.setNumColumns(rlen); + } + private void executeDense(double[] a, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) { if( a == null ) http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java index 7c16242..0e8489b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java @@ -84,8 +84,7 @@ public class SpoofCPInstruction extends ComputationCPInstruction // set the output dimensions to the hop node matrix dimensions if( output.getDataType() == DataType.MATRIX) { - MatrixBlock out = new MatrixBlock(); - _op.execute(inputs, scalars, out, _numThreads); + MatrixBlock out = _op.execute(inputs, scalars, new MatrixBlock(), _numThreads); ec.setMatrixOutput(output.getName(), out, getExtendedOpcode()); } else if (output.getDataType() == DataType.SCALAR) { http://git-wip-us.apache.org/repos/asf/systemml/blob/0325da7d/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java index 0af46df..34041cf 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java @@ -471,7 +471,7 @@ public class SpoofSPInstruction extends SPInstruction //prepare output and execute single-threaded operator ArrayList<MatrixBlock> inputs = getAllMatrixInputs(ixIn, blkIn); blkOut = aggIncr ? blkOut : new MatrixBlock(); - _op.execute(inputs, _scalars, blkOut, false, aggIncr); + blkOut = _op.execute(inputs, _scalars, blkOut, false, aggIncr); if( !aggIncr ) { MatrixIndexes ixOut = new MatrixIndexes(ixIn.getRowIndex(), _op.getRowType()!=RowType.NO_AGG ? 
1 : ixIn.getColumnIndex()); @@ -532,7 +532,7 @@ public class SpoofSPInstruction extends SPInstruction ixOut = new MatrixIndexes(ixOut.getRowIndex(), 1); else if(((SpoofCellwise)_op).getCellType()==CellType.COL_AGG) ixOut = new MatrixIndexes(1, ixOut.getColumnIndex()); - _op.execute(inputs, _scalars, blkOut); + blkOut = _op.execute(inputs, _scalars, blkOut); } ret.add(new Tuple2<MatrixIndexes,MatrixBlock>(ixOut, blkOut)); } @@ -566,7 +566,7 @@ public class SpoofSPInstruction extends SPInstruction //execute core operation ArrayList<MatrixBlock> inputs = getAllMatrixInputs(arg._1(), arg._2()); MatrixBlock blkOut = new MatrixBlock(); - _op.execute(inputs, _scalars, blkOut); + blkOut = _op.execute(inputs, _scalars, blkOut); return new Tuple2<MatrixIndexes,MatrixBlock>(arg._1(), blkOut); } @@ -641,7 +641,7 @@ public class SpoofSPInstruction extends SPInstruction blkOut.quickSetValue(0, 0, obj.getDoubleValue()); } else { - _op.execute(inputs, _scalars, blkOut); + blkOut = _op.execute(inputs, _scalars, blkOut); } ret.add(new Tuple2<MatrixIndexes,MatrixBlock>(createOutputIndexes(ixIn,_op), blkOut));