Repository: systemml Updated Branches: refs/heads/master 4d5a82ecf -> 3ca053535
[SYSTEMML-445] Integrate GPU exectype selection into our existing infrastructure Closes #627. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/3ca05353 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/3ca05353 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/3ca05353 Branch: refs/heads/master Commit: 3ca05353593e7847dc6d6a7e862e323ffa96bfcc Parents: 4d5a82e Author: Niketan Pansare <npan...@us.ibm.com> Authored: Tue Aug 22 14:55:37 2017 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Tue Aug 22 14:55:37 2017 -0700 ---------------------------------------------------------------------- .../java/org/apache/sysml/hops/AggBinaryOp.java | 53 +++++++++----- .../java/org/apache/sysml/hops/AggUnaryOp.java | 53 ++++++++------ .../java/org/apache/sysml/hops/BinaryOp.java | 77 +++++++++++++------- .../org/apache/sysml/hops/ConvolutionOp.java | 12 ++- .../java/org/apache/sysml/hops/DataGenOp.java | 6 ++ src/main/java/org/apache/sysml/hops/DataOp.java | 5 ++ .../java/org/apache/sysml/hops/FunctionOp.java | 5 ++ src/main/java/org/apache/sysml/hops/Hop.java | 32 +++++--- .../java/org/apache/sysml/hops/IndexingOp.java | 5 ++ .../org/apache/sysml/hops/LeftIndexingOp.java | 5 ++ .../java/org/apache/sysml/hops/LiteralOp.java | 5 ++ .../java/org/apache/sysml/hops/MultipleOp.java | 5 ++ .../sysml/hops/ParameterizedBuiltinOp.java | 5 ++ .../org/apache/sysml/hops/QuaternaryOp.java | 5 ++ .../java/org/apache/sysml/hops/ReorgOp.java | 34 +++++++-- .../java/org/apache/sysml/hops/TernaryOp.java | 30 +++++--- .../java/org/apache/sysml/hops/UnaryOp.java | 38 +++++++--- .../apache/sysml/hops/codegen/SpoofFusedOp.java | 5 ++ 18 files changed, 278 insertions(+), 102 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/AggBinaryOp.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java index 4f709b4..11a2399 100644 --- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java @@ -48,7 +48,6 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; -import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput; @@ -143,6 +142,33 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop return _method; } + @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + + Hop input1 = getInput().get(0); + Hop input2 = getInput().get(1); + //matrix mult operation selection part 2 (specific pattern) + MMTSJType mmtsj = checkTransposeSelf(); //determine tsmm pattern + ChainType chain = checkMapMultChain(); //determine mmchain pattern + + _method = optFindMMultMethodCP ( input1.getDim1(), input1.getDim2(), + input2.getDim1(), input2.getDim2(), mmtsj, chain, _hasLeftPMInput ); + switch( _method ){ + case TSMM: + return true; + case MAPMM_CHAIN: + return false; + case PMM: + return false; + case MM: + return true; + default: + throw new RuntimeException("Unsupported method:" + _method); + } + } + /** * NOTE: overestimated mem in case of transpose-identity matmult, but 3/2 at worst * and existing mem estimate advantageous in terms of consistency hops/lops, @@ -169,7 +195,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop MMTSJType mmtsj = 
checkTransposeSelf(); //determine tsmm pattern ChainType chain = checkMapMultChain(); //determine mmchain pattern - if( et == ExecType.CP ) + if( et == ExecType.CP || et == ExecType.GPU ) { //matrix mult operation selection part 3 (CP type) _method = optFindMMultMethodCP ( input1.getDim1(), input1.getDim2(), @@ -178,7 +204,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop //dispatch CP lops construction switch( _method ){ case TSMM: - constructCPLopsTSMM( mmtsj ); + constructCPLopsTSMM( mmtsj, et ); break; case MAPMM_CHAIN: constructCPLopsMMChain( chain ); @@ -187,7 +213,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop constructCPLopsPMM(); break; case MM: - constructCPLopsMM(); + constructCPLopsMM(et); break; default: throw new HopsException(this.printErrorLocation() + "Invalid Matrix Mult Method (" + _method + ") while constructing CP lops."); @@ -344,7 +370,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop { double ret = 0; - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { // In GPU Mode, intermediate memory is only needed in case of one of the matrix blocks is sparse // When sparse block is converted to dense and a dense MM takes place, we need (dim1 * dim2) // When dense block is converted to sparse and a sparse MM takes place, we need (dim1 * dim2 * 2) @@ -581,17 +607,11 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop // CP Lops generation ///////////////////////// - private void constructCPLopsTSMM( MMTSJType mmtsj ) + private void constructCPLopsTSMM( MMTSJType mmtsj, ExecType et ) throws HopsException, LopsException { int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - ExecType et = ExecType.CP; - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) { - et = ExecType.GPU; - } - Lop matmultCP = new 
MMTSJ(getInput().get(mmtsj.isLeft()?1:0).constructLops(), getDataType(), getValueType(), et, mmtsj, false, k); @@ -662,13 +682,12 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop HopRewriteUtils.removeChildReference(pmInput, nrow); } - private void constructCPLopsMM() + private void constructCPLopsMM(ExecType et) throws HopsException, LopsException { Lop matmultCP = null; - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) { + if (et == ExecType.GPU) { Hop h1 = getInput().get(0); Hop h2 = getInput().get(1); Lop left; Lop right; @@ -691,7 +710,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop } matmultCP = new Binary(left, right, - Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.GPU, isLeftTransposed, isRightTransposed); + Binary.OperationTypes.MATMULT, getDataType(), getValueType(), et, isLeftTransposed, isRightTransposed); setOutputDimensions(matmultCP); setNnz(-1); } @@ -702,7 +721,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop else { int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); matmultCP = new Binary(getInput().get(0).constructLops(),getInput().get(1).constructLops(), - Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.CP, k); + Binary.OperationTypes.MATMULT, getDataType(), getValueType(), et, k); } setOutputDimensions(matmultCP); } http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/AggUnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java index 7a6d463..4f5e2bc 100644 --- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java @@ -38,7 +38,6 @@ import 
org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; -import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; @@ -109,6 +108,30 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop } @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + + try { + if( isTernaryAggregateRewriteApplicable() || isUnaryAggregateOuterCPRewriteApplicable() ) { + return false; + } + else if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.MIN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.MEAN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.VAR && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.PROD && (_direction == Direction.RowCol))){ + return true; + } + } catch (HopsException e) { + throw new RuntimeException(e); + } + return false; + } + + @Override public Lop constructLops() throws HopsException, LopsException { @@ -121,10 +144,10 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop ExecType et = optFindExecType(); Hop input = getInput().get(0); - if ( et == ExecType.CP ) + if ( et == ExecType.CP || et == ExecType.GPU ) { Lop agg1 = null; - if( isTernaryAggregateRewriteApplicable(et) ) { + if( isTernaryAggregateRewriteApplicable() ) 
{ agg1 = constructLopsTernaryAggregateRewrite(et); } else if( isUnaryAggregateOuterCPRewriteApplicable() ) @@ -149,20 +172,6 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop } else { //general case int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) { - // Only implemented methods for GPU - if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.MIN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.MEAN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.VAR && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.PROD && (_direction == Direction.RowCol))){ - et = ExecType.GPU; - k = 1; - } - } agg1 = new PartialAggregate(input.constructLops(), HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k); } @@ -251,7 +260,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop DirectionTypes dir = HopsDirection2Lops.get(_direction); //unary aggregate - if( isTernaryAggregateRewriteApplicable(et) ) + if( isTernaryAggregateRewriteApplicable() ) { Lop aggregate = constructLopsTernaryAggregateRewrite(et); setOutputDimensions(aggregate); //0x0 (scalar) @@ -330,7 +339,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop protected double computeOutputMemEstimate( long dim1, long dim2, long nnz 
) { double sparsity = -1; - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { // The GPU version (for the time being) only does dense outputs sparsity = 1.0; } else { @@ -373,7 +382,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop break; case VAR: //worst-case correction LASTFOURROWS / LASTFOURCOLUMNS - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { // The GPU implementation only operates on dense data // It allocates 2 dense blocks to help with these ops: // Assume Y = var(X) Or colVars(X), Or rowVars(X) @@ -506,7 +515,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop return SparkAggType.MULTI_BLOCK; } - private boolean isTernaryAggregateRewriteApplicable(ExecType et) + private boolean isTernaryAggregateRewriteApplicable() throws HopsException { boolean ret = false; @@ -726,6 +735,8 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop // The execution type of a unary aggregate instruction should depend on the execution type of inputs to avoid OOM // Since we only support matrix-vector and not vector-matrix, checking the execution type of input1 should suffice. ExecType et_input = input1.optFindExecType(); + // Because ternary aggregate are not supported on GPU + et_input = et_input == ExecType.GPU ? 
ExecType.CP : et_input; DirectionTypes dir = HopsDirection2Lops.get(_direction); return new TernaryAggregate(in1, in2, in3, Aggregate.OperationTypes.KahanSum, http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/BinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java index 54c06f7..ad9f0ad 100644 --- a/src/main/java/org/apache/sysml/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java @@ -53,7 +53,6 @@ import org.apache.sysml.lops.UnaryCP; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; -import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput; @@ -134,6 +133,56 @@ public class BinaryOp extends Hop } @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + + switch(op) + { + case IQM: + case CENTRALMOMENT: + case COVARIANCE: + case QUANTILE: + case INTERQUANTILE: + case MEDIAN: + return false; + case CBIND: + case RBIND: { + DataType dt1 = getInput().get(0).getDataType(); + return dt1 == DataType.MATRIX; // only matrix cbind, rbind supported on GPU + } + default: { + DataType dt1 = getInput().get(0).getDataType(); + DataType dt2 = getInput().get(1).getDataType(); + + boolean isMatrixScalar = (dt1 == DataType.MATRIX && dt2 == DataType.SCALAR) || (dt1 == DataType.SCALAR && dt2 == DataType.MATRIX); + boolean isMatrixMatrix = (dt1 == DataType.MATRIX && dt2 == DataType.MATRIX); + + OpOp2 [] supportedOps = { OpOp2.MULT, OpOp2.PLUS, OpOp2.MINUS, OpOp2.DIV, OpOp2.POW, OpOp2.MINUS1_MULT, + OpOp2.MODULUS, OpOp2.INTDIV, OpOp2.LESS, 
OpOp2.LESSEQUAL, OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL}; + + if(isMatrixScalar && op == OpOp2.MINUS_NZ) { + // Only supported for matrix scalar: + return true; + } + else if(isMatrixMatrix && op == OpOp2.SOLVE) { + // Only supported for matrix matrix: + return true; + } + else if(isMatrixScalar || isMatrixMatrix) { + for(OpOp2 supportedOp : supportedOps) { + if(op == supportedOp) + return true; + } + return false; + } + else + return false; + } + } + } + + @Override public Lop constructLops() throws HopsException, LopsException { @@ -527,11 +576,6 @@ public class BinaryOp extends Hop } else //CP { - if (DMLScript.USE_ACCELERATOR && dt1 == DataType.MATRIX && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < GPUContextPool.initialGPUMemBudget())) { - et = ExecType.GPU; - } - Lop offset = createOffsetLop( getInput().get(0), cbind ); //offset 1st input append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind, et); append.getOutputParameters().setDimensions(rlen, clen, getRowsInBlock(), getColsInBlock(), getNnz()); @@ -582,14 +626,6 @@ public class BinaryOp extends Hop else //general case ot = HopsOpOp2LopsU.get(op); - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget())) - && (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW - || op == OpOp2.MINUS_NZ || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV - || op == OpOp2.LESS || op == OpOp2.LESSEQUAL || op == OpOp2.EQUAL || op == OpOp2.NOTEQUAL - || op == OpOp2.GREATER || op == OpOp2.GREATEREQUAL)) { - et = ExecType.GPU; - } Unary unary1 = new Unary(getInput().get(0).constructLops(), getInput().get(1).constructLops(), ot, getDataType(), getValueType(), et); @@ -602,17 +638,8 @@ public class BinaryOp extends Hop { // Both operands are 
Matrixes ExecType et = optFindExecType(); - if ( et == ExecType.CP ) + if ( et == ExecType.CP || et == ExecType.GPU ) { - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget())) - && (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW - || op == OpOp2.SOLVE || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV - || op == OpOp2.LESS || op == OpOp2.LESSEQUAL || op == OpOp2.EQUAL || op == OpOp2.NOTEQUAL - || op == OpOp2.GREATER || op == OpOp2.GREATEREQUAL)) { - et = ExecType.GPU; - } - Lop binary = null; boolean isLeftXGt = (getInput().get(0) instanceof BinaryOp) && ((BinaryOp) getInput().get(0)).getOp() == OpOp2.GREATER; @@ -827,7 +854,7 @@ public class BinaryOp extends Hop ret = getInput().get(0).getMemEstimate() * 3; } else if ( op == OpOp2.SOLVE ) { - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { // Solve on the GPU takes an awful lot of intermediate space // First the inputs are converted from row-major to column major // Then a workspace and a temporary output (workSize, tauSize) are needed http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/ConvolutionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java index a3d8a81..2b9335c 100644 --- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java +++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java @@ -19,6 +19,7 @@ package org.apache.sysml.hops; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.hops.Hop.MultiThreadedHop; import org.apache.sysml.lops.ConvolutionTransform; import org.apache.sysml.lops.ConvolutionTransform.OperationTypes; @@ -79,6 +80,13 @@ public class ConvolutionOp extends Hop implements 
MultiThreadedHop } @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + return true; + } + + @Override public Lop constructLops() throws HopsException, LopsException { @@ -315,12 +323,12 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop if( _etypeForced != null ) { - _etype = findGPUExecTypeByMemEstimate(_etypeForced); + _etype = _etypeForced; } else { if ( OptimizerUtils.isMemoryBasedOptLevel() ) { - _etype = findGPUExecTypeByMemEstimate(findExecTypeByMemEstimate()); + _etype = findExecTypeByMemEstimate(); } else { _etype = REMOTE; http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/DataGenOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/DataGenOp.java b/src/main/java/org/apache/sysml/hops/DataGenOp.java index ce08dbc..89a5814 100644 --- a/src/main/java/org/apache/sysml/hops/DataGenOp.java +++ b/src/main/java/org/apache/sysml/hops/DataGenOp.java @@ -146,6 +146,11 @@ public class DataGenOp extends Hop implements MultiThreadedHop } @Override + public boolean isGPUEnabled() { + return false; + } + + @Override public Lop constructLops() throws HopsException, LopsException { @@ -502,4 +507,5 @@ public class DataGenOp extends Hop implements MultiThreadedHop return ret; } + } http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/DataOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/DataOp.java b/src/main/java/org/apache/sysml/hops/DataOp.java index bcded04..f410210 100644 --- a/src/main/java/org/apache/sysml/hops/DataOp.java +++ b/src/main/java/org/apache/sysml/hops/DataOp.java @@ -241,6 +241,11 @@ public class DataOp extends Hop } @Override + public boolean isGPUEnabled() { + return false; + } + + @Override public Lop constructLops() throws HopsException, 
LopsException { http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/FunctionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/FunctionOp.java b/src/main/java/org/apache/sysml/hops/FunctionOp.java index c677bb8..3ad2d15 100644 --- a/src/main/java/org/apache/sysml/hops/FunctionOp.java +++ b/src/main/java/org/apache/sysml/hops/FunctionOp.java @@ -209,6 +209,11 @@ public class FunctionOp extends Hop } @Override + public boolean isGPUEnabled() { + return false; + } + + @Override public Lop constructLops() throws HopsException, LopsException { http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/Hop.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java index bfbdbaf..1cf875f 100644 --- a/src/main/java/org/apache/sysml/hops/Hop.java +++ b/src/main/java/org/apache/sysml/hops/Hop.java @@ -192,7 +192,9 @@ public abstract class Hop public void checkAndSetForcedPlatform() { - if ( DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE ) + if(DMLScript.USE_ACCELERATOR && DMLScript.FORCE_ACCELERATOR && isGPUEnabled()) + _etypeForced = ExecType.GPU; + else if ( DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE ) _etypeForced = ExecType.CP; else if ( DMLScript.rtplatform == RUNTIME_PLATFORM.HADOOP ) _etypeForced = ExecType.MR; @@ -768,8 +770,12 @@ public abstract class Hop protected ExecType findExecTypeByMemEstimate() { ExecType et = null; char c = ' '; - if ( getMemEstimate() < OptimizerUtils.getLocalMemBudget() ) { - et = ExecType.CP; + double memEst = getMemEstimate(); + if ( memEst < OptimizerUtils.getLocalMemBudget() ) { + if (DMLScript.USE_ACCELERATOR && isGPUEnabled() && memEst < GPUContextPool.initialGPUMemBudget()) + et = ExecType.GPU; + else + et = ExecType.CP; } else { if( 
DMLScript.rtplatform == DMLScript.RUNTIME_PLATFORM.HYBRID ) @@ -788,14 +794,6 @@ public abstract class Hop return et; } - - protected ExecType findGPUExecTypeByMemEstimate(ExecType et) { - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) { - return ExecType.GPU; - } - return et; - } public ArrayList<Hop> getParent() { return _parent; @@ -850,6 +848,18 @@ public abstract class Hop public abstract String getOpString(); + /** + * In memory-based optimizer mode (see OptimizerUtils.isMemoryBasedOptLevel()), + * the exectype is determined by checking this method as well as memory budget of this Hop. + * Please see findExecTypeByMemEstimate for more detail. + * + * This method is necessary because not all operators are supported efficiently + * on GPU (for example: operations on frames and scalars as well as operations such as table). + * + * @return true if the Hop is eligible for GPU Exectype. 
+ */ + public abstract boolean isGPUEnabled(); + protected boolean isVector() { return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) ); } http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/IndexingOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java b/src/main/java/org/apache/sysml/hops/IndexingOp.java index 5a27ed6..5f2ce34 100644 --- a/src/main/java/org/apache/sysml/hops/IndexingOp.java +++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java @@ -94,6 +94,11 @@ public class IndexingOp extends Hop public void setColLowerEqualsUpper(boolean passed) { _colLowerEqualsUpper = passed; } + + @Override + public boolean isGPUEnabled() { + return false; + } @Override public Lop constructLops() http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java b/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java index a641622..02e7753 100644 --- a/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java +++ b/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java @@ -99,6 +99,11 @@ public class LeftIndexingOp extends Hop } @Override + public boolean isGPUEnabled() { + return false; + } + + @Override public Lop constructLops() throws HopsException, LopsException { http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/LiteralOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/LiteralOp.java b/src/main/java/org/apache/sysml/hops/LiteralOp.java index b96d032..16ebf1b 100644 --- a/src/main/java/org/apache/sysml/hops/LiteralOp.java +++ b/src/main/java/org/apache/sysml/hops/LiteralOp.java @@ -73,6 +73,11 @@ public class LiteralOp extends 
Hop public void checkArity() throws HopsException { HopsException.check(_input.isEmpty(), this, "should have 0 inputs but has %d inputs", _input.size()); } + + @Override + public boolean isGPUEnabled() { + return false; + } @Override public Lop constructLops() http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/MultipleOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/MultipleOp.java b/src/main/java/org/apache/sysml/hops/MultipleOp.java index 5fb6b29..5c178c0 100644 --- a/src/main/java/org/apache/sysml/hops/MultipleOp.java +++ b/src/main/java/org/apache/sysml/hops/MultipleOp.java @@ -80,6 +80,11 @@ public class MultipleOp extends Hop { public String getOpString() { return "m(" + _op.name().toLowerCase() + ")"; } + + @Override + public boolean isGPUEnabled() { + return false; + } /** * Construct the corresponding Lops for this Hop http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java b/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java index ab276d7..a611893 100644 --- a/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java +++ b/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java @@ -175,6 +175,11 @@ public class ParameterizedBuiltinOp extends Hop implements MultiThreadedHop } @Override + public boolean isGPUEnabled() { + return false; + } + + @Override public Lop constructLops() throws HopsException, LopsException { http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/QuaternaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/QuaternaryOp.java 
b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java index 6517de6..17188be 100644 --- a/src/main/java/org/apache/sysml/hops/QuaternaryOp.java +++ b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java @@ -189,6 +189,11 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop } @Override + public boolean isGPUEnabled() { + return false; + } + + @Override public Lop constructLops() throws HopsException, LopsException { http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/ReorgOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java index 3e27eb3..f0560d3 100644 --- a/src/main/java/org/apache/sysml/hops/ReorgOp.java +++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java @@ -34,7 +34,6 @@ import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.lops.Transform.OperationTypes; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; -import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; /** @@ -129,6 +128,35 @@ public class ReorgOp extends Hop implements MultiThreadedHop s += "r(" + HopsTransf2String.get(op) + ")"; return s; } + + @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + switch( op ) { + case TRANSPOSE: { + Lop lin; + try { + lin = getInput().get(0).constructLops(); + } catch (HopsException | LopsException e) { + throw new RuntimeException("Unable to create child lop", e); + } + if( lin instanceof Transform && ((Transform)lin).getOperationType()==OperationTypes.Transpose ) + return false; //if input is already a transpose, avoid redundant transpose ops + else if( getDim1()==1 && getDim2()==1 ) + return false; //if input of size 1x1, avoid unnecessary transpose + else + return 
true; + } + case DIAG: + case REV: + case RESHAPE: + case SORT: + return false; + default: + throw new RuntimeException("Unsupported operator:" + op.name()); + } + } @Override public Lop constructLops() @@ -151,10 +179,6 @@ public class ReorgOp extends Hop implements MultiThreadedHop setLops(lin); //if input of size 1x1, avoid unnecessary transpose else { //general case int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) { - et = ExecType.GPU; - } Transform transform1 = new Transform( lin, HopsTransf2Lops.get(op), getDataType(), getValueType(), et, k); setOutputDimensions(transform1); http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/TernaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/TernaryOp.java b/src/main/java/org/apache/sysml/hops/TernaryOp.java index 98c8ad3..47b012e 100644 --- a/src/main/java/org/apache/sysml/hops/TernaryOp.java +++ b/src/main/java/org/apache/sysml/hops/TernaryOp.java @@ -42,7 +42,6 @@ import org.apache.sysml.lops.PartialAggregate.CorrectionLocationType; import org.apache.sysml.parser.Statement; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; -import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; /** Primary use cases for now, are @@ -128,6 +127,25 @@ public class TernaryOp extends Hop } @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + switch( _op ) { + case CENTRALMOMENT: + case COVARIANCE: + case CTABLE: + case INTERQUANTILE: + case QUANTILE: + return false; + case MINUS_MULT: + case PLUS_MULT: + return true; + default: + throw new 
RuntimeException("Unsupported operator:" + _op.name()); + } + } + + @Override public Lop constructLops() throws HopsException, LopsException { @@ -631,13 +649,7 @@ public class TernaryOp extends Hop if ( _op != OpOp3.PLUS_MULT && _op != OpOp3.MINUS_MULT ) throw new HopsException("Unexpected operation: " + _op + ", expecting " + OpOp3.PLUS_MULT + " or" + OpOp3.MINUS_MULT); - ExecType et = null; - if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR - || getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) { - et = ExecType.GPU; - } else { - et = optFindExecType(); - } + ExecType et = optFindExecType(); PlusMult plusmult = null; if( et == ExecType.CP || et == ExecType.SPARK || et == ExecType.GPU ) { @@ -711,7 +723,7 @@ public class TernaryOp extends Hop return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0); case PLUS_MULT: case MINUS_MULT: { - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { // For the GPU, the input is converted to dense sparsity = 1.0; } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/UnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java index 2b31247..0a5bc65 100644 --- a/src/main/java/org/apache/sysml/hops/UnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java @@ -99,6 +99,29 @@ public class UnaryOp extends Hop implements MultiThreadedHop } @Override + public boolean isGPUEnabled() { + if(!DMLScript.USE_ACCELERATOR) + return false; + boolean isScalar = ( getDataType() == DataType.SCALAR //value type casts or matrix to scalar + || (_op == OpOp1.CAST_AS_MATRIX && getInput().get(0).getDataType()==DataType.SCALAR) + || (_op == OpOp1.CAST_AS_FRAME && getInput().get(0).getDataType()==DataType.SCALAR)); + if(!isScalar) { + switch(_op) { + case SELP:case EXP:case 
SQRT:case LOG:case ABS: + case ROUND:case FLOOR:case CEIL: + case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN: + case SIGN: + return true; + default: + return false; + } + } + else { + return false; + } + } + + @Override public Lop constructLops() throws HopsException, LopsException { @@ -149,7 +172,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop ExecType et = optFindExecType(); //special handling cumsum/cumprod/cummin/cumsum - if( isCumulativeUnaryOperation() && et != ExecType.CP ) + if( isCumulativeUnaryOperation() && !(et == ExecType.CP || et == ExecType.GPU) ) { //TODO additional physical operation if offsets fit in memory Lop cumsumLop = null; @@ -162,15 +185,6 @@ public class UnaryOp extends Hop implements MultiThreadedHop else //default unary { int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1; - switch(_op) { - case SELP:case EXP:case SQRT:case LOG:case ABS: - case ROUND:case FLOOR:case CEIL: - case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN: - case SIGN: - et = findGPUExecTypeByMemEstimate(et); - break; - default: - } Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op), getDataType(), getValueType(), et, k); setOutputDimensions(unary1); @@ -550,7 +564,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop protected double computeOutputMemEstimate( long dim1, long dim2, long nnz ) { double sparsity = -1; - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { sparsity = 1.0; // Output is always dense (for now) on the GPU } else { sparsity = OptimizerUtils.getSparsity(dim1, dim2, nnz); @@ -569,7 +583,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop ret = getInput().get(0).getMemEstimate() * 3; } - if (DMLScript.USE_ACCELERATOR) { + if (isGPUEnabled()) { OptimizerUtils.estimateSize(dim1, dim2); // Intermediate memory required to convert sparse to dense } 
http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java index 0d4b8db..247a142 100644 --- a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java +++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java @@ -283,4 +283,9 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop return ret; } + + @Override + public boolean isGPUEnabled() { + return false; + } }