http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/ConvolutionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java index 2b9335c..59ac29e 100644 --- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java +++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java @@ -191,7 +191,7 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop // // TODO: Inserting reblock requires knowing columns apriori // ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k); // setReblockedOutputDimension(et, transform1); - ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k); + ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, computeIntermediateMemEstimate(-1, -1, -1 )); setOutputDimensions(transform1); setLineNumbers(transform1); @@ -223,13 +223,171 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sparsity); } + // --------------------------------------------------------------- + // Utility methods to guard the computation of memory estimates in presense of unknowns + private static class IntermediateDimensions { + int dim1; int dim2; double sp; + public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str, double sp) { + dim1 = (int) h.getDim(dim1Str); + dim2 = (int) h.getDim(dim2Str); + this.sp = sp; + } + public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str) { + dim1 = (int) h.getDim(dim1Str); + dim2 = (int) h.getDim(dim2Str); + sp = 1; + } + public IntermediateDimensions(ConvolutionOp h, int dim1, String dim2Str) { + this.dim1 = dim1; + dim2 = (int) h.getDim(dim2Str); + sp = 1; + } + + /** + * Add two computed memory estimates + * + * @param val1 memory estimate 1 + * @param val2 memory estimate 2 + * @return sum of memory estimates + */ + static double guardedAdd(double val1, double val2) { + if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE; + double ret = val1 + val2; + if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE; + else return ret; + } + + /** + * Compute memory estimates for given intermediate matrices + * + * @param intermediates list of intermediates + * @param numWorkers number of workers + * @return memory estimate + */ + public static double addEstimateSizes(ArrayList<IntermediateDimensions> intermediates, int numWorkers) { + double memBudget = 0; + for(int i = 0; i < intermediates.size(); i++) { + memBudget = guardedAdd(memBudget, OptimizerUtils.estimateSizeExactSparsity( + intermediates.get(i).dim1, intermediates.get(i).dim2, intermediates.get(i).sp)*numWorkers); + } + return memBudget; + } + + /** + * Compute max of two computed memory estimates + * @param val1 memory estimate 1 + * @param val2 memory estimate 2 + * @return max of memory estimates + */ + public static double guardedMax(double val1, double val2) { + if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE; + double ret = Math.max(val1, val2); + if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE; + else return ret; + } + } + + /** + * Helper utility to compute intermediate memory estimate + * + * @param gpuIntermediates 
intermediates for GPU + * @param cpIntermediates intermediates for CP + * @return memory estimates + */ + private double computeIntermediateMemEstimateHelper( + ArrayList<IntermediateDimensions> gpuIntermediates, + ArrayList<IntermediateDimensions> cpIntermediates) { + // Since CP operators use row-level parallelism by default + int numWorkers = (int) Math.min(OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), Math.max(getDim("N"), 1)); + if(DMLScript.USE_ACCELERATOR) { + // Account for potential sparse-to-dense conversion + double gpuMemBudget = IntermediateDimensions.addEstimateSizes(gpuIntermediates, 1); + double cpMemoryBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers); + if(cpMemoryBudget > gpuMemBudget) { + double oneThreadCPMemBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, 1); + if(oneThreadCPMemBudget <= gpuMemBudget) { + // Why limit CPU ? in-order to give more opportunity to compile GPU operators + cpMemoryBudget = oneThreadCPMemBudget; + } + } + // Finally, use the maximum of CP and GPU memory budget + return IntermediateDimensions.guardedMax(cpMemoryBudget, gpuMemBudget); + } + else { + // When -gpu flag is not provided, the memory estimates for CP are not affected. + return IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers); + } + } + @Override - protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz ) + protected double computeIntermediateMemEstimate( long ignoreDim1, long ignoreDim2, long ignoreNnz ) { - //default: no intermediate memory requirements - return 0; + ArrayList<IntermediateDimensions> gpuIntermediates = new ArrayList<IntermediateDimensions>(); + ArrayList<IntermediateDimensions> cpIntermediates = new ArrayList<IntermediateDimensions>(); + if(getOp() == ConvOp.DIRECT_CONV2D) { + // Assumption: To compile a GPU conv2d operator, following should fit on the GPU: + // 1. output in dense format (i.e. computeOutputMemEstimate) + // 2. input in any format + // 3. atleast one input row in dense format + // 4. filter in dense format + + // Account for potential sparse-to-dense conversion of atleast 1 input row and filter + gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW")); + gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS")); + + // im2col operation preserves the worst-case sparsity of the input. + cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity())); + } + else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) { + // Assumption: To compile a GPU conv2d_backward_data operator, following should fit on the GPU: + // 1. output in dense format (i.e. computeOutputMemEstimate) + // 2. dout in any format + // 3. atleast one dout row in dense format + // 4. 
filter in dense format + + // Account for potential sparse-to-dense conversion of atleast 1 input row and filter + gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ")); + gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS")); + + // There are 2 intermediates: rotate180 and input to col2im for conv2d_backward_data + // rotate180 preserves the "exact" sparsity of the dout matrix + cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity())); + // Note: worst-case sparsity for the input of col2im (of size NPQ x CRS where N is determined by degree of parallelism) + cpIntermediates.add(new IntermediateDimensions(this, "PQ", "CRS")); + } + else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) { + // Assumption: To compile a GPU conv2d_backward_filter operator, following should fit on the GPU: + // 1. output in dense format (i.e. computeOutputMemEstimate) + // 2. dout in any format + // 3. atleast one dout and input row in dense format + + // Account for potential sparse-to-dense conversion of atleast 1 input + dout row + gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW")); + gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ")); + + // There are 2 intermediates: im2col and rotate180 for conv2d_backward_filter + // rotate180 preserves the "exact" sparsity of the dout matrix + cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity())); + // im2col operation preserves the worst-case sparsity of the input. + cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity())); + } + else if(getOp() == ConvOp.MAX_POOLING) { + // Account for potential sparse-to-dense conversion of atleast 1 input row + gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW")); + } + else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) { + // Account for potential sparse-to-dense conversion of atleast 1 input + dout row + gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW")); + gpuIntermediates.add(new IntermediateDimensions(this, 1, "CPQ")); + } + + if(gpuIntermediates.size() > 0 || cpIntermediates.size() > 0) + return computeIntermediateMemEstimateHelper(gpuIntermediates, cpIntermediates); + else + return 0; } + @Override protected long[] inferOutputCharacteristics( MemoTable memo ) { @@ -243,65 +401,9 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop ret[2] = -1; return (ret[0]>0 && ret[1]>0) ? 
ret : null; } - - ConvolutionParameters params; - try { - params = parseInput(); - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); - } - switch(op) - { - case MAX_POOLING: { - // input - long N = getInput().get(0)._dim1; - ret[0] = N; - ret[1] = getExtractedVal(params.C, params.P, params.Q); - ret[2] = -1; - break; - } - case DIRECT_CONV2D: { - // input, filter - long N = getInput().get(0)._dim1; - ret[0] = N; - ret[1] = getExtractedVal(params.K, params.P, params.Q); - ret[2] = -1; - break; - } - case DIRECT_CONV2D_BACKWARD_FILTER: { - // input, dout - ret[0] = params.K; - ret[1] = getExtractedVal(params.C, params.R, params.S); - ret[2] = -1; - break; - } - case MAX_POOLING_BACKWARD: { - // input, dout - ret[0] = getInput().get(0)._dim1; - ret[1] = getInput().get(0)._dim2; - ret[2] = -1; - break; - } - case DIRECT_CONV2D_BACKWARD_DATA: { - // filter, dout - long N = getInput().get(1)._dim1; - ret[0] = N; - ret[1] = getExtractedVal(params.C, params.H, params.W); - ret[2] = -1; - break; - } - default: - throw new RuntimeException("Unsupported op:" + op.name()); - } - - if(LOG.isDebugEnabled() && (ret[0] <= 0 || ret[1] <= 0)) { - LOG.debug("Unknown dimensions for ConvolutionOp in inferOutputCharacteristics:" + op.name() + " " + ret[0] + " " + ret[1] + - " img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" + - " filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + - " output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" + - " pad=[" + params.pad_h + " " + params.pad_w + "]"); - } + refreshSizeInformation(); + ret[0] = _dim1; ret[1] = _dim2; ret[2] = _nnz; //safe return (create entry only if at least dims known) return (ret[0]>0 && ret[1]>0) ? 
ret : null; @@ -347,50 +449,44 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop return _etype; } + // Caching parameters speed-ups dynamic recompilation time by avoiding unnecessary computeSizeInformation + private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads); // stride1, stride2, padding1, padding2 // input_shape1, input_shape2, input_shape3, input_shape4, // filter_shape1, filter_shape2, filter_shape3, filter_shape4 ConvolutionParameters parseInput() throws DMLRuntimeException { - ConvolutionParameters params = null; if(op == ConvOp.MAX_POOLING_BACKWARD || op == ConvOp.DIRECT_CONV2D || op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER || op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) { - params = new ConvolutionParameters( - computeSizeInformation(getInput().get(6)), - computeSizeInformation(getInput().get(7)), - computeSizeInformation(getInput().get(8)), - computeSizeInformation(getInput().get(9)), - computeSizeInformation(getInput().get(10)), - computeSizeInformation(getInput().get(12)), - computeSizeInformation(getInput().get(13)), - computeSizeInformation(getInput().get(2)), - computeSizeInformation(getInput().get(3)), - computeSizeInformation(getInput().get(4)), - computeSizeInformation(getInput().get(5)), _maxNumThreads); + _cachedParams.setIfUnknown( + getInput().get(6), + getInput().get(7), + getInput().get(8), + getInput().get(9), + getInput().get(10), + getInput().get(12), + getInput().get(13), + getInput().get(2), + getInput().get(3), + getInput().get(4), + getInput().get(5), _maxNumThreads); } else { - params = new ConvolutionParameters( - computeSizeInformation(getInput().get(5)), - computeSizeInformation(getInput().get(6)), - computeSizeInformation(getInput().get(7)), - computeSizeInformation(getInput().get(8)), - computeSizeInformation(getInput().get(9)), - computeSizeInformation(getInput().get(11)), - computeSizeInformation(getInput().get(12)), - computeSizeInformation(getInput().get(1)), - computeSizeInformation(getInput().get(2)), - computeSizeInformation(getInput().get(3)), - computeSizeInformation(getInput().get(4)), _maxNumThreads); - } - return params; - } - - public static long getExtractedVal(long val1, long val2, long val3) { - if(val1 == -1 || val2 == -1 || val3 == -1) { - return -1; + _cachedParams.setIfUnknown( + getInput().get(5), + getInput().get(6), + getInput().get(7), + getInput().get(8), + getInput().get(9), + getInput().get(11), + getInput().get(12), + getInput().get(1), + getInput().get(2), + getInput().get(3), + getInput().get(4), _maxNumThreads); } - return val1*val2*val3; + return _cachedParams; } @Override @@ -400,72 +496,50 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop Hop input1 = getInput().get(0); setDim1(input1.getDim1()); setDim2(input1.getDim2()); + _nnz = -1; // cannot infer stats return; } - ConvolutionParameters params; - try { - params = parseInput(); - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); - } - switch(op) { case MAX_POOLING: { - // input - long N = getInput().get(0)._dim1; - _dim1 = N; - _dim2 = getExtractedVal(params.C, params.P, params.Q); + _dim1 = getDim("N"); + _dim2 = getDim("CPQ"); _nnz = -1; // cannot infer stats break; } case MAX_POOLING_BACKWARD: { - // input, dout - _dim1 = getInput().get(0)._dim1; - _dim2 = getInput().get(0)._dim2; + _dim1 = getDim("N"); + _dim2 = getDim("CHW"); _nnz = -1; break; } case DIRECT_CONV2D: { - // input, filter - long N = getInput().get(0)._dim1; - _dim1 = 
N; - _dim2 = getExtractedVal(params.K, params.P, params.Q); + _dim1 = getDim("N"); + _dim2 = getDim("KPQ"); _nnz = -1; // cannot infer stats break; } case DIRECT_CONV2D_BACKWARD_DATA: { - // filter, dout - long N = getInput().get(1)._dim1; - _dim1 = N; - _dim2 = getExtractedVal(params.C, params.H, params.W); + _dim1 = getDim("N"); + _dim2 = getDim("CHW"); _nnz = -1; // cannot infer stats break; } case DIRECT_CONV2D_BACKWARD_FILTER: { - // input, dout - _dim1 = params.K; - _dim2 = getExtractedVal(params.C, params.R, params.S); + _dim1 = getDim("K"); + _dim2 = getDim("CRS"); _nnz = -1; // cannot infer stats break; } default: throw new RuntimeException("The sizes are not refreshed for " + op.name()); } - - if(LOG.isDebugEnabled() && (_dim1 <= 0 || _dim2 <= 0)) { - LOG.debug("Unknown dimensions for ConvolutionOp in refreshSizeInformation:" + op.name() + " " + _dim1 + " " + _dim2 + - " img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" + - " filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + - " output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" + - " pad=[" + params.pad_h + " " + params.pad_w + "]"); - } } @Override @@ -511,4 +585,132 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop public int getMaxNumThreads() { return _maxNumThreads; } + + + // ------------------------------------------------------------------------------------------------------ + // Utility methods to get the dimensions taking into account unknown dimensions + + /** + * Convenient method to get the dimensions required by ConvolutionOp. + * + * @param dimString can be K, CRS, N, CHW, KPQ, PQ + * @return either -1 or value associated with the dimString + */ + private long getDim(String dimString) { + if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) { + throw new RuntimeException("getDim method should not be invoked for bias_add and bias_multiply"); + } + ConvolutionParameters params; + try { + params = parseInput(); + } catch (DMLRuntimeException e) { + throw new RuntimeException(e); + } + Hop filter = null; // shape: K x CRS + Hop input = null; // shape: N x CHW + Hop dout = null; // shape: N x KPQ + Hop dout1 = null; // shape: N x CPQ + + if(getOp() == ConvOp.DIRECT_CONV2D) { + input = getInput().get(0); + filter = getInput().get(1); + } + else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) { + filter = getInput().get(0); + dout = getInput().get(1); + } + else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) { + input = getInput().get(0); + dout = getInput().get(1); + } + else if(getOp() == ConvOp.MAX_POOLING) { + input = getInput().get(0); + } + else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) { + input = getInput().get(0); + dout1 = getInput().get(1); + } + + long ret = -1; + if(dimString.equals("K") && filter != null) { + ret = getNonNegative(ret, getNonNegative(params.K, filter._dim1)); + } + else if(dimString.equals("CRS") && filter != null) { + ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.R, params.S), filter._dim2)); + } + else if(dimString.equals("N") && input != null) { + ret = getNonNegative(ret, getNonNegative(params.N, input._dim1)); + } + else if(dimString.equals("CHW") && input != null) { + ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.H, params.W), input._dim2)); + } + else if(dimString.equals("N") && dout != null) { + ret = getNonNegative(ret, getNonNegative(params.N, dout._dim1)); + } + 
else if(dimString.equals("KPQ") && dout != null) { + ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.K, params.P, params.Q), dout._dim2)); + } + else if(dimString.equals("N") && dout1 != null) { + ret = getNonNegative(ret, getNonNegative(params.N, dout1._dim1)); + } + else if(dimString.equals("CPQ") && dout1 != null) { + ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.P, params.Q), dout1._dim2)); + } + else if(dimString.equals("K")) { + ret = getNonNegative(ret, params.K >= 0 ? params.K : -1); + } + else if(dimString.equals("CRS")) { + ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.R, params.S)); + } + else if(dimString.equals("N")) { + ret = getNonNegative(ret, params.N >= 0 ? params.N : -1); + } + else if(dimString.equals("CHW")) { + ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.H, params.W)); + } + else if(dimString.equals("KPQ")) { + ret = getNonNegative(ret, nonNegativeMultiply(params.K, params.P, params.Q)); + } + else if(dimString.equals("PQ")) { + ret = getNonNegative(ret, nonNegativeMultiply(params.P, params.Q)); + } + else if(dimString.equals("CPQ")) { + ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.P, params.Q)); + } + else { + throw new RuntimeException("Unsupported dimension:" + dimString + " for operator " + getOp().name()); + } + + if(LOG.isDebugEnabled() && ret < 0) { + LOG.debug("Unknown dimension " + dimString + " for ConvolutionOp:" + op.name() + + " img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" + + " filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + + " output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" + + " pad=[" + params.pad_h + " " + params.pad_w + "]"); + } + return ret; + } + + private long nonNegativeMultiply(long val1, long val2, long val3) { + if(val1 >= 0 && val2 >= 0 && val3 >= 0) { + return val1 * val2 * val3; + } + else return -1; + } + private long nonNegativeMultiply(long val1, long val2) { + if(val1 >= 0 && val2 >= 0) { + return val1 * val2; + } + else return -1; + } + private long getNonNegative(long val1, long val2) { + if(val1 >= 0 && val2 >= 0) { + if(val1 == val2) return val1; + else throw new RuntimeException("Incorrect dimensions in Convolution Hop: " + val1 + " != " + val2); + } + else if(val1 >= 0) return val1; + else if(val2 >= 0) return val2; + else return -1; + } + // ------------------------------------------------------------------------------------------------------ }
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/Hop.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java index eeaa5f1..b454771 100644 --- a/src/main/java/org/apache/sysml/hops/Hop.java +++ b/src/main/java/org/apache/sysml/hops/Hop.java @@ -708,31 +708,8 @@ public abstract class Hop implements ParseInfo _validCPSizeEstimate = (wstats!=null) ? OptimizerUtils.isValidCPMatrixSize( wstats[0], wstats[1], OptimizerUtils.getSparsity(wstats[0], wstats[1], wstats[2])) : false; } - /** - * Computes the hop-specific output memory estimate in bytes. Should be 0 if not - * applicable. - * - * @param dim1 dimension 1 - * @param dim2 dimension 2 - * @param nnz number of non-zeros - * @return memory estimate - */ - protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz ); - - /** - * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not - * applicable. - * - * @param dim1 dimension 1 - * @param dim2 dimension 2 - * @param nnz number of non-zeros - * @return memory estimate - */ - protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz ); - - /** * Computes the output matrix characteristics (rows, cols, nnz) based on worst-case output * and/or input estimates. Should return null if dimensions are unknown. * @@ -849,6 +826,21 @@ public abstract class Hop implements ParseInfo public abstract String getOpString(); + // ======================================================================================== + // Design doc: Memory estimation of GPU + // 1. Since not all operators are supported on GPU, isGPUEnabled indicates whether an operation + // is enabled for GPU. This method does not take into account any memory estimates. + // 2. To simplify memory estimation logic, the methods computeOutputMemEstimate and computeIntermediateMemEstimate + // should return the maximum of the memory required for the GPU and CP operators. + // 3. Additionally, these methods are guarded so that when the -gpu flag is not provided, additional memory overheads due to GPU + // are ignored. For example: sparse-to-dense conversion on GPU. + // 4. (WIP) Every GPU operator should respect the memory returned by computeIntermediateMemEstimate (and computeOutputMemEstimate - see the next point). + // 5. (WIP) Every GPU operator should create output in the same format as the corresponding CP operator. That is, computeOutputMemEstimate + // is consistent across both CP and GPU in terms of the worst case. + // 6. The drawbacks of using the maximum memory (mem = Math.max(mem_cpu, mem_gpu)) are: + // - GPU operator is not selected when mem_gpu < total memory available on GPU < mem + // - CP operator is not selected (i.e. distributed operator compiled) when mem_cpu < driver memory budget < mem + /** * In memory-based optimizer mode (see OptimizerUtils.isMemoryBasedOptLevel()), * the exectype is determined by checking this method as well as memory budget of this Hop. @@ -861,6 +853,31 @@ public abstract class Hop implements ParseInfo */ public abstract boolean isGPUEnabled(); + /** + * Computes the hop-specific output memory estimate in bytes. Should be 0 if not + * applicable. 
+ * + * @param dim1 dimension 1 + * @param dim2 dimension 2 + * @param nnz number of non-zeros + * @return memory estimate + */ + protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz ); + + /** + * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not + * applicable. + * + * @param dim1 dimension 1 + * @param dim2 dimension 2 + * @param nnz number of non-zeros + * @return memory estimate + */ + protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz ); + + // ======================================================================================== + + protected boolean isVector() { return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) ); } http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java index 8784956..121112b 100644 --- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java +++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java @@ -37,6 +37,7 @@ public class ConvolutionTransform extends Lop private OperationTypes operation = null; private int numThreads = -1; + private double intermediateMemBudget = 0; /** * Constructor when we have one input. @@ -47,12 +48,14 @@ public class ConvolutionTransform extends Lop * @param vt value type * @param et execution type * @param k number of threads + * @param intermediateMemBudget intermediate memory budget */ - public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) + public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k, double intermediateMemBudget) { super(Lop.Type.Transform, dt, vt); init(input, op, dt, vt, et); numThreads = k; + this.intermediateMemBudget = intermediateMemBudget; } public ConvolutionTransform(Lop input1, Lop input2, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) @@ -165,6 +168,9 @@ public class ConvolutionTransform extends Lop sb.append( OPERAND_DELIMITOR ); sb.append( numThreads ); } + + sb.append( OPERAND_DELIMITOR ); + sb.append( intermediateMemBudget ); return sb.toString(); } else { @@ -210,6 +216,9 @@ public class ConvolutionTransform extends Lop sb.append( OPERAND_DELIMITOR ); sb.append( numThreads ); } + + sb.append( OPERAND_DELIMITOR ); + sb.append( intermediateMemBudget ); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 629b688..e91029e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -22,6 +22,10 @@ package org.apache.sysml.runtime.instructions.cp; import java.util.ArrayList; import java.util.Arrays; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; +import 
org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.functionobjects.SwapIndex; @@ -41,24 +45,25 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { private ArrayList<CPOperand> _filter_shape; private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>(); private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>(); - private int _numThreads = -1; - - private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, - int numThreads) throws DMLRuntimeException { - super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr); - if (!(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply"))) { - throw new DMLRuntimeException( - "Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " - + opcode); + private int _numThreads = -1; private double _intermediateMemoryBudget = 0; + private static final Log LOG = LogFactory.getLog(ConvolutionCPInstruction.class.getName()); + private static boolean warnedUnderUtilitization = false; + + public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) throws DMLRuntimeException { + super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, + opcode, istr); + if( !(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) { + throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode); } _in2 = in2; _cptype = CPINSTRUCTION_TYPE.Convolution; _numThreads = numThreads; + _intermediateMemoryBudget = intermediateMemoryBudget; } private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, - ArrayList<CPOperand> filter_shape, int numThreads) { + ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) { super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr); _cptype = CPINSTRUCTION_TYPE.Convolution; _stride = stride; @@ -66,12 +71,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { _input_shape = input_shape; _filter_shape = filter_shape; _numThreads = numThreads; + _intermediateMemoryBudget = intermediateMemoryBudget; } - - private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, - ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, - ArrayList<CPOperand> filter_shape, int numThreads) { - super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr); + + public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, + String istr, ArrayList<CPOperand> stride, + ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, + ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) { + super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, + opcode, istr); _in2 = in2; _cptype = CPINSTRUCTION_TYPE.Convolution; _stride = stride; @@ -79,12 +87,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { _input_shape = input_shape; _filter_shape = filter_shape; 
_numThreads = numThreads; + _intermediateMemoryBudget = intermediateMemoryBudget; } - - private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode, - String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, - ArrayList<CPOperand> filter_shape, int numThreads) { - super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr); + + public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode, + String istr, ArrayList<CPOperand> stride, + ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, + ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) { + super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, + opcode, istr); _in2 = in2; _in3 = in3; _cptype = CPINSTRUCTION_TYPE.Convolution; @@ -93,6 +104,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { _input_shape = input_shape; _filter_shape = filter_shape; _numThreads = numThreads; + _intermediateMemoryBudget = intermediateMemoryBudget; } public static ConvolutionCPInstruction parseInstruction(String str) @@ -101,7 +113,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); String opcode = parts[0]; if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) { - InstructionUtils.checkNumFields(parts, 15); + InstructionUtils.checkNumFields(parts, 16); // stride1, stride2, padding1, padding2 // input_shape1, input_shape2, input_shape3, input_shape4, // filter_shape1, filter_shape2, filter_shape3, filter_shape4, k @@ -127,13 +139,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { int k = Integer.parseInt(parts[15]); return new ConvolutionCPInstruction(in, out, opcode, str, stride, - padding, input_shape, filter_shape, k); + padding, input_shape, filter_shape, k, Double.parseDouble(parts[16])); } else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward") || opcode.equalsIgnoreCase("conv2d") || opcode.equalsIgnoreCase("conv2d_backward_filter") || opcode.equalsIgnoreCase("conv2d_backward_data")) { - InstructionUtils.checkNumFields(parts, 16); + InstructionUtils.checkNumFields(parts, 17); // dout, stride1, stride2, padding1, padding2 // input_shape1, input_shape2, input_shape3, input_shape4, // filter_shape1, filter_shape2, filter_shape3, filter_shape4, k @@ -160,10 +172,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { int k = Integer.parseInt(parts[16]); return new ConvolutionCPInstruction(in, in2, out, opcode, str, stride, - padding, input_shape, filter_shape, k); + padding, input_shape, filter_shape, k, Double.parseDouble(parts[17])); } else if (opcode.equalsIgnoreCase("conv2d_bias_add")) { - InstructionUtils.checkNumFields(parts, 17); + InstructionUtils.checkNumFields(parts, 18); // dout, stride1, stride2, padding1, padding2 // input_shape1, input_shape2, input_shape3, input_shape4, // filter_shape1, filter_shape2, filter_shape3, filter_shape4, k @@ -191,15 +203,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { int k = Integer.parseInt(parts[17]); return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, stride, - padding, input_shape, filter_shape, k); + padding, input_shape, filter_shape, k, Double.parseDouble(parts[18])); } else if (opcode.equalsIgnoreCase("bias_add") 
|| opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) { - InstructionUtils.checkNumFields(parts, 4); + InstructionUtils.checkNumFields(parts, 5); CPOperand in = new CPOperand(parts[1]); CPOperand in2 = new CPOperand(parts[2]); CPOperand out = new CPOperand(parts[3]); int k = Integer.parseInt(parts[4]); - return new ConvolutionCPInstruction(in, in2, out, opcode, str, k); + return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5])); } else { throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str); @@ -363,6 +375,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode()); } else if (instOpcode.equalsIgnoreCase("conv2d")) { + resetNumThreads(params, C*R*S, P*Q, matBlock.getNonZeros() / (matBlock.getNumRows()*matBlock.getNumColumns())); MatrixBlock filter = ec.getMatrixInput(_in2.getName(), getExtendedOpcode()); if(filter.isEmpty() || matBlock.isEmpty()) { outputBlock = new MatrixBlock(N, K*P*Q, true); @@ -377,6 +390,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode()); } else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) { + resetNumThreads(params, C*R*S, P*Q, matBlock.getNonZeros() / (matBlock.getNumRows()*matBlock.getNumColumns())); MatrixBlock filter = ec.getMatrixInput(_in3.getName(), getExtendedOpcode()); MatrixBlock bias = ec.getMatrixInput(_in2.getName(), getExtendedOpcode()); if(bias.getNumRows() != params.K || bias.getNumColumns() != 1) { @@ -446,6 +460,27 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode()); } + /** + * Reset the number of thread to respect the intermediate CP memory budget + * + * @param params convolution parameters + * @param numRows number of rows of intermediate matrix used per thread + * @param numCols number of rows of intermediate matrix used per thread + * @param sparsity sparsity of intermediate matrix used per thread + */ + private void resetNumThreads(ConvolutionParameters params, int numRows, int numCols, double sparsity) { + if(DMLScript.USE_ACCELERATOR) { + double memBudget1Thread = OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity); + int limitedDegreeOfParallelism = (int) Math.floor(_intermediateMemoryBudget / memBudget1Thread); + if(params.numThreads > limitedDegreeOfParallelism) { + params.numThreads = limitedDegreeOfParallelism; + if(!warnedUnderUtilitization) + LOG.warn("CPU Under-utilization to respect the intermediate memory budget. 
To avoid this, please try reducing the mini-batch or forcing gpu execution."); + warnedUnderUtilitization = true; + } + } + } + private MatrixBlock getDenseOutputBlock(int numRows, int numCols) throws DMLRuntimeException { MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false); outputBlock.allocateDenseBlock(); http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index 5b37576..b25f787 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; import org.apache.sysml.utils.GPUStatistics; @@ -40,9 +41,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction { private ArrayList<CPOperand> _filter_shape; private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>(); private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>(); - - private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr) - throws DMLRuntimeException { + private double _intermediateMemoryBudget = 0; + + public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException { super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr); if (!(opcode.equals("bias_add") || opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) { throw new DMLRuntimeException( @@ -53,18 +54,23 @@ public class ConvolutionGPUInstruction extends GPUInstruction { _input2 = in2; _gputype = GPUINSTRUCTION_TYPE.Convolution; _output = out; + _intermediateMemoryBudget = intermediateMemoryBudget; } - - private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, - String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, - ArrayList<CPOperand> filter_shape) { - this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape); + + public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, + String istr, ArrayList<CPOperand> stride, + ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, + ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget) + { + this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape, intermediateMemoryBudget); _input3 = in3; } - - private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr, - ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, - ArrayList<CPOperand> filter_shape) { + + public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand out, String opcode, + String istr, ArrayList<CPOperand> stride, + ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, + ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget) + { super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr); _gputype = GPUINSTRUCTION_TYPE.Convolution; @@ -75,6 +81,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction { _padding = padding; _input_shape = input_shape; _filter_shape = filter_shape; + _intermediateMemoryBudget = intermediateMemoryBudget; } public static ConvolutionGPUInstruction parseInstruction(String str) @@ -87,7 +94,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction { || opcode.equalsIgnoreCase("conv2d_backward_filter") || opcode.equalsIgnoreCase("conv2d_backward_data") || opcode.equalsIgnoreCase("maxpooling_backward")) ) { - InstructionUtils.checkNumFields(parts, 15); + InstructionUtils.checkNumFields(parts, 16); CPOperand in1 = new CPOperand(parts[1]); CPOperand in2 = new CPOperand(parts[2]); CPOperand out = new CPOperand(parts[15]); @@ -110,10 +117,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction { filter_shape.add(new CPOperand(parts[14])); return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride, - padding, input_shape, filter_shape); + padding, input_shape, filter_shape, Double.parseDouble(parts[16])); } else if (opcode.equalsIgnoreCase("conv2d_bias_add")) { - InstructionUtils.checkNumFields(parts, 16); + InstructionUtils.checkNumFields(parts, 17); CPOperand in1 = new CPOperand(parts[1]); CPOperand in2 = new CPOperand(parts[2]); CPOperand in3 = new CPOperand(parts[3]); @@ -137,10 +144,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction { filter_shape.add(new CPOperand(parts[15])); return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride, - padding, input_shape, filter_shape); + padding, input_shape, filter_shape, Double.parseDouble(parts[17])); } else if (opcode.equalsIgnoreCase("maxpooling")) { - InstructionUtils.checkNumFields(parts, 14); + InstructionUtils.checkNumFields(parts, 15); CPOperand in1 = new CPOperand(parts[1]); CPOperand out = new CPOperand(parts[14]); @@ -162,14 +169,14 @@ public class ConvolutionGPUInstruction extends GPUInstruction { filter_shape.add(new CPOperand(parts[13])); return new ConvolutionGPUInstruction(in1, null, out, opcode, str, stride, - padding, input_shape, filter_shape); + padding, input_shape, filter_shape, Double.parseDouble(parts[15])); } else if( opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) { - InstructionUtils.checkNumFields(parts, 3); + InstructionUtils.checkNumFields(parts, 4); CPOperand in1 = new CPOperand(parts[1]); CPOperand in2 = new CPOperand(parts[2]); CPOperand out = new CPOperand(parts[3]); - return new ConvolutionGPUInstruction(in1, in2, out, opcode, str); + return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, Double.parseDouble(parts[4])); } else { throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str); @@ -251,8 +258,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction { MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q); - LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + LibMatrixCuDNN.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, 
out, N, C, H, W, + K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget); } else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) { MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); @@ -266,8 +273,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction { MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q); - LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + LibMatrixCuDNN.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W, + K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget); } else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) { MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); @@ -281,8 +288,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction { MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), K, C * R * S); - LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + LibMatrixCuDNN.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W, + K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget); // TODO: For now always copy the device data to host // ec.gpuCtx.copyDeviceToHost(outputBlock); } @@ -298,8 +305,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction { MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W); - LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W, + K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget); } else if (instOpcode.equalsIgnoreCase("maxpooling")) { MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); @@ -311,8 +318,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction { MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q); if(instOpcode.equalsIgnoreCase("maxpooling")) - LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + LibMatrixCuDNN.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W, + K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); @@ -326,8 +333,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction { MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W); - LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W, + K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget); } else { throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode); @@ -345,6 +352,7 @@ public class ConvolutionGPUInstruction 
extends GPUInstruction { ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } + private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java index af27dc6..5096566 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java @@ -24,6 +24,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.utils.GPUStatistics; @@ -44,7 +45,7 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction { switch(opcode) { case "sel+": - LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; + LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "exp": LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break; case "sqrt": http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index c6b82c4..197daaf 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -49,6 +49,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; +import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; @@ -151,6 +152,11 @@ public class GPUContext { LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this); + if(GPUContextPool.initialGPUMemBudget() > OptimizerUtils.getLocalMemBudget()) { + LOG.warn("Potential under-utilization: GPU memory (" + GPUContextPool.initialGPUMemBudget() + + ") > driver memory budget (" + OptimizerUtils.getLocalMemBudget() + "). 
" + + "Consider increasing the driver memory budget."); + } } private void initializeCudaLibraryHandles() throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java index 78b6e3b..6d06ee5 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java @@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data; import java.io.Serializable; +import org.apache.sysml.hops.Hop; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.util.ConvolutionUtils; @@ -34,7 +35,9 @@ public class ConvolutionParameters implements Serializable { public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w; public int P; public int Q; public int numThreads; + // Optional variables used by ConvolutionCPInstruction public boolean enableNative = false; + public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output; public MatrixBlock bias; @@ -62,6 +65,28 @@ public class ConvolutionParameters implements Serializable { "], pad=[" + pad_h + "," + pad_w + "])"; } + public void setIfUnknown(Hop N, Hop C, Hop H, Hop W, + Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException { + if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N)); + if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C)); + if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H)); + if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W)); + if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K)); + if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R)); + if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S)); + if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h)); + if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w)); + if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h)); + if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w)); + if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) { + this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h); + } + if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) { + this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w); + } + this.numThreads = numThreads; + } + public ConvolutionParameters(long N, long C, long H, long W, long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException { this.N = convertToInt(N);