Repository: systemml
Updated Branches:
  refs/heads/master 8606754ea -> beb1a1d19
[SYSTEMML-445] Support recomputation of activations to reduce the memory footprint

- Added a configuration property sysml.gpu.recompute.activations to enable recomputation of ReLU.
- This configuration is disabled by default, but can be enabled for large networks.

Closes #841.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/beb1a1d1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/beb1a1d1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/beb1a1d1

Branch: refs/heads/master
Commit: beb1a1d19a5a2710b55bd41d36a5d8085fb0afda
Parents: 8606754
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Sun Nov 4 14:19:38 2018 +0530
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Sun Nov 4 14:19:38 2018 +0530

----------------------------------------------------------------------
 conf/SystemML-config.xml.template                |  4 +
 .../java/org/apache/sysml/conf/DMLConfig.java    |  4 +-
 src/main/java/org/apache/sysml/hops/DnnOp.java   | 12 ++-
 .../instructions/GPUInstructionParser.java       |  2 +
 .../instructions/gpu/DnnGPUInstruction.java      | 60 +++++++++++---
 .../runtime/matrix/data/LibMatrixCuDNN.java      | 87 +++++++++++++++++++-
 6 files changed, 155 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 7b535c9..b9189b1 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -114,4 +114,8 @@
    <!-- Allocator to use to allocate GPU device memory. Supported values are cuda, unified_memory (default: cuda) -->
    <sysml.gpu.memory.allocator>cuda</sysml.gpu.memory.allocator>
+
+   <!-- Should perform recomputation of activations such as ReLU to reduce memory consumption. Set this to true
+        when performing inference or for training very large networks (default: false) -->
+   <sysml.gpu.recompute.activations>false</sysml.gpu.recompute.activations>
 </root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 7f0ecbc..8459fd4 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -96,6 +96,7 @@ public class DMLConfig
 	public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
 	public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
 	public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize";
+	public static final String GPU_RECOMPUTE_ACTIVATIONS = "sysml.gpu.recompute.activations";
 	
 	// supported prefixes for custom map/reduce configurations
 	public static final String PREFIX_MAPRED = "mapred";
@@ -147,6 +148,7 @@ public class DMLConfig
 		_defaultVals.put(SYNCHRONIZE_GPU,        "false" );
 		_defaultVals.put(CACHING_BUFFER_SIZE,    "0.15" );
 		_defaultVals.put(EAGER_CUDA_FREE,        "false" );
+		_defaultVals.put(GPU_RECOMPUTE_ACTIVATIONS, "false" );
 		_defaultVals.put(FLOATING_POINT_PRECISION, "double" );
 	}
 	
@@ -430,7 +432,7 @@ public class DMLConfig
 				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
 				EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
 				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE,
-				GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR
+				GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR, GPU_RECOMPUTE_ACTIVATIONS
 		};
 		
 		StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/hops/DnnOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/DnnOp.java b/src/main/java/org/apache/sysml/hops/DnnOp.java
index 7cf5061..cc94111 100644
--- a/src/main/java/org/apache/sysml/hops/DnnOp.java
+++ b/src/main/java/org/apache/sysml/hops/DnnOp.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.hops;
 
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.DnnTransform;
 import org.apache.sysml.lops.DnnTransform.OperationTypes;
@@ -47,6 +48,8 @@ public class DnnOp extends MultiThreadedHop
 	private static final boolean THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH = true;
 	// -------------------------------------------------------------------------
 	
+	private static final boolean GPU_RECOMPUTE_ACTIVATIONS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_RECOMPUTE_ACTIVATIONS);
+	
 	// Specifies the type of this hop
 	private Hop.OpOpDnn op;
 
@@ -273,11 +276,16 @@ public class DnnOp extends MultiThreadedHop
 			// by reducing unnecessary sparse-to-dense-to-sparse conversion.
 			// For other backends, this operators is not necessary as it reduces an additional relu operator.
 			Hop parentReLU = isInputReLU(inputs.get(0));
-			if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == OpOpDnn.MAX_POOL && parentReLU != null) {
+			
+			if(OptimizerUtils.ALLOW_OPERATOR_FUSION &&
+				(et == ExecType.CP || (et == ExecType.GPU && GPU_RECOMPUTE_ACTIVATIONS))
+				&& op == OpOpDnn.MAX_POOL && parentReLU != null) {
 				lhsInputLop = parentReLU.constructLops();
 				lopOp = OperationTypes.RELU_MAX_POOLING;
 			}
-			else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == OpOpDnn.MAX_POOL_BACKWARD && parentReLU != null) {
+			else if(OptimizerUtils.ALLOW_OPERATOR_FUSION &&
+				(et == ExecType.CP || (et == ExecType.GPU && GPU_RECOMPUTE_ACTIVATIONS))
+				&& op == OpOpDnn.MAX_POOL_BACKWARD && parentReLU != null) {
 				lhsInputLop = parentReLU.constructLops();
 				lopOp = OperationTypes.RELU_MAX_POOLING_BACKWARD;
 			}

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index c8a0e8d..20058de 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -52,6 +52,8 @@ public class GPUInstructionParser extends InstructionParser
 		String2GPUInstructionType.put( "conv2d_backward_data", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "maxpooling", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Dnn);
+		String2GPUInstructionType.put( "relu_maxpooling", GPUINSTRUCTION_TYPE.Dnn);
+		String2GPUInstructionType.put( "relu_maxpooling_backward", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "avgpooling", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "avgpooling_backward", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Dnn);

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
index 0424114..35c9591 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -238,7 +238,8 @@ public class DnnGPUInstruction extends GPUInstruction {
 			return new DnnGPUInstruction(in1, in2, out, opcode, str, stride, padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
 		}
-		else if( opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("avgpooling_backward") ) {
+		else if( opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
+				|| opcode.equalsIgnoreCase("avgpooling_backward") ) {
 			boolean withMaxPoolOut = false;
 			if(parts.length == 18) {
 				withMaxPoolOut = true;
@@ -298,7 +299,8 @@ public class DnnGPUInstruction extends GPUInstruction {
 			return new DnnGPUInstruction(in1, in2, in3, out, opcode, str, stride, padding, input_shape, filter_shape, Double.parseDouble(parts[17]));
 		}
-		else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("avgpooling")) {
+		else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")
+				|| opcode.equalsIgnoreCase("avgpooling")) {
 			InstructionUtils.checkNumFields(parts, 15);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand out = new CPOperand(parts[14]);
@@ -1005,8 +1007,19 @@ public class DnnGPUInstruction extends GPUInstruction {
 			LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("avgpooling")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")
+				|| instOpcode.equalsIgnoreCase("avgpooling")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+			Pointer x = null;
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+				Pointer tmpX = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
+				long CHW = ((long)C)*((long)H)*((long)W);
+				x = gCtx.allocate(instName, ((long)N)*CHW*LibMatrixCUDA.sizeOfDataType);
+				LibMatrixCuDNN.getCudaKernels(gCtx).launchKernel("relu",
+						ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+						tmpX, x, N, toInt(CHW));
+				ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+			}
 			if(image.getNumRows() != N || image.getNumColumns() != C*H*W)
 				throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling: " +
@@ -1014,11 +1027,30 @@ public class DnnGPUInstruction extends GPUInstruction {
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
 			PoolingType poolType = instOpcode.equalsIgnoreCase("maxpooling") ? PoolingType.MAX : PoolingType.AVG;
-			LibMatrixCuDNN.pooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+				LibMatrixCuDNN.pooling(ec.getGPUContext(0), getExtendedOpcode(), x, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+				gCtx.cudaFreeHelper(instName, x, gCtx.EAGER_CUDA_FREE);
+			}
+			else {
+				LibMatrixCuDNN.pooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+			}
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("avgpooling_backward")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")
+				|| instOpcode.equalsIgnoreCase("avgpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+			Pointer x = null;
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
+				Pointer tmpX = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
+				long CHW = ((long)C)*((long)H)*((long)W);
+				x = gCtx.allocate(instName, ((long)N)*CHW*LibMatrixCUDA.sizeOfDataType);
+				LibMatrixCuDNN.getCudaKernels(gCtx).launchKernel("relu",
+						ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+						tmpX, x, N, toInt(CHW));
+				ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+			}
+			
 			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
 			MatrixObject maxPoolOutput = _input3 != null ? getMatrixInputForGPUInstruction(ec, _input3.getName()) : null;
 			if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q)
@@ -1029,18 +1061,26 @@ public class DnnGPUInstruction extends GPUInstruction {
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			PoolingType poolType = instOpcode.equalsIgnoreCase("maxpooling_backward") ? PoolingType.MAX : PoolingType.AVG;
-			LibMatrixCuDNN.poolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
+				LibMatrixCuDNN.poolingBackward(ec.getGPUContext(0), getExtendedOpcode(), x, dout, maxPoolOutput, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+				gCtx.cudaFreeHelper(instName, x, gCtx.EAGER_CUDA_FREE);
+			}
+			else {
+				LibMatrixCuDNN.poolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+			}
 		}
 		else {
 			throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
 		}
 		
 		// release inputs/outputs
-		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		if(!instOpcode.equalsIgnoreCase("relu_maxpooling") && !instOpcode.equalsIgnoreCase("relu_maxpooling_backward"))
+			ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		
-		boolean isPool = instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("avgpooling");
-		boolean isPoolBackward = instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("avgpooling_backward");
+		boolean isPool = instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("avgpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling");
+		boolean isPoolBackward = instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("avgpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward");
 		if ( !isPool )
 			ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 413c550..e496ddb 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -643,6 +643,44 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			throwCuDNNDimensionError(N, CHW, N, CPQ);
 		}
 	}
+	
+	/**
+	 * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param x image as pointer
+	 * @param outputBlock output matrix
+	 * @param N batch size
+	 * @param C number of channels
+	 * @param H height of image
+	 * @param W width of image
+	 * @param K number of filters
+	 * @param R height of filter
+	 * @param S width of filter
+	 * @param pad_h vertical padding
+	 * @param pad_w horizontal padding
+	 * @param stride_h horizontal stride
+	 * @param stride_w vertical stride
+	 * @param P (H - R + 1 + 2*pad_h)/stride_h
+	 * @param Q (W - S + 1 + 2*pad_w)/stride_w
+	 * @param poolingType type of pooling
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 */
+	public static void pooling(GPUContext gCtx, String instName, Pointer x,
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, PoolingType poolingType, double intermediateMemoryBudget) {
+		long CHW = C*H*W; long CPQ = C*P*Q;
+		long NCHW = N*CHW; long NCPQ = N*CPQ;
+		
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
+			Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			cudnnPoolingHelper(gCtx, instName, x, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, CPQ);
+		}
+	}
 	
 	private static void cudnnPoolingHelper(GPUContext gCtx, String instName, Pointer x, Pointer y,
 			int N, int C, int H, int W, int K, int R,
@@ -738,6 +776,53 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		}
 	}
 	
+	/**
+	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
+	 * This method computes the backpropogation errors for previous layer of maxpooling operation
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param x image as dense pointer
+	 * @param dout delta matrix, output of previous layer
+	 * @param maxpoolOutput (optional and can be null) output of maxpool forward function
+	 * @param outputBlock output matrix
+	 * @param N batch size
+	 * @param C number of channels
+	 * @param H height of image
+	 * @param W width of image
+	 * @param K number of filters
+	 * @param R height of filter
+	 * @param S width of filter
+	 * @param pad_h vertical padding
+	 * @param pad_w horizontal padding
+	 * @param stride_h horizontal stride
+	 * @param stride_w vertical stride
+	 * @param P (H - R + 1 + 2*pad_h)/stride_h
+	 * @param Q (W - S + 1 + 2*pad_w)/stride_w
+	 * @param poolingType type of pooling
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 */
+	public static void poolingBackward(GPUContext gCtx, String instName, Pointer x, MatrixObject dout,
+			MatrixObject maxpoolOutput, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, PoolingType poolingType, double intermediateMemoryBudget) {
+		long CHW = C*H*W; long CPQ = C*P*Q;
+		long NCHW = N*CHW; long NCPQ = N*CPQ;
+		
+		final boolean isMaxPoolOutputProvided = maxpoolOutput != null;
+		
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
+			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
+			Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
+			Pointer y = isMaxPoolOutputProvided ? getDensePointerForCuDNN(gCtx, maxpoolOutput, instName) : null;
+			cudnnPoolingBackwardHelper(gCtx, instName, x, dy, y, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
+			
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, CPQ);
+		}
+	}
+	
 	private static void cudnnPoolingBackwardHelper(GPUContext gCtx, String instName,
 			Pointer x, Pointer dy, Pointer y, Pointer dx, int N, int C, int H, int W, int K, int R,
@@ -1457,7 +1542,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @param instName name of the instruction
 	 * @return jcuda pointer
 	 */
-	protected static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) {
+	public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) {
 		long numElems = image.getNumRows()*image.getNumColumns();
 		if(numElems > maxNumElementsOfCuDNNTensor) {
 			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
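
To try the new flag end to end, set sysml.gpu.recompute.activations to true in SystemML-config.xml (as in the template change above) and pass that file to SystemML via -config. The short sketch below only illustrates how the flag can be read back at runtime; it reuses the ConfigurationManager and DMLConfig calls that appear in this patch, assumes the SystemML jar is on the classpath, and the class name RecomputeActivationsCheck is made up for the example rather than part of the patch.

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;

// Hypothetical helper (not part of this patch): prints the effective value of
// sysml.gpu.recompute.activations as seen by the active DMLConfig. Without a
// -config file overriding it, the default registered in DMLConfig ("false") is used.
public class RecomputeActivationsCheck {
	public static void main(String[] args) {
		boolean recompute = ConfigurationManager.getDMLConfig()
			.getBooleanValue(DMLConfig.GPU_RECOMPUTE_ACTIVATIONS);
		System.out.println(DMLConfig.GPU_RECOMPUTE_ACTIVATIONS + " = " + recompute);
	}
}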