Repository: systemml
Updated Branches:
  refs/heads/master 8606754ea -> beb1a1d19
[SYSTEMML-445] Support recomputation of activations to reduce the memory footprint

- Added a configuration property sysml.gpu.recompute.activations to enable recomputation of ReLU.
- This configuration is disabled by default, but can be enabled for large networks.

Closes #841.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/beb1a1d1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/beb1a1d1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/beb1a1d1

Branch: refs/heads/master
Commit: beb1a1d19a5a2710b55bd41d36a5d8085fb0afda
Parents: 8606754
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Sun Nov 4 14:19:38 2018 +0530
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Sun Nov 4 14:19:38 2018 +0530

----------------------------------------------------------------------
 conf/SystemML-config.xml.template                |  4 +
 .../java/org/apache/sysml/conf/DMLConfig.java    |  4 +-
 src/main/java/org/apache/sysml/hops/DnnOp.java   | 12 ++-
 .../instructions/GPUInstructionParser.java       |  2 +
 .../instructions/gpu/DnnGPUInstruction.java      | 60 +++++++++++---
 .../runtime/matrix/data/LibMatrixCuDNN.java      | 87 +++++++++++++++++++-
 6 files changed, 155 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 7b535c9..b9189b1 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -114,4 +114,8 @@
    <!-- Allocator to use to allocate GPU device memory. Supported values are cuda, unified_memory (default: cuda) -->
    <sysml.gpu.memory.allocator>cuda</sysml.gpu.memory.allocator>
+
+   <!-- Should perform recomputation of activations such as ReLU to reduce memory consumption. Set this to true
+        when performing inference or for training very large networks (default: false) -->
+   <sysml.gpu.recompute.activations>false</sysml.gpu.recompute.activations>
 </root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 7f0ecbc..8459fd4 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -96,6 +96,7 @@ public class DMLConfig
 	public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
 	public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
 	public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize";
+	public static final String GPU_RECOMPUTE_ACTIVATIONS = "sysml.gpu.recompute.activations";
 	
 	// supported prefixes for custom map/reduce configurations
 	public static final String PREFIX_MAPRED = "mapred";
@@ -147,6 +148,7 @@ public class DMLConfig
 		_defaultVals.put(SYNCHRONIZE_GPU,        "false" );
 		_defaultVals.put(CACHING_BUFFER_SIZE,    "0.15" );
 		_defaultVals.put(EAGER_CUDA_FREE,        "false" );
+		_defaultVals.put(GPU_RECOMPUTE_ACTIVATIONS, "false" );
 		_defaultVals.put(FLOATING_POINT_PRECISION, "double" );
 	}
 	
@@ -430,7 +432,7 @@ public class DMLConfig
 				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
 				EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
 				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE,
-				GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR
+				GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR, GPU_RECOMPUTE_ACTIVATIONS
 		};
 		
 		StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/hops/DnnOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/DnnOp.java b/src/main/java/org/apache/sysml/hops/DnnOp.java
index 7cf5061..cc94111 100644
--- a/src/main/java/org/apache/sysml/hops/DnnOp.java
+++ b/src/main/java/org/apache/sysml/hops/DnnOp.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.hops;
 
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.DnnTransform;
 import org.apache.sysml.lops.DnnTransform.OperationTypes;
@@ -47,6 +48,8 @@ public class DnnOp extends MultiThreadedHop
 	private static final boolean THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH = true;
 	// -------------------------------------------------------------------------
 	
+	private static final boolean GPU_RECOMPUTE_ACTIVATIONS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_RECOMPUTE_ACTIVATIONS);
+	
 	// Specifies the type of this hop
 	private Hop.OpOpDnn op;
 
@@ -273,11 +276,16 @@ public class DnnOp extends MultiThreadedHop
 			// by reducing unnecessary sparse-to-dense-to-sparse conversion.
 			// For other backends, this operators is not necessary as it reduces an additional relu operator.
 			Hop parentReLU = isInputReLU(inputs.get(0));
-			if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == OpOpDnn.MAX_POOL && parentReLU != null) {
+			
+			if(OptimizerUtils.ALLOW_OPERATOR_FUSION &&
+				(et == ExecType.CP || (et == ExecType.GPU && GPU_RECOMPUTE_ACTIVATIONS))
+				&& op == OpOpDnn.MAX_POOL && parentReLU != null) {
 				lhsInputLop = parentReLU.constructLops();
 				lopOp = OperationTypes.RELU_MAX_POOLING;
 			}
-			else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == OpOpDnn.MAX_POOL_BACKWARD && parentReLU != null) {
+			else if(OptimizerUtils.ALLOW_OPERATOR_FUSION &&
+				(et == ExecType.CP || (et == ExecType.GPU && GPU_RECOMPUTE_ACTIVATIONS))
+				&& op == OpOpDnn.MAX_POOL_BACKWARD && parentReLU != null) {
 				lhsInputLop = parentReLU.constructLops();
 				lopOp = OperationTypes.RELU_MAX_POOLING_BACKWARD;
 			}

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index c8a0e8d..20058de 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -52,6 +52,8 @@ public class GPUInstructionParser extends InstructionParser
 		String2GPUInstructionType.put( "conv2d_backward_data", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "maxpooling", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Dnn);
+		String2GPUInstructionType.put( "relu_maxpooling", GPUINSTRUCTION_TYPE.Dnn);
+		String2GPUInstructionType.put( "relu_maxpooling_backward", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "avgpooling", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "avgpooling_backward", GPUINSTRUCTION_TYPE.Dnn);
 		String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Dnn);

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
index 0424114..35c9591 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -238,7 +238,8 @@ public class DnnGPUInstruction extends GPUInstruction {
 			return new DnnGPUInstruction(in1, in2, out, opcode, str, stride, padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
 		}
-		else if( opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("avgpooling_backward") ) {
+		else if( opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
+				|| opcode.equalsIgnoreCase("avgpooling_backward") ) {
 			boolean withMaxPoolOut = false;
 			if(parts.length == 18) {
 				withMaxPoolOut = true;
@@ -298,7 +299,8 @@ public class DnnGPUInstruction extends GPUInstruction {
 			return new DnnGPUInstruction(in1, in2, in3, out, opcode, str, stride, padding, input_shape, filter_shape, Double.parseDouble(parts[17]));
 		}
-		else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("avgpooling")) {
+		else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")
+				|| opcode.equalsIgnoreCase("avgpooling")) {
 			InstructionUtils.checkNumFields(parts, 15);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand out = new CPOperand(parts[14]);
@@ -1005,8 +1007,19 @@ public class DnnGPUInstruction extends GPUInstruction {
 			LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("avgpooling")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")
+				|| instOpcode.equalsIgnoreCase("avgpooling")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+			Pointer x = null;
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+				Pointer tmpX = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
+				long CHW = ((long)C)*((long)H)*((long)W);
+				x = gCtx.allocate(instName, ((long)N)*CHW*LibMatrixCUDA.sizeOfDataType);
+				LibMatrixCuDNN.getCudaKernels(gCtx).launchKernel("relu",
+						ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+						tmpX, x, N, toInt(CHW));
+				ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+			}
 			if(image.getNumRows() != N || image.getNumColumns() != C*H*W)
 				throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling: " +
@@ -1014,11 +1027,30 @@ public class DnnGPUInstruction extends GPUInstruction {
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
 			PoolingType poolType = instOpcode.equalsIgnoreCase("maxpooling") ? PoolingType.MAX : PoolingType.AVG;
-			LibMatrixCuDNN.pooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+				LibMatrixCuDNN.pooling(ec.getGPUContext(0), getExtendedOpcode(), x, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+				gCtx.cudaFreeHelper(instName, x, gCtx.EAGER_CUDA_FREE);
+			}
+			else {
+				LibMatrixCuDNN.pooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+			}
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("avgpooling_backward")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")
+				|| instOpcode.equalsIgnoreCase("avgpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
+			Pointer x = null;
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
+				Pointer tmpX = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
+				long CHW = ((long)C)*((long)H)*((long)W);
+				x = gCtx.allocate(instName, ((long)N)*CHW*LibMatrixCUDA.sizeOfDataType);
+				LibMatrixCuDNN.getCudaKernels(gCtx).launchKernel("relu",
+						ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+						tmpX, x, N, toInt(CHW));
+				ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+			}
+			
 			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
 			MatrixObject maxPoolOutput = _input3 != null ? getMatrixInputForGPUInstruction(ec, _input3.getName()) : null;
 			if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q)
@@ -1029,18 +1061,26 @@ public class DnnGPUInstruction extends GPUInstruction {
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			PoolingType poolType = instOpcode.equalsIgnoreCase("maxpooling_backward") ? PoolingType.MAX : PoolingType.AVG;
-			LibMatrixCuDNN.poolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
-					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+			if(instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
+				LibMatrixCuDNN.poolingBackward(ec.getGPUContext(0), getExtendedOpcode(), x, dout, maxPoolOutput, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+				gCtx.cudaFreeHelper(instName, x, gCtx.EAGER_CUDA_FREE);
+			}
+			else {
+				LibMatrixCuDNN.poolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
+						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolType, _intermediateMemoryBudget);
+			}
 		}
 		else {
 			throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
 		}
 		
 		// release inputs/outputs
-		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		if(!instOpcode.equalsIgnoreCase("relu_maxpooling") && !instOpcode.equalsIgnoreCase("relu_maxpooling_backward"))
+			ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		
-		boolean isPool = instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("avgpooling");
-		boolean isPoolBackward = instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("avgpooling_backward");
+		boolean isPool = instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("avgpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling");
+		boolean isPoolBackward = instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("avgpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward");
 		if ( !isPool )
 			ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/beb1a1d1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 413c550..e496ddb 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -643,6 +643,44 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			throwCuDNNDimensionError(N, CHW, N, CPQ);
 		}
 	}
+	
+	/**
+	 * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param x image as pointer
+	 * @param outputBlock output matrix
+	 * @param N batch size
+	 * @param C number of channels
+	 * @param H height of image
+	 * @param W width of image
+	 * @param K number of filters
+	 * @param R height of filter
+	 * @param S width of filter
+	 * @param pad_h vertical padding
+	 * @param pad_w horizontal padding
+	 * @param stride_h horizontal stride
+	 * @param stride_w vertical stride
+	 * @param P (H - R + 1 + 2*pad_h)/stride_h
+	 * @param Q (W - S + 1 + 2*pad_w)/stride_w
+	 * @param poolingType type of pooling
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 */
+	public static void pooling(GPUContext gCtx, String instName, Pointer x,
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, PoolingType poolingType, double intermediateMemoryBudget) {
+		long CHW = C*H*W; long CPQ = C*P*Q;
+		long NCHW = N*CHW; long NCPQ = N*CPQ;
+		
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
+			Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			cudnnPoolingHelper(gCtx, instName, x, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, CPQ);
+		}
+	}
 	
 	private static void cudnnPoolingHelper(GPUContext gCtx, String instName, Pointer x, Pointer y,
 			int N, int C, int H, int W, int K, int R,
@@ -738,6 +776,53 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		}
 	}
 	
+	/**
+	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
+	 * This method computes the backpropogation errors for previous layer of maxpooling operation
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param x image as dense pointer
+	 * @param dout delta matrix, output of previous layer
+	 * @param maxpoolOutput (optional and can be null) output of maxpool forward function
+	 * @param outputBlock output matrix
+	 * @param N batch size
+	 * @param C number of channels
+	 * @param H height of image
+	 * @param W width of image
+	 * @param K number of filters
+	 * @param R height of filter
+	 * @param S width of filter
+	 * @param pad_h vertical padding
+	 * @param pad_w horizontal padding
+	 * @param stride_h horizontal stride
+	 * @param stride_w vertical stride
+	 * @param P (H - R + 1 + 2*pad_h)/stride_h
+	 * @param Q (W - S + 1 + 2*pad_w)/stride_w
+	 * @param poolingType type of pooling
+	 * @param intermediateMemoryBudget intermediate memory budget
+	 */
+	public static void poolingBackward(GPUContext gCtx, String instName, Pointer x, MatrixObject dout,
+			MatrixObject maxpoolOutput, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q, PoolingType poolingType, double intermediateMemoryBudget) {
+		long CHW = C*H*W; long CPQ = C*P*Q;
+		long NCHW = N*CHW; long NCPQ = N*CPQ;
+		
+		final boolean isMaxPoolOutputProvided = maxpoolOutput != null;
+		
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
+			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
+			Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
+			Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
+			Pointer y = isMaxPoolOutputProvided ? getDensePointerForCuDNN(gCtx, maxpoolOutput, instName) : null;
+			cudnnPoolingBackwardHelper(gCtx, instName, x, dy, y, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType);
+			
+		}
+		else {
+			throwCuDNNDimensionError(N, CHW, N, CPQ);
+		}
+	}
+	
 	private static void cudnnPoolingBackwardHelper(GPUContext gCtx, String instName,
 			Pointer x, Pointer dy, Pointer y, Pointer dx, int N, int C, int H, int W, int K, int R,
@@ -1457,7 +1542,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @param instName name of the instruction
 	 * @return jcuda pointer
 	 */
-	protected static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) {
+	public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) {
 		long numElems = image.getNumRows()*image.getNumColumns();
 		if(numElems > maxNumElementsOfCuDNNTensor) {
 			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
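
To try the new flag end to end, set sysml.gpu.recompute.activations to true in SystemML-config.xml (as in the template change above) and pass that file to SystemML via -config. The short sketch below only illustrates how the flag can be read back at runtime; it reuses the ConfigurationManager and DMLConfig calls that appear in this patch, assumes the SystemML jar is on the classpath, and the class name RecomputeActivationsCheck is made up for the example rather than part of the patch.

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;

// Hypothetical helper (not part of this patch): prints the effective value of
// sysml.gpu.recompute.activations as seen by the active DMLConfig. Without a
// -config file overriding it, the default registered in DMLConfig ("false") is used.
public class RecomputeActivationsCheck {
	public static void main(String[] args) {
		boolean recompute = ConfigurationManager.getDMLConfig()
			.getBooleanValue(DMLConfig.GPU_RECOMPUTE_ACTIVATIONS);
		System.out.println(DMLConfig.GPU_RECOMPUTE_ACTIVATIONS + " = " + recompute);
	}
}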