Repository: systemml
Updated Branches:
  refs/heads/master e4c74eda6 -> 43b573dfb


[SYSTEMML-445] Select CuDNN operator dynamically based on memory budget

- Refactored CuDNN operator selection logic into separate file to simplify
  LibMatrixCuDNN.
- Fixed a minor memory leak in conv2d backward filter (deallocation of
  descriptor).
- Added optimistic intermediate memory budget for GPU ConvolutionOp.
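
As a rough, self-contained sketch of the selection idea (hypothetical names such as
ConvPlanSketch and selectForwardAlgo; not the committed API): the intermediate memory
budget minus the overhead of densifying sparse inputs becomes the cuDNN workspace
limit, and the same comparison decides whether all N images go through a single cuDNN
call or are processed row by row.

    // Illustrative sketch only; see LibMatrixCuDNNConvolutionAlgorithm in the diff
    // below for the actual selection logic.
    public class ConvPlanSketch {
        enum FwdAlgo { IMPLICIT_GEMM, BEST_UNDER_WORKSPACE_LIMIT }

        static final class ConvPlan {
            final FwdAlgo algo;
            final int batchSize; // N (all images at once) or 1 (row by row)
            ConvPlan(FwdAlgo algo, int batchSize) { this.algo = algo; this.batchSize = batchSize; }
        }

        // overhead = estimated bytes needed to densify sparse inputs before calling cuDNN
        static ConvPlan selectForwardAlgo(double intermediateMemoryBudget, double overhead, int N) {
            long workspaceLimit = (long) (intermediateMemoryBudget - overhead);
            int localN = overhead <= intermediateMemoryBudget ? N : 1;
            FwdAlgo algo = workspaceLimit <= 0
                    ? FwdAlgo.IMPLICIT_GEMM               // needs no extra workspace
                    : FwdAlgo.BEST_UNDER_WORKSPACE_LIMIT; // let cuDNN pick within the limit
            return new ConvPlan(algo, localN);
        }

        public static void main(String[] args) {
            ConvPlan p = selectForwardAlgo(512e6, 64e6, 128);
            System.out.println(p.algo + " with batch size " + p.batchSize);
        }
    }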


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/43b573df
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/43b573df
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/43b573df

Branch: refs/heads/master
Commit: 43b573dfbb6632d834ab8fa1695e93ce2c4cf15f
Parents: e4c74ed
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 28 21:01:03 2017 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 28 21:10:15 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    |  14 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |   2 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 434 +++++--------------
 .../LibMatrixCuDNNConvolutionAlgorithm.java     | 249 +++++++++++
 .../data/LibMatrixCuDNNInputRowFetcher.java     |  82 ++++
 5 files changed, 456 insertions(+), 325 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/43b573df/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 0ad9182..c5cf667 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -29,6 +29,7 @@ import org.apache.sysml.lops.LopsException;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 
@@ -191,7 +192,18 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 //             // TODO: Inserting reblock requires knowing columns apriori
 //             ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
 //             setReblockedOutputDimension(et, transform1);
-               ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, computeIntermediateMemEstimate(-1, -1, -1 ));
+               double cpIntermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
+               if(et == ExecType.GPU && _dim1 > 0 && _dim2 > 0) {
+                       // This enables us to compile more efficient matrix-matrix CuDNN operation instead of 
+                       // row-by-row invocation of multiple vector-matrix CuDNN operations.
+                       // This is possible as the operations on GPU are single-threaded
+                       double optimisticIntermediateMemEstimate = GPUContextPool.initialGPUMemBudget() - getOutputMemEstimate() - inputs.get(0).getOutputMemEstimate();
+                       if(in2 != null) {
+                               optimisticIntermediateMemEstimate -= inputs.get(1).getOutputMemEstimate();
+                       }
+                       cpIntermediateMemEstimate = Math.max(cpIntermediateMemEstimate, optimisticIntermediateMemEstimate);
+               }
+               ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, cpIntermediateMemEstimate);
                setOutputDimensions(transform1);
                
                setLineNumbers(transform1);
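
To make the optimistic estimate above concrete with purely hypothetical numbers: with a
4 GB initial GPU memory budget, a 200 MB output estimate and an 800 MB input estimate,
the optimistic intermediate budget is roughly 4096 - 200 - 800 = 3096 MB, and the
compiled lop gets max(cpIntermediateMemEstimate, 3096 MB). The whole remaining device
budget can be promised to one operator because, as the comment in the hunk notes, GPU
operations execute one at a time.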

http://git-wip-us.apache.org/repos/asf/systemml/blob/43b573df/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 4b2cd73..f4a00ab 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -2208,7 +2208,7 @@ public class LibMatrixCUDA {
        //******************* End of Re-org Functions ************************/
        //********************************************************************/
 
-       protected static int toInt(long num) throws DMLRuntimeException {
+       static int toInt(long num) throws DMLRuntimeException {
                if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
                        throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/43b573df/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 654bd9d..25dc604 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -30,12 +30,6 @@ import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
-import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
 import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
 import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
 import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
@@ -67,13 +61,11 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
-import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
 import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.utils.GPUStatistics;
@@ -154,28 +146,34 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-               if(DMLScript.FORCE_ACCELERATOR ||
-                               (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
+               if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the memory estimation for conv2d
                        double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 
                        Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
                        Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
-
-                       if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
-                               // Perform all-input all-channel conv2d
-                               Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
-                               cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-                       }
-                       else {
-                               InputRowFetcher imgFetcher = new InputRowFetcher(gCtx, instName, image);
-                               for(int n = 0; n < N; n++) {
-                                       // Perform one-input all-channel conv2d
-                                       cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*Sizeof.DOUBLE), 
-                                                       1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+                       
+                       // Required for LibMatrixCuDNNConvolutionAlgorithm
+                       long workspaceLimit = (long) (intermediateMemoryBudget-overhead);
+                       int localN = overhead <= intermediateMemoryBudget ? N : 1;
+                       
+                       try(LibMatrixCuDNNConvolutionAlgorithm algo = 
+                                       LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionForwardAlgorithm(gCtx, instName, 
+                                       localN, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
+                               if(localN == N) {
+                                       // Perform all-input all-channel conv2d
+                                       Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
+                                       cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, algo);
+                               }
+                               else {
+                                       try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
+                                               for(int n = 0; n < N; n++) {
+                                                       // Perform one-input all-channel conv2d
+                                                       cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*Sizeof.DOUBLE), algo);
+                                               }
+                                       }
                                }
-                               imgFetcher.close();
                        }
                }
                else {
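
The resource handling above, reduced to a minimal self-contained sketch with stand-in
classes (AlgoWrapper and RowFetcher are hypothetical; the real types are
LibMatrixCuDNNConvolutionAlgorithm and LibMatrixCuDNNInputRowFetcher): nested
try-with-resources closes the row fetcher first and the algorithm wrapper last, even
when the convolution call throws, which removes the leak-prone manual close()/finally
bookkeeping of the old code.

    public class AutoCloseSketch {
        static final class AlgoWrapper implements AutoCloseable {
            @Override public void close() { System.out.println("free descriptors + workspace"); }
        }
        static final class RowFetcher implements AutoCloseable {
            double[] getNthRow(int n) { return new double[] { n }; }
            @Override public void close() { System.out.println("free row buffer"); }
        }

        public static void main(String[] args) {
            int N = 3;
            try (AlgoWrapper algo = new AlgoWrapper()) {
                try (RowFetcher fetcher = new RowFetcher()) {
                    for (int n = 0; n < N; n++) {
                        System.out.println("conv2d on row " + (int) fetcher.getNthRow(n)[0]);
                    }
                } // fetcher.close() runs here
            }     // algo.close() runs here, also on exceptions
        }
    }
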
@@ -228,93 +226,31 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @param image    the input matrix (or image) allocated on the GPU
         * @param filter   the filter allocated on the GPU
         * @param output   the output matrix allocated on the GPU
-        * @param N        number of input images
-        * @param C        number of channels
-        * @param H        height of each image
-        * @param W        width of each image
-        * @param K        number of output "channels"
-        * @param R        height of filter
-        * @param S        width of filter
-        * @param pad_h    padding height
-        * @param pad_w    padding width
-        * @param stride_h stride height
-        * @param stride_w string width
-        * @param P        output height
-        * @param Q        output width
+        * @param algo     cudnn algorithm wrapper
         * @throws DMLRuntimeException if error
         */
-       private static void cudnnConv2d(GPUContext gCtx, String instName, 
Pointer image, Pointer filter, Pointer output, int N,
-                       int C, int H, int W, int K, int R, int S, int pad_h, 
int pad_w, int stride_h, int stride_w, int P, int Q)
+       private static void cudnnConv2d(GPUContext gCtx, String instName, 
Pointer image, Pointer filter, Pointer output, 
+                       LibMatrixCuDNNConvolutionAlgorithm algo)
                                        throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
                }
-               cudnnFilterDescriptor filterDesc = null;
-               cudnnConvolutionDescriptor convDesc = null;
-               Pointer workSpace = null;
-               long sizeInBytes = 0;
                try {
-                       long t1 = 0, t2 = 0;
-                       // Allocate descriptors
+                       long t1 = 0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       cudnnTensorDescriptor srcTensorDesc = 
allocateTensorDescriptor(N, C, H, W);
-                       cudnnTensorDescriptor dstTensorDesc = 
allocateTensorDescriptor(N, K, P, Q);
-                       filterDesc = allocateFilterDescriptor(K, C, R, S);
-
-                       int padding[] = {pad_h, pad_w};
-                       int strides[] = {stride_h, stride_w};
-                       convDesc = allocateConvolutionDescriptor(padding, 
strides);
-
-                       // Select the best algorithm depending on the data and 
supported CUDA
-
-                       int algo = -1;
-                       workSpace = new Pointer();
-
-                       if (CONVOLUTION_PREFERENCE == 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
-                               algo = 
jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-                       } else if (CONVOLUTION_PREFERENCE == 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
-                               int[] algos = {-1};
-                               // TODO: Look into FFt, Winograd, etc
-                               // Also ensure that GPU has enough memory to 
allocate memory
-                               long sizeInBytesArray[] = {0};
-                               
jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), 
srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
-                                               CONVOLUTION_PREFERENCE, 
sizeInBytesArray[0], algos);
-                               
cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, 
filterDesc, convDesc, dstTensorDesc, algos[0], sizeInBytesArray);
-                               if (sizeInBytesArray[0] != 0)
-                                       workSpace = 
gCtx.allocate(sizeInBytesArray[0]);
-                               sizeInBytes = sizeInBytesArray[0];
-                       } else if (CONVOLUTION_PREFERENCE == 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
-                               throw new 
DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not 
implemented");
-                       } else {
-                               throw new DMLRuntimeException("Unsupported 
preference criteria for convolution");
-                       }
-                       if (GPUStatistics.DISPLAY_STATISTICS)
-                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
                        int status = 
cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
-                                       srcTensorDesc, image,
-                                       filterDesc, filter,
-                                       convDesc, algo, workSpace, sizeInBytes, 
zero(),
-                                       dstTensorDesc, output);
+                                       algo.nchwTensorDesc, image,
+                                       algo.filterDesc, filter,
+                                       algo.convDesc, algo.algo, 
algo.workSpace, algo.sizeInBytes, zero(),
+                                       algo.nkpqTensorDesc, output);
                        if (GPUStatistics.DISPLAY_STATISTICS)
-                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t1);
                        if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status));
                        }
                } catch (CudaException e) {
                        throw new DMLRuntimeException("Error in conv2d in 
GPUContext " + gCtx.toString() + " from Thread " + 
Thread.currentThread().toString(), e);
-               } finally {
-                       long t3 = 0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
-                       if (filterDesc != null)
-                               cudnnDestroyFilterDescriptor(filterDesc);
-                       if (convDesc != null)
-                               cudnnDestroyConvolutionDescriptor(convDesc);
-                       if (workSpace != null && sizeInBytes != 0)
-                               gCtx.cudaFreeHelper(instName, workSpace);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
-                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-               }
+               } 
        }
 
        /**
@@ -347,41 +283,46 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        int Q, double intermediateMemoryBudget) throws 
DMLRuntimeException {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
-
-               if(DMLScript.FORCE_ACCELERATOR || 
-                               (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < 
maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
+               
+               
+               if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < 
maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
                        Pointer dwPointer = getDensePointerForCuDNN(gCtx, 
outputBlock, instName);
                        double overhead = isInSparseFormat(gCtx, image) ? 
OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, dout) ? 
OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
-                       if(DMLScript.FORCE_ACCELERATOR || overhead <= 
intermediateMemoryBudget) {
-                               // Perform all-input all-channel 
conv2dBackwardFilter
-                               Pointer imagePointer = 
getDensePointerForCuDNN(gCtx, image, instName);
-                               Pointer doutPointer = 
getDensePointerForCuDNN(gCtx, dout, instName);
-                               cudnnConv2dBackwardFilter(gCtx, instName, 
imagePointer, doutPointer, dwPointer, 
-                                               N, C, H, W, K, R, S, pad_h, 
pad_w, stride_h, stride_w, P, Q);
-                       }
-                       else {
-                               // Perform one-input conv2dBackwardFilter
-                               Pointer tempdwPointer = 
gCtx.allocate(KCRS*Sizeof.DOUBLE);
-                               InputRowFetcher imgFetcher = new 
InputRowFetcher(gCtx, instName, image);
-                               InputRowFetcher doutFetcher = new 
InputRowFetcher(gCtx, instName, dout);
-                               for(int n = 0; n < N; n++) {
-                                       long t0 = 
GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-                                       cudaMemset(tempdwPointer, 0, 
KCRS*Sizeof.DOUBLE);
-                                       if(GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
-                                       // Perform one-input 
conv2dBackwardFilter
-                                       cudnnConv2dBackwardFilter(gCtx, 
instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, 
-                                                       1, C, H, W, K, R, S, 
pad_h, pad_w, stride_h, stride_w, P, Q);
-                                       
getCudaKernels(gCtx).launchKernel("inplace_add",
-                                                       
ExecutionConfig.getConfigForSimpleMatrixOperations(K, toInt(CRS)),
-                                                       tempdwPointer, 
dwPointer, K, toInt(CRS));
 
+                       // Required for LibMatrixCuDNNConvolutionAlgorithm
+                       long workspaceLimit = (long) 
(intermediateMemoryBudget-overhead);
+                       int localN = overhead <= intermediateMemoryBudget ? N : 
1;
+                       
+                       try(LibMatrixCuDNNConvolutionAlgorithm algo = 
+                                       
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionBackwardFilterAlgorithm(gCtx,
 instName, 
+                                       localN, C, H, W, K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q, workspaceLimit)) {
+                               if(localN == N) {
+                                       // Perform all-input all-channel 
conv2dBackwardFilter
+                                       Pointer imagePointer = 
getDensePointerForCuDNN(gCtx, image, instName);
+                                       Pointer doutPointer = 
getDensePointerForCuDNN(gCtx, dout, instName);
+                                       cudnnConv2dBackwardFilter(gCtx, 
instName, imagePointer, doutPointer, dwPointer, algo);
+                               }
+                               else {
+                                       try(LibMatrixCuDNNInputRowFetcher 
imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
+                                               LibMatrixCuDNNInputRowFetcher 
doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
+                                               // Perform one-input 
conv2dBackwardFilter
+                                               Pointer tempdwPointer = 
gCtx.allocate(KCRS*Sizeof.DOUBLE);
+                                               for(int n = 0; n < N; n++) {
+                                                       long t0 = 
GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+                                                       
cudaMemset(tempdwPointer, 0, KCRS*Sizeof.DOUBLE);
+                                                       
if(GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
+                                                       // Perform one-input 
conv2dBackwardFilter
+                                                       
cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), 
doutFetcher.getNthRow(n), tempdwPointer, algo);
+                                                       
getCudaKernels(gCtx).launchKernel("inplace_add",
+                                                                       
ExecutionConfig.getConfigForSimpleMatrixOperations(K, toInt(CRS)),
+                                                                       
tempdwPointer, dwPointer, K, toInt(CRS));
+
+                                               }
+                                               // Deallocate temporary array 
to hold one element of input
+                                               
gCtx.cudaFreeHelper(tempdwPointer, true);
+                                       }
                                }
-
-                               // Deallocate temporary array to hold one 
element of input
-                               gCtx.cudaFreeHelper(tempdwPointer, true);
-                               imgFetcher.close();
-                               doutFetcher.close();
                        }
                }
                else {
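
The per-row fallback above accumulates the filter gradient across images; a minimal
self-contained sketch of that accumulation on plain arrays (hypothetical names, dummy
gradients) follows: each image's gradient is written into a zeroed temporary buffer and
then added in place into the running result, mirroring the cudaMemset + inplace_add
kernel pair in the hunk.

    public class BackwardFilterAccumulationSketch {
        // Stand-in for the one-image cudnnConvolutionBackwardFilter call: fills tempGrad.
        static void backwardFilterForImage(int n, double[] tempGrad) {
            java.util.Arrays.fill(tempGrad, n + 1.0); // dummy per-image gradient
        }

        public static void main(String[] args) {
            int N = 4, KCRS = 8;                // number of images, filter-gradient length (K*C*R*S)
            double[] dw = new double[KCRS];     // final gradient, accumulated over all images
            double[] tempDw = new double[KCRS]; // reused per-image buffer
            for (int n = 0; n < N; n++) {
                java.util.Arrays.fill(tempDw, 0.0);  // cudaMemset(tempdwPointer, 0, ...)
                backwardFilterForImage(n, tempDw);   // one-input conv2dBackwardFilter
                for (int i = 0; i < KCRS; i++)       // inplace_add kernel
                    dw[i] += tempDw[i];
            }
            System.out.println(java.util.Arrays.toString(dw)); // every entry is 10.0
        }
    }
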
@@ -397,81 +338,26 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @param imagePointer pointer to input image
         * @param doutPointer pointer to errors from next layer
         * @param dwPointer  output errors
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @param K number of filters
-        * @param R filter height
-        * @param S filter width
-        * @param pad_h pad height
-        * @param pad_w pad width
-        * @param stride_h stride height
-        * @param stride_w stride width
-        * @param P output activation height
-        * @param Q output activation width
+        * @param algo     cudnn algorithm wrapper
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void cudnnConv2dBackwardFilter(GPUContext gCtx, String 
instName, Pointer imagePointer, Pointer doutPointer,
-                       Pointer dwPointer, int N, int C, int H, int W, int K, 
int R,
-                       int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
-                       int Q) throws DMLRuntimeException {
+                       Pointer dwPointer, LibMatrixCuDNNConvolutionAlgorithm 
algo) throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : conv2dBackwardFilter" + ", 
GPUContext=" + gCtx);
                }
-               cudnnFilterDescriptor dwDesc = null;
-               cudnnConvolutionDescriptor convDesc = null;
-
-               Pointer workSpace = null;
-               long sizeInBytes = 0;
                try {
-
-                       long t1 = 0, t2 = 0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       // Allocate descriptors
-                       cudnnTensorDescriptor xTensorDesc = 
allocateTensorDescriptor(N, C, H, W);
-                       cudnnTensorDescriptor doutTensorDesc = 
allocateTensorDescriptor(N, K, P, Q);
-                       dwDesc = allocateFilterDescriptor(K, C, R, S);
-
-                       // Allocate data
-                       int padding[] = {pad_h, pad_w};
-                       int strides[] = {stride_h, stride_w};
-                       convDesc = allocateConvolutionDescriptor(padding, 
strides);
-                       long sizeInBytesArray[] = {0};
-
-                       // TODO: Select the best algorithm depending on the 
data and supported CUDA
-                       int algo = 
jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-
-                       workSpace = new Pointer();
-                       
cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
-                                       xTensorDesc, doutTensorDesc, convDesc, 
dwDesc, algo, sizeInBytesArray);
+                       long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       int status = 
cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), 
algo.nchwTensorDesc, imagePointer,
+                                       algo.nkpqTensorDesc, doutPointer, 
algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), 
algo.filterDesc, dwPointer);
                        if (GPUStatistics.DISPLAY_STATISTICS)
-                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                       int status = 
cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, 
imagePointer,
-                                       doutTensorDesc, doutPointer, convDesc, 
algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
-                       if (GPUStatistics.DISPLAY_STATISTICS)
-                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - 
t2);
-
+                               GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - 
t1);
                        if (status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardFilter: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
                } catch (CudaException e) {
                        throw new DMLRuntimeException("Error in conv2d in 
GPUContext " + gCtx.toString() + " from Thread " + 
Thread.currentThread().toString(), e);
-               } finally {
-                       long t3=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
-
-                       if(workSpace != null && sizeInBytes != 0)
-                               gCtx.cudaFreeHelper(instName, workSpace);
-                       if(dwDesc != null)
-                               cudnnDestroyFilterDescriptor(dwDesc);
-
-                       if(convDesc != null)
-                               cudnnDestroyConvolutionDescriptor(convDesc);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-               }
+               } 
        }
 
        /**
@@ -505,26 +391,32 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
                long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-               if(DMLScript.FORCE_ACCELERATOR ||
-                               (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < 
maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
+               if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < 
maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the 
memory estimation for conv2dBackwardData
                        double overhead = isInSparseFormat(gCtx, filter) ? 
OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, dout) ? 
OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
                        Pointer filterPointer = getDensePointerForCuDNN(gCtx, 
filter, instName);
                        Pointer dstPointer = getDensePointerForCuDNN(gCtx, 
output, instName);
-                       if(DMLScript.FORCE_ACCELERATOR || overhead <= 
intermediateMemoryBudget) {
-                               // Perform all-input all-channel 
conv2dBackwardData
-                               Pointer doutPointer = 
getDensePointerForCuDNN(gCtx, dout, instName);
-                               cudnnConv2dBackwardData(gCtx, instName, 
filterPointer, doutPointer, dstPointer, 
-                                               N, C, H, W, K, R, S, pad_h, 
pad_w, stride_h, stride_w, P, Q);
-                       }
-                       else {
-                               InputRowFetcher doutFetcher = new 
InputRowFetcher(gCtx, instName, dout);
-                               for(int n = 0; n < N; n++) {
-                                       cudnnConv2d(gCtx, instName, 
doutFetcher.getNthRow(n), filterPointer, 
dstPointer.withByteOffset(n*CHW*Sizeof.DOUBLE), 
-                                                       1, C, H, W, K, R, S, 
pad_h, pad_w, stride_h, stride_w, P, Q);
+                       
+                       // Required for LibMatrixCuDNNConvolutionAlgorithm
+                       long workspaceLimit = (long) 
(intermediateMemoryBudget-overhead);
+                       int localN = overhead <= intermediateMemoryBudget ? N : 
1;
+                       
+                       try(LibMatrixCuDNNConvolutionAlgorithm algo = 
+                                       
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionBackwardDataAlgorithm(gCtx,
 instName, 
+                                       localN, C, H, W, K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q, workspaceLimit)) {
+                               if(localN == N) {
+                                       // Perform all-input all-channel 
conv2dBackwardData
+                                       Pointer doutPointer = 
getDensePointerForCuDNN(gCtx, dout, instName);
+                                       cudnnConv2dBackwardData(gCtx, instName, 
filterPointer, doutPointer, dstPointer, algo);
+                               }
+                               else {
+                                       try(LibMatrixCuDNNInputRowFetcher 
doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
+                                               for(int n = 0; n < N; n++) {
+                                                       
cudnnConv2dBackwardData(gCtx, instName, doutFetcher.getNthRow(n), 
filterPointer, dstPointer.withByteOffset(n*CHW*Sizeof.DOUBLE), algo);
+                                               }
+                                       }
                                }
-                               doutFetcher.close();
                        }
                }
                else {
@@ -540,77 +432,26 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @param w pointer to filter used in conv2d
         * @param dy pointer to errors from next layer
         * @param dx pointer to  output errors
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @param K number of filters
-        * @param R filter height
-        * @param S filter width
-        * @param pad_h pad height
-        * @param pad_w pad width
-        * @param stride_h stride height
-        * @param stride_w stride width
-        * @param P output activation height
-        * @param Q output activation width
+        * @param algo cudnn algorithm wrapper
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void cudnnConv2dBackwardData(GPUContext gCtx, String 
instName, Pointer w, Pointer dy,
-                       Pointer dx, int N, int C, int H, int W, int K, int R,
-                       int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
-                       int Q) throws DMLRuntimeException {
+                       Pointer dx, LibMatrixCuDNNConvolutionAlgorithm algo) 
throws DMLRuntimeException {
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" 
+ gCtx);
                }
-               cudnnFilterDescriptor wDesc = null;
-               cudnnConvolutionDescriptor convDesc = null;
-
-               Pointer workSpace = null;
-               long sizeInBytes = 0;
                try {
-                       long t1=0, t2=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       // Allocate descriptors
-                       wDesc = allocateFilterDescriptor(K, C, R, S);
-                       cudnnTensorDescriptor dyDesc = 
allocateTensorDescriptor(N, K, P, Q);
-                       cudnnTensorDescriptor dxDesc = 
allocateTensorDescriptor(N, C, H, W);
-
-                       int padding [] = { pad_h, pad_w };
-                       int strides [] = { stride_h, stride_w };
-                       convDesc = allocateConvolutionDescriptor(padding, 
strides);
-                       long sizeInBytesArray[] = { 0 };
-
-                       // TODO: Select the best algorithm depending on the 
data and supported CUDA
-                       int algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-                       workSpace = new Pointer();
-                       
cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
-                                       wDesc, dyDesc, convDesc, dxDesc, algo, 
sizeInBytesArray);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-
-                       if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                       int status = 
cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
-                                       dyDesc, dy, convDesc, algo, workSpace, 
sizeInBytes, zero(), dxDesc, dx);
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - 
t2);
+                       long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       int status = 
cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), algo.filterDesc, w,
+                                       algo.nkpqTensorDesc, dy, algo.convDesc, 
algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nchwTensorDesc, dx);
+                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - 
t1);
 
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardData: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
                } catch (CudaException e) {
                        throw new DMLRuntimeException("Error in conv2d in 
GPUContext " + gCtx.toString() + " from Thread " + 
Thread.currentThread().toString(), e);
-               }
-               finally {
-                       long t3=0;
-                       if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
-
-                       if(workSpace != null && sizeInBytes != 0)
-                               gCtx.cudaFreeHelper(instName, workSpace);
-                       if(wDesc != null)
-                               cudnnDestroyFilterDescriptor(wDesc);
-                       if(convDesc != null)
-                               cudnnDestroyConvolutionDescriptor(convDesc);
-
-                       if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
-               }
+               }       
        }
 
        /**
@@ -642,18 +483,17 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long CPQ = C*P*Q;  
                long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-               if(DMLScript.FORCE_ACCELERATOR || 
-                               (NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < 
maxNumDoublesOfCuDNNTensor)) {
+               if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < 
maxNumDoublesOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the 
memory estimation for conv2dBackwardData
                        long overhead = isInSparseFormat(gCtx, image) ? 
OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
                        Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, 
instName);
-                       if(DMLScript.FORCE_ACCELERATOR || overhead <= 
intermediateMemoryBudget) {
+                       if(overhead <= intermediateMemoryBudget) {
                                Pointer x = getDensePointerForCuDNN(gCtx, 
image, instName);
                                cudnnTensorDescriptor xDesc = 
allocateTensorDescriptor(gCtx, image, N, C, H, W);
                                cudnnMaxpooling(gCtx, instName, x, xDesc, y, N, 
C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                        }
                        else {
-                               InputRowFetcher imgFetcher = new 
InputRowFetcher(gCtx, instName, image);
+                               LibMatrixCuDNNInputRowFetcher imgFetcher = new 
LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
                                cudnnTensorDescriptor xDesc = 
allocateTensorDescriptor(gCtx, image, N, C, H, W);
                                for(int n = 0; n < N; n++) {
                                        cudnnMaxpooling(gCtx, instName, 
imgFetcher.getNthRow(n), xDesc, y.withByteOffset(n*CPQ*Sizeof.DOUBLE), 1, C, H, 
W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
@@ -666,57 +506,6 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                }
        }
 
-       /**
-        * Performs a slice operation: out = in[(n+1):(n+1), 1:numColumns]
-        */
-       private static class InputRowFetcher {
-               GPUContext gCtx; String instName; int numColumns; boolean 
isInputInSparseFormat; 
-               Object inPointer; // can be either CSRPointer or Pointer 
-               Pointer outPointer;
-
-               /**
-                * Initialize the input fetcher
-                * 
-                * @param gCtx current gpu context
-                * @param instName name of the instruction
-                * @param image input matrix object.
-                * @throws DMLRuntimeException if error
-                */
-               public InputRowFetcher(GPUContext gCtx, String instName, 
MatrixObject image) throws DMLRuntimeException {
-                       this.gCtx = gCtx; this.instName = instName;
-                       numColumns = toInt(image.getNumColumns());
-                       isInputInSparseFormat = isInSparseFormat(gCtx, image);
-                       inPointer = isInputInSparseFormat ? 
getSparsePointer(gCtx, image, instName) : getDensePointerForCuDNN(gCtx, image, 
instName);
-                       outPointer = gCtx.allocate(numColumns*Sizeof.DOUBLE);
-               }
-               /**
-                * Copy the nth row and return the dense pointer
-                * @param n zero-based row index
-                * @return dense pointer containing the nth row. This row is 
reused in the next iteration
-                * @throws DMLRuntimeException
-                */
-               public Pointer getNthRow(int n) throws DMLRuntimeException {
-                       if(isInputInSparseFormat) {
-                               jcuda.runtime.JCuda.cudaDeviceSynchronize();
-                               long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
-                               cudaMemset(outPointer, 0, 
numColumns*Sizeof.DOUBLE);
-                               jcuda.runtime.JCuda.cudaDeviceSynchronize();
-                               if(GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
-                               sliceSparseDense(gCtx, instName, 
(CSRPointer)inPointer, outPointer, n, n, 0, toInt(numColumns-1), numColumns);
-                       }
-                       else {
-                               sliceDenseDense(gCtx, instName, 
(Pointer)inPointer, outPointer, n, n, 0, toInt(numColumns-1), numColumns);
-                       }
-                       return outPointer;
-               }
-               /**
-                * Deallocates temporary pointer
-                */
-               public void close() {
-                       gCtx.cudaFreeHelper(outPointer, true);
-               }
-       }
-
        private static void cudnnMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
                        Pointer y, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
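
The removed inner class lives on as the standalone LibMatrixCuDNNInputRowFetcher added
in this commit; a minimal self-contained sketch of its contract on plain arrays
(RowFetcherSketch is hypothetical, not the GPU code) follows: one reusable buffer holds
the current row, getNthRow overwrites it on every call, and close() is where the GPU
version frees that buffer.

    public class RowFetcherSketch implements AutoCloseable {
        private final double[] data;   // dense input, row-major (N x numColumns)
        private final int numColumns;
        private final double[] rowBuf; // reused across getNthRow calls

        RowFetcherSketch(double[] data, int numColumns) {
            this.data = data;
            this.numColumns = numColumns;
            this.rowBuf = new double[numColumns];
        }

        // Copy row n into the shared buffer; previous contents are overwritten.
        double[] getNthRow(int n) {
            System.arraycopy(data, n * numColumns, rowBuf, 0, numColumns);
            return rowBuf;
        }

        @Override public void close() { /* GPU version: gCtx.cudaFreeHelper(outPointer, true) */ }

        public static void main(String[] args) {
            double[] in = { 1, 2, 3, 4, 5, 6 }; // 3 rows x 2 columns
            try (RowFetcherSketch f = new RowFetcherSketch(in, 2)) {
                System.out.println(java.util.Arrays.toString(f.getNthRow(1))); // [3.0, 4.0]
            }
        }
    }
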
@@ -749,7 +538,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        long t3=0;
                        if (GPUStatistics.DISPLAY_STATISTICS) t3 = 
System.nanoTime();
                        if(poolingDesc != null)
-                               cudnnDestroyPoolingDescriptor(poolingDesc);
+                               
jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
@@ -785,20 +574,19 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long CHW = C*H*W; long CPQ = C*P*Q;  
                long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-               if(DMLScript.FORCE_ACCELERATOR || 
-                               (NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < 
maxNumDoublesOfCuDNNTensor)) {
+               if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < 
maxNumDoublesOfCuDNNTensor) {
                        // Filter and output are accounted as dense in the 
memory estimation for conv2dBackwardData
                        long overhead = isInSparseFormat(gCtx, image) ? 
OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
                        overhead += isInSparseFormat(gCtx, dout) ? 
OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
                        Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, 
instName);
-                       if(DMLScript.FORCE_ACCELERATOR || overhead <= 
intermediateMemoryBudget) {
+                       if(overhead <= intermediateMemoryBudget) {
                                Pointer x = getDensePointerForCuDNN(gCtx, 
image, instName);
                                Pointer dy = getDensePointerForCuDNN(gCtx, 
dout, instName);
                                cudnnMaxpoolingBackward(gCtx, instName, x, dy, 
dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                        }
                        else {
-                               InputRowFetcher imgFetcher = new 
InputRowFetcher(gCtx, instName, image);
-                               InputRowFetcher doutFetcher = new 
InputRowFetcher(gCtx, instName, dout);
+                               LibMatrixCuDNNInputRowFetcher imgFetcher = new 
LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
+                               LibMatrixCuDNNInputRowFetcher doutFetcher = new 
LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
                                for(int n = 0; n < N; n++) {
                                        cudnnMaxpoolingBackward(gCtx, instName, 
imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
                                                        
dx.withByteOffset(n*CHW*Sizeof.DOUBLE), 
@@ -868,13 +656,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        if(y != null)
                                gCtx.cudaFreeHelper(instName, y);
                        if(poolingDesc != null)
-                               cudnnDestroyPoolingDescriptor(poolingDesc);
+                               
jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
                }
        }
 
-       private static cudnnConvolutionDescriptor 
allocateConvolutionDescriptor(int padding [], int strides []) {
+       static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int 
padding [], int strides []) {
                cudnnConvolutionDescriptor convDesc = new 
cudnnConvolutionDescriptor();
                cudnnCreateConvolutionDescriptor(convDesc);
                cudnnSetConvolution2dDescriptor(convDesc, padding[0], 
padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
@@ -914,7 +702,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
         * @return cudnn tensor descriptor
         * @throws DMLRuntimeException if the input descriptor and matrix 
dimensions don't match
         */
-       private static cudnnTensorDescriptor allocateTensorDescriptor(int N, 
int C, int H, int W) throws DMLRuntimeException {
+       static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int 
H, int W) throws DMLRuntimeException {
                cudnnTensorDescriptor tensorDescriptor = new 
cudnnTensorDescriptor();
                cudnnCreateTensorDescriptor(tensorDescriptor);
                cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);

http://git-wip-us.apache.org/repos/asf/systemml/blob/43b573df/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
new file mode 100644
index 0000000..363cf78
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.jcudnn.cudnnConvolutionBwdDataPreference;
+import jcuda.jcudnn.cudnnConvolutionBwdFilterPreference;
+import jcuda.jcudnn.cudnnConvolutionDescriptor;
+import jcuda.jcudnn.cudnnConvolutionFwdPreference;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+
+/**
+ * This class is a wrapper that contains the data structures necessary to invoke 
+ * the cudnn convolution* functions (such as cudnnConvolutionForward, etc.)
+ * 
+ * It implements AutoCloseable to simplify the LibMatrixCuDNN code and also avoids potential memory leaks.
+ * 
+ * The caller has to use the factory methods: cudnnGetConvolutionForwardAlgorithm, 
+ * cudnnGetConvolutionBackwardFilterAlgorithm and cudnnGetConvolutionBackwardDataAlgorithm
+ * to get the LibMatrixCuDNNConvolutionAlgorithm object.
+ * The naming of these methods is consistent with that of the CuDNN library.
+ *  
+ */
+public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseable {
+       public int algo = -1;
+       public Pointer workSpace = new Pointer();
+       public long sizeInBytes = 0;
+       cudnnTensorDescriptor nchwTensorDesc = null;
+       cudnnTensorDescriptor nkpqTensorDesc = null;
+       cudnnFilterDescriptor filterDesc = null;
+       cudnnConvolutionDescriptor convDesc = null;
+       GPUContext gCtx = null; String instName = null;
+       
+       private LibMatrixCuDNNConvolutionAlgorithm(GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, 
+                       int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException {
+               int padding[] = {pad_h, pad_w};
+               int strides[] = {stride_h, stride_w};
+               convDesc = LibMatrixCuDNN.allocateConvolutionDescriptor(padding, strides);
+               this.gCtx = gCtx;
+               this.instName = instName;
+               nchwTensorDesc = LibMatrixCuDNN.allocateTensorDescriptor(N, C, H, W);
+               nkpqTensorDesc = LibMatrixCuDNN.allocateTensorDescriptor(N, K, P, Q);
+               filterDesc = LibMatrixCuDNN.allocateFilterDescriptor(K, C, R, S);
+       }
+       
+       /**
+        * Deallocates the tensor and filter descriptors as well as allocated workspace
+        */
+       @Override
+       public void close() {
+               long t3 = 0;
+               if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
+               if(nchwTensorDesc != null)
+                       cudnnDestroyTensorDescriptor(nchwTensorDesc);
+               if(nkpqTensorDesc != null)
+                       cudnnDestroyTensorDescriptor(nkpqTensorDesc);
+               if(filterDesc != null)
+                       cudnnDestroyFilterDescriptor(filterDesc);
+               if(convDesc != null)
+                       cudnnDestroyConvolutionDescriptor(convDesc);
+               if(sizeInBytes != 0)
+                       gCtx.cudaFreeHelper(instName, workSpace);
+               if(GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
+       }
+       
+       /**
+        * Factory method to get the algorithm wrapper for convolution forward
+        * 
+        * @param gCtx     a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for record {@link Statistics}.
+        * @param N        number of input images
+        * @param C        number of channels
+        * @param H        height of each image
+        * @param W        width of each image
+        * @param K        number of output "channels"
+        * @param R        height of filter
+        * @param S        width of filter
+        * @param pad_h    padding height
+        * @param pad_w    padding width
+        * @param stride_h stride height
+        * @param stride_w string width
+        * @param P        output height
+        * @param Q        output width
+        * @param workspaceLimit maximum intermediate memory to use
+        * @return algorithm wrapper
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgorithm(
+                       GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, 
+                       int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) throws DMLRuntimeException {
+               long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+               LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
+                               pad_h, pad_w, stride_h, stride_w, P, Q);
+               if(workspaceLimit <= 0) {
+                       // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement, 
+                       // i.e. CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
+                       ret.algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+               }
+               else {
+                       int[] algos = {-1};
+                       long sizeInBytesArray[] = {workspaceLimit};
+                       jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx), 
+                                       ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
+                                       cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+                       jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
+                                       ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
+                       if (sizeInBytesArray[0] != 0)
+                               ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+                       ret.sizeInBytes = sizeInBytesArray[0];
+                       ret.algo = algos[0];
+               }
+               if (GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+               return ret;
+       }
+       
+       /**
+        * Factory method to get the algorithm wrapper for convolution backward 
filter
+        * 
+        * @param gCtx     a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for recording {@link 
Statistics}.
+        * @param N        number of input images
+        * @param C        number of channels
+        * @param H        height of each image
+        * @param W        width of each image
+        * @param K        number of output "channels"
+        * @param R        height of filter
+        * @param S        width of filter
+        * @param pad_h    padding height
+        * @param pad_w    padding width
+        * @param stride_h stride height
+        * @param stride_w stride width
+        * @param P        output height
+        * @param Q        output width
+        * @param workspaceLimit maximum intermediate memory to use
+        * @return algorithm wrapper
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static LibMatrixCuDNNConvolutionAlgorithm 
cudnnGetConvolutionBackwardFilterAlgorithm(
+                       GPUContext gCtx, String instName, int N, int C, int H, 
int W, int K, int R, int S, 
+                       int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q, long workspaceLimit) throws DMLRuntimeException {
+               long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
+                               pad_h, pad_w, stride_h, stride_w, P, Q);
+               
+               if(workspaceLimit <= 0) {
+                       // If overhead is greater than intermediate allocated 
memory, prefer the cudnn operator with no memory requirement
+                       // i.e. CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
+                       ret.algo = 
jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+               }
+               else {
+                       int[] algos = {-1};
+                       long sizeInBytesArray[] = {workspaceLimit};
+                       
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
+                                       LibMatrixCuDNN.getCudnnHandle(gCtx), 
+                                       ret.nchwTensorDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.filterDesc, 
+                                       
cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
 sizeInBytesArray[0], algos);
+                       
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
+                                       ret.nchwTensorDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
+                       if (sizeInBytesArray[0] != 0)
+                               ret.workSpace = 
gCtx.allocate(sizeInBytesArray[0]);
+                       ret.sizeInBytes = sizeInBytesArray[0];
+                       ret.algo = algos[0];
+               }
+               if (GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+               return ret;
+       }
+       
+       /**
+        * Factory method to get the algorithm wrapper for convolution backward 
data
+        * 
+        * @param gCtx     a valid {@link GPUContext}
+        * @param instName the invoking instruction's name for recording {@link 
Statistics}.
+        * @param N        number of input images
+        * @param C        number of channels
+        * @param H        height of each image
+        * @param W        width of each image
+        * @param K        number of output "channels"
+        * @param R        height of filter
+        * @param S        width of filter
+        * @param pad_h    padding height
+        * @param pad_w    padding width
+        * @param stride_h stride height
+        * @param stride_w stride width
+        * @param P        output height
+        * @param Q        output width
+        * @param workspaceLimit maximum intermediate memory to use
+        * @return algorithm wrapper
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static LibMatrixCuDNNConvolutionAlgorithm 
cudnnGetConvolutionBackwardDataAlgorithm(
+                       GPUContext gCtx, String instName, int N, int C, int H, 
int W, int K, int R, int S, 
+                       int pad_h, int pad_w, int stride_h, int stride_w, int 
P, int Q, long workspaceLimit) throws DMLRuntimeException {
+               long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               LibMatrixCuDNNConvolutionAlgorithm ret = new 
LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
+                               pad_h, pad_w, stride_h, stride_w, P, Q);
+               
+               if(workspaceLimit <= 0) {
+                       // If overhead is greater than intermediate allocated 
memory, prefer the cudnn operator with no memory requirement
+                       // i.e. CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
+                       ret.algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+               }
+               else {
+                       int[] algos = {-1};
+                       long sizeInBytesArray[] = {workspaceLimit};
+                       
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
+                                       LibMatrixCuDNN.getCudnnHandle(gCtx), 
+                                       ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc,
+                                       
cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
 sizeInBytesArray[0], algos);
+                       
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
+                                       ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
+                       if (sizeInBytesArray[0] != 0)
+                               ret.workSpace = 
gCtx.allocate(sizeInBytesArray[0]);
+                       ret.sizeInBytes = sizeInBytesArray[0];
+                       ret.algo = algos[0];
+               }
+               if (GPUStatistics.DISPLAY_STATISTICS)
+                       GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+               return ret;
+       }
+}
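
For reference, a minimal caller-side sketch of the forward-algorithm wrapper above
(illustrative only, not taken from the diff): it assumes the class implements
AutoCloseable, as the @Override on close() suggests, and that gCtx, instName, the
shape parameters and workspaceLimit are supplied by the calling code in
LibMatrixCuDNN.

    // Select the forward algorithm under the given workspace budget; descriptors
    // and any allocated workspace are released when the wrapper is closed.
    try (LibMatrixCuDNNConvolutionAlgorithm fwdAlgo =
            LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionForwardAlgorithm(
                gCtx, instName, N, C, H, W, K, R, S,
                pad_h, pad_w, stride_h, stride_w, P, Q, workspaceLimit)) {
        // fwdAlgo.algo, fwdAlgo.workSpace and fwdAlgo.sizeInBytes are then handed
        // to the actual cudnnConvolutionForward invocation performed elsewhere.
    }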

http://git-wip-us.apache.org/repos/asf/systemml/blob/43b573df/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
new file mode 100644
index 0000000..b9619c8
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.runtime.JCuda.cudaMemset;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+/**
+ * Performs a slice operation: out = in[(n+1):(n+1), 1:numColumns]
+ */
+public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
+       GPUContext gCtx; String instName; int numColumns; boolean 
isInputInSparseFormat; 
+       Object inPointer; // can be either CSRPointer or Pointer 
+       Pointer outPointer;
+
+       /**
+        * Initialize the input fetcher
+        * 
+        * @param gCtx current gpu context
+        * @param instName name of the instruction
+        * @param image input matrix object.
+        * @throws DMLRuntimeException if error
+        */
+       public LibMatrixCuDNNInputRowFetcher(GPUContext gCtx, String instName, 
MatrixObject image) throws DMLRuntimeException {
+               this.gCtx = gCtx; this.instName = instName;
+               numColumns = LibMatrixCuDNN.toInt(image.getNumColumns());
+               isInputInSparseFormat = LibMatrixCuDNN.isInSparseFormat(gCtx, 
image);
+               inPointer = isInputInSparseFormat ? 
LibMatrixCuDNN.getSparsePointer(gCtx, image, instName) : 
LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
+               outPointer = gCtx.allocate(numColumns*Sizeof.DOUBLE);
+       }
+       /**
+        * Copy the nth row and return the dense pointer
+        * @param n zero-based row index
+        * @return dense pointer containing the nth row. This row is reused in 
the next iteration
+        * @throws DMLRuntimeException if error occurs
+        */
+       public Pointer getNthRow(int n) throws DMLRuntimeException {
+               if(isInputInSparseFormat) {
+                       jcuda.runtime.JCuda.cudaDeviceSynchronize();
+                       long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                       cudaMemset(outPointer, 0, numColumns*Sizeof.DOUBLE);
+                       jcuda.runtime.JCuda.cudaDeviceSynchronize();
+                       if(GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t0);
+                       LibMatrixCuDNN.sliceSparseDense(gCtx, instName, 
(CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCuDNN.toInt(numColumns-1), 
numColumns);
+               }
+               else {
+                       LibMatrixCuDNN.sliceDenseDense(gCtx, instName, 
(Pointer)inPointer, outPointer, n, n, 0, LibMatrixCuDNN.toInt(numColumns-1), 
numColumns);
+               }
+               return outPointer;
+       }
+       /**
+        * Deallocates temporary pointer
+        */
+       @Override
+       public void close() {
+               gCtx.cudaFreeHelper(outPointer, true);
+       }
+}
\ No newline at end of file
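
For reference, a minimal usage sketch of the row fetcher (illustrative only, not
taken from the diff): gCtx, instName, image and numRows stand in for whatever the
calling code in LibMatrixCuDNN supplies; the fetcher declares java.lang.AutoCloseable
above, so a try-with-resources block releases the temporary row buffer.

    // Fetch one row at a time as a dense pointer, e.g. for row-by-row CuDNN calls
    // when the full input does not fit the intermediate memory budget.
    try (LibMatrixCuDNNInputRowFetcher fetcher =
            new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
        for (int n = 0; n < numRows; n++) {
            jcuda.Pointer row = fetcher.getNthRow(n); // buffer is reused on the next call
            // ... launch the per-row CuDNN kernel on `row` here ...
        }
    }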
