Repository: systemml
Updated Branches:
  refs/heads/master 86b3090ba -> b56612f02


[SYSTEMML-445] Fixed a memory leak in GPU lstm builtin function and also
added developer utility to debug such bugs in the future.

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b56612f0
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b56612f0
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b56612f0

Branch: refs/heads/master
Commit: b56612f0231c7fe7abc100bf6f296bdb393aa971
Parents: 86b3090
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Fri Jul 6 09:53:22 2018 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Fri Jul 6 09:53:22 2018 -0700

----------------------------------------------------------------------
 .../instructions/gpu/context/CSRPointer.java    |  4 +-
 .../instructions/gpu/context/GPUContext.java    | 10 ----
 .../gpu/context/GPUMemoryManager.java           | 54 +++++++++++---------
 .../instructions/gpu/context/GPUObject.java     | 12 ++---
 .../runtime/matrix/data/LibMatrixCUDA.java      |  2 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |  5 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |  6 +--
 .../data/LibMatrixCuDNNInputRowFetcher.java     |  2 +-
 .../matrix/data/LibMatrixCuDNNRnnAlgorithm.java | 31 ++++++-----
 .../runtime/matrix/data/LibMatrixCuMatMult.java |  2 +-
 .../SinglePrecisionCudaSupportFunctions.java    |  4 +-
 11 files changed, 69 insertions(+), 63 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index ff14ec4..d9be663 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -299,7 +299,7 @@ public class CSRPointer {
                cusparseSetPointerMode(handle, 
cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
                //cudaDeviceSynchronize;
                // Do not increment the cudaCount of allocations on GPU
-               C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
+               C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1));
        }
 
        /**
@@ -413,7 +413,7 @@ public class CSRPointer {
        }
 
        private Pointer allocate(long size) {
-               return getGPUContext().allocate(size);
+               return getGPUContext().allocate(null, size);
        }
 
        private GPUContext getGPUContext() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 4c0d1eb..2ac92a7 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -183,16 +183,6 @@ public class GPUContext {
        }
 
        /**
-        * Convenience method for {@link #allocate(String, long)}.
-        *
-        * @param size size of data (in bytes) to allocate
-        * @return jcuda pointer
-        */
-       public Pointer allocate(long size) {
-               return memoryManager.malloc(null, size);
-       }
-
-       /**
         * Invokes memory manager's malloc method
         *
         * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index 35a31be..45611a4 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -28,6 +28,7 @@ import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.concurrent.atomic.LongAdder;
 import java.util.stream.Collectors;
@@ -50,6 +51,11 @@ import jcuda.Pointer;
 public class GPUMemoryManager {
        protected static final Log LOG = 
LogFactory.getLog(GPUMemoryManager.class.getName());
        
+       // Developer flag: Use this flag to check for GPU memory leak in 
SystemML.  
+       // This has an additional overhead of maintaining stack trace of all 
the allocated GPU pointers via PointerInfo class.
+       private static final boolean DEBUG_MEMORY_LEAK = false;
+       private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 
7, 8, 9, 10}; // Avoids printing too much text while debugging
+       
        
/*****************************************************************************************/
        // GPU Memory is divided into three major sections:
        // 1. Matrix Memory: Memory allocated to matrices in SystemML and 
addressable by GPUObjects.
@@ -109,7 +115,7 @@ public class GPUMemoryManager {
                private long sizeInBytes;
                private StackTraceElement[] stackTraceElements;
                public PointerInfo(long sizeInBytes) {
-                       if(DMLScript.PRINT_GPU_MEMORY_INFO) {
+                       if(DEBUG_MEMORY_LEAK) {
                                this.stackTraceElements = 
Thread.currentThread().getStackTrace();
                        }
                        this.sizeInBytes = sizeInBytes;
@@ -196,6 +202,7 @@ public class GPUMemoryManager {
            }
        }
        
+       
        /**
         * Allocate pointer of the given size in bytes.
         * 
@@ -207,6 +214,10 @@ public class GPUMemoryManager {
                if(size < 0) {
                        throw new DMLRuntimeException("Cannot allocate memory 
of size " + byteCountToDisplaySize(size));
                }
+               if(DEBUG_MEMORY_LEAK) {
+                       LOG.info("GPU Memory info during malloc:" + toString());
+               }
+               
                long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                long mallocStart = 0;
                // Step 1: First try reusing exact match in rmvarGPUPointers to 
avoid holes in the GPU memory
@@ -395,25 +406,23 @@ public class GPUMemoryManager {
        }
        
        // --------------- Developer Utilities to debug potential memory leaks 
------------------------
-       @SuppressWarnings("unused")
-       private void printPointers(List<PointerInfo> pointers) {
-               for(PointerInfo ptrInfo : pointers) {
-                       System.out.println(">>" + 
-                                       // 
getCallerInfo(ptrInfo.stackTraceElements, 5) + 
getCallerInfo(ptrInfo.stackTraceElements, 6) + 
getCallerInfo(ptrInfo.stackTraceElements, 7) +
-                                       
getCallerInfo(ptrInfo.stackTraceElements, 8) + 
getCallerInfo(ptrInfo.stackTraceElements, 9) + 
getCallerInfo(ptrInfo.stackTraceElements, 10));
-               }
-       }
-       
-       @SuppressWarnings("unused")
        private void printPointers(Set<Pointer> pointers, StringBuilder sb) {
+               HashMap<String, Integer> frequency = new HashMap<>();
                for(Pointer ptr : pointers) {
                        PointerInfo ptrInfo = allPointers.get(ptr);
-                       sb.append(">>");
-                       // getCallerInfo(ptrInfo.stackTraceElements, 5) + 
getCallerInfo(ptrInfo.stackTraceElements, 6) + 
getCallerInfo(ptrInfo.stackTraceElements, 7) +
-                       sb.append(getCallerInfo(ptrInfo.stackTraceElements, 8));
-                       sb.append(getCallerInfo(ptrInfo.stackTraceElements, 9));
-                       sb.append(getCallerInfo(ptrInfo.stackTraceElements, 
10));
-                       sb.append("\n");
+                       String key = "";
+                       for(int index : DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH) {
+                               key += 
getCallerInfo(ptrInfo.stackTraceElements, index);
+                       }
+                       if(frequency.containsKey(key)) {
+                               frequency.put(key, frequency.get(key)+1);
+                       }
+                       else {
+                               frequency.put(key, 1);
+                       }
+               }
+               for(Entry<String, Integer> kv : frequency.entrySet()) {
+                       sb.append(">>" + kv.getKey() + " => " + kv.getValue() + 
"\n");
                }
        }
        // 
--------------------------------------------------------------------------------------------
@@ -566,6 +575,7 @@ public class GPUMemoryManager {
        /**
         * Print debugging information
         */
+       @SuppressWarnings("unused")
        public String toString() {
                long sizeOfLockedGPUObjects = 0; int numLockedGPUObjects = 0; 
int numLockedPointers = 0;
                long sizeOfUnlockedDirtyGPUObjects = 0; int 
numUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyPointers = 0;
@@ -605,12 +615,10 @@ public class GPUMemoryManager {
                        totalSizePotentiallyLeakyPointers += size;
                }
                StringBuilder ret = new StringBuilder();
-               //if(DMLScript.PRINT_GPU_MEMORY_INFO) {
-               //      if(potentiallyLeakyPointers.size() > 0) {
-               //              ret.append("Non-matrix pointers were allocated 
by:\n");
-               //              printPointers(potentiallyLeakyPointers, ret);
-               //      }
-               //}
+               if(DEBUG_MEMORY_LEAK && potentiallyLeakyPointers.size() > 0) {
+                       ret.append("Non-matrix pointers were allocated by:\n");
+                       printPointers(potentiallyLeakyPointers, ret);
+               }
                
ret.append("\n====================================================\n");
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", 
                                "Num Objects", "Num Pointers", "Size"));

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 328d1d4..6125d15 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -183,7 +183,7 @@ public class GPUObject {
        }
 
        private Pointer allocate(long size) {
-               return getGPUContext().allocate(size);
+               return getGPUContext().allocate(null, size);
        }
 
        private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
@@ -212,7 +212,7 @@ public class GPUObject {
                Pointer alpha = LibMatrixCUDA.one();
                Pointer beta = LibMatrixCUDA.zero();
                Pointer A = densePtr;
-               Pointer C = gCtx.allocate(((long) m) * getDatatypeSizeOf(n));
+               Pointer C = gCtx.allocate(null, ((long) m) * 
getDatatypeSizeOf(n));
 
                // Transpose the matrix to get a dense matrix
                
LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), 
CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
@@ -240,8 +240,8 @@ public class GPUObject {
                Pointer nnzPerRowPtr = null;
                Pointer nnzTotalDevHostPtr = null;
 
-               nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows));
-               nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
+               nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows));
+               nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1));
 
                // Output is in dense vector format, convert it to CSR
                LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, 
cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
@@ -532,8 +532,8 @@ public class GPUObject {
                                int cols = toIntExact(mat.getNumColumns());
                                Pointer nnzPerRowPtr = null;
                                Pointer nnzTotalDevHostPtr = null;
-                               nnzPerRowPtr = 
gCtx.allocate(getIntSizeOf(rows));
-                               nnzTotalDevHostPtr = 
gCtx.allocate(getIntSizeOf(1));
+                               nnzPerRowPtr = gCtx.allocate(instName, 
getIntSizeOf(rows));
+                               nnzTotalDevHostPtr = gCtx.allocate(instName, 
getIntSizeOf(1));
                                
LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, 
cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, 
getDensePointer(), rows,
                                                nnzPerRowPtr, 
nnzTotalDevHostPtr);
                                int[] nnzC = { -1 };

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index e4fdc96..464c4c2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -2454,7 +2454,7 @@ public class LibMatrixCUDA {
                // step 4: compute QR factorization
                Pointer work = gCtx.allocate(instName, lwork[0] * 
sizeOfDataType);
                Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
-               Pointer devInfo = gCtx.allocate(Sizeof.INT);
+               Pointer devInfo = gCtx.allocate(instName, Sizeof.INT);
                if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                
cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, 
tau, work, lwork[0], devInfo);
                if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, 
System.nanoTime() - t0);

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index a692739..c6abbfe 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -436,7 +436,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                        try(LibMatrixCuDNNInputRowFetcher 
imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
                                                LibMatrixCuDNNInputRowFetcher 
doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
                                                // Perform one-input 
conv2dBackwardFilter
-                                               Pointer tempdwPointer = 
gCtx.allocate(KCRS*sizeOfDataType);
+                                               Pointer tempdwPointer = 
gCtx.allocate(instName, KCRS*sizeOfDataType);
                                                for(int n = 0; n < N; n++) {
                                                        long t0 = 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                                                        
cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType);
@@ -754,7 +754,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        if(!isMaxPoolOutputProvided) {
                                if (DMLScript.FINEGRAINED_STATISTICS) t1 = 
System.nanoTime();
                                long numBytes = N*C*P*Q*sizeOfDataType;
-                               y = gCtx.allocate(numBytes);
+                               y = gCtx.allocate(instName, numBytes);
                                if (DMLScript.FINEGRAINED_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
                                if (DMLScript.FINEGRAINED_STATISTICS) t2 = 
System.nanoTime();
                                status = 
cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, 
x, zero(), desc.yDesc, y);
@@ -976,6 +976,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                        
ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
                                        smlDx, cudnnDx, N, D, T*D, N*T*D);
                        ec.releaseMatrixOutputForGPUInstruction(dxName);
+                       gCtx.cudaFreeHelper(instName, cudnnDx, 
DMLScript.EAGER_CUDA_FREE);
                        
                        // 
-------------------------------------------------------------------------------------------
                        Pointer cudnnDwPointer = gCtx.allocate(instName, 
(D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index c95c3b3..f70b453 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -141,7 +141,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                
jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
                                ret.nchwTensorDesc, ret.filterDesc, 
ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
                if (sizeInBytesArray[0] != 0)
-                       ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+                       ret.workSpace = gCtx.allocate(instName, 
sizeInBytesArray[0]);
                ret.sizeInBytes = sizeInBytesArray[0];
                ret.algo = algos[0];
                if (DMLScript.FINEGRAINED_STATISTICS)
@@ -186,7 +186,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
                                ret.nchwTensorDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
                if (sizeInBytesArray[0] != 0)
-                       ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+                       ret.workSpace = gCtx.allocate(instName, 
sizeInBytesArray[0]);
                ret.sizeInBytes = sizeInBytesArray[0];
                ret.algo = algos[0];
                
@@ -239,7 +239,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements 
java.lang.AutoCloseab
                        
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
 
                                        ret.filterDesc, ret.nkpqTensorDesc, 
ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
                        if (sizeInBytesArray[0] != 0)
-                               ret.workSpace = 
gCtx.allocate(sizeInBytesArray[0]);
+                               ret.workSpace = gCtx.allocate(instName, 
sizeInBytesArray[0]);
                        ret.sizeInBytes = sizeInBytesArray[0];
                        ret.algo = algos[0];
                        if (DMLScript.FINEGRAINED_STATISTICS)

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index 70c33d6..0130aa6 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -49,7 +49,7 @@ public class LibMatrixCuDNNInputRowFetcher extends 
LibMatrixCUDA implements java
                numColumns = LibMatrixCUDA.toInt(image.getNumColumns());
                isInputInSparseFormat = LibMatrixCUDA.isInSparseFormat(gCtx, 
image);
                inPointer = isInputInSparseFormat ? 
LibMatrixCUDA.getSparsePointer(gCtx, image, instName) : 
LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
-               outPointer = gCtx.allocate(numColumns*sizeOfDataType);
+               outPointer = gCtx.allocate(instName, numColumns*sizeOfDataType);
        }
        /**
         * Copy the nth row and return the dense pointer

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index 68d308e..8ebc4e0 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -55,6 +55,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements 
java.lang.AutoCloseable {
        cudnnFilterDescriptor dwDesc;
        long sizeInBytes; Pointer workSpace;
        long reserveSpaceSizeInBytes; Pointer reserveSpace;
+       long dropOutSizeInBytes; Pointer dropOutStateSpace;
        public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, 
String instName, 
                        String rnnMode, int N, int T, int M, int D, boolean 
isTraining, Pointer w) throws DMLRuntimeException {
                this.gCtx = gCtx;
@@ -83,12 +84,13 @@ public class LibMatrixCuDNNRnnAlgorithm implements 
java.lang.AutoCloseable {
                // Initial dropout descriptor
                dropoutDesc = new cudnnDropoutDescriptor();
                JCudnn.cudnnCreateDropoutDescriptor(dropoutDesc);
-               long [] dropOutSizeInBytes = {-1};
-               JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), 
dropOutSizeInBytes);
-               Pointer dropOutStateSpace = new Pointer();
-               if (dropOutSizeInBytes[0] != 0)
-                       dropOutStateSpace = 
gCtx.allocate(dropOutSizeInBytes[0]);
-               JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, 
gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes[0], 12345);
+               long [] _dropOutSizeInBytes = {-1};
+               JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), 
_dropOutSizeInBytes);
+               dropOutSizeInBytes = _dropOutSizeInBytes[0];
+               dropOutStateSpace = new Pointer();
+               if (dropOutSizeInBytes != 0)
+                       dropOutStateSpace = gCtx.allocate(instName, 
dropOutSizeInBytes);
+               JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, 
gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
                
                // Initialize RNN descriptor
                rnnDesc = new cudnnRNNDescriptor();
@@ -109,18 +111,14 @@ public class LibMatrixCuDNNRnnAlgorithm implements 
java.lang.AutoCloseable {
                workSpace = new Pointer(); reserveSpace = new Pointer();
                sizeInBytes = getWorkspaceSize(T);
                if(sizeInBytes != 0)
-                       workSpace = gCtx.allocate(sizeInBytes);
+                       workSpace = gCtx.allocate(instName, sizeInBytes);
                reserveSpaceSizeInBytes = 0;
                if(isTraining) {
                        reserveSpaceSizeInBytes = getReservespaceSize(T);
                        if (reserveSpaceSizeInBytes != 0) {
-                               reserveSpace = 
gCtx.allocate(reserveSpaceSizeInBytes);
+                               reserveSpace = gCtx.allocate(instName, 
reserveSpaceSizeInBytes);
                        }
                }
-               if (reserveSpaceSizeInBytes == 0) {
-                       reserveSpace = gCtx.allocate(reserveSpaceSizeInBytes);
-               }
-               
                /*
                int numLinearLayers = getNumLinearLayers(rnnMode); 
                for(int i = 0; i < numLinearLayers; i++) {
@@ -308,6 +306,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements 
java.lang.AutoCloseable {
                                throw new RuntimeException(e);
                        }
                }
+               workSpace = null;
                if(reserveSpaceSizeInBytes != 0) {
                        try {
                                gCtx.cudaFreeHelper(instName, reserveSpace, 
DMLScript.EAGER_CUDA_FREE);
@@ -315,5 +314,13 @@ public class LibMatrixCuDNNRnnAlgorithm implements 
java.lang.AutoCloseable {
                                throw new RuntimeException(e);
                        }
                }       
+               reserveSpace = null;
+               if(dropOutSizeInBytes != 0) {
+                       try {
+                               gCtx.cudaFreeHelper(instName, 
dropOutStateSpace, DMLScript.EAGER_CUDA_FREE);
+                       } catch (DMLRuntimeException e) {
+                               throw new RuntimeException(e);
+                       }
+               }
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index adbbcb8..18739a8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -269,7 +269,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
                // t(C) = t(B) %*% t(A)
                Pointer output = null;
                if (outRLen != 1 && outCLen != 1) {
-                       output = gCtx.allocate(outRLen * outCLen * 
sizeOfDataType);
+                       output = gCtx.allocate(instName, outRLen * outCLen * 
sizeOfDataType);
                } else {
                        // no transpose required for vector output
                        output = C;

http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index 39371e6..942b56b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -179,7 +179,7 @@ public class SinglePrecisionCudaSupportFunctions implements 
CudaSupportFunctions
                // during eviction: `evict -> devictToHost -> float2double -> 
allocate -> ensureFreeSpace -> evict`. 
                // To avoid this recursion, it is necessary to perform this 
conversion in host.
                if(PERFORM_CONVERSION_ON_DEVICE && !isEviction) {
-                       Pointer deviceDoubleData = 
gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
+                       Pointer deviceDoubleData = gCtx.allocate(instName, 
((long)dest.length)*Sizeof.DOUBLE);
                        LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, 
dest.length);
                        cudaMemcpy(Pointer.to(dest), deviceDoubleData, 
((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
                        gCtx.cudaFreeHelper(instName, deviceDoubleData, 
DMLScript.EAGER_CUDA_FREE);
@@ -205,7 +205,7 @@ public class SinglePrecisionCudaSupportFunctions implements 
CudaSupportFunctions
                // TODO: Perform conversion on GPU using double2float and 
float2double kernels
                long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                if(PERFORM_CONVERSION_ON_DEVICE) {
-                       Pointer deviceDoubleData = 
gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
+                       Pointer deviceDoubleData = gCtx.allocate(instName, 
((long)src.length)*Sizeof.DOUBLE);
                        cudaMemcpy(deviceDoubleData, Pointer.to(src), 
((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
                        LibMatrixCUDA.double2float(gCtx, deviceDoubleData, 
dest, src.length);
                        gCtx.cudaFreeHelper(instName, deviceDoubleData, 
DMLScript.EAGER_CUDA_FREE);

Reply via email to