Repository: systemml Updated Branches: refs/heads/master 86b3090ba -> b56612f02
[SYSTEMML-445] Fixed a memory leak in GPU lstm builtin function and also added developer utility to debug such bugs in the future. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b56612f0 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b56612f0 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b56612f0 Branch: refs/heads/master Commit: b56612f0231c7fe7abc100bf6f296bdb393aa971 Parents: 86b3090 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Fri Jul 6 09:53:22 2018 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Fri Jul 6 09:53:22 2018 -0700 ---------------------------------------------------------------------- .../instructions/gpu/context/CSRPointer.java | 4 +- .../instructions/gpu/context/GPUContext.java | 10 ---- .../gpu/context/GPUMemoryManager.java | 54 +++++++++++--------- .../instructions/gpu/context/GPUObject.java | 12 ++--- .../runtime/matrix/data/LibMatrixCUDA.java | 2 +- .../runtime/matrix/data/LibMatrixCuDNN.java | 5 +- .../LibMatrixCuDNNConvolutionAlgorithm.java | 6 +-- .../data/LibMatrixCuDNNInputRowFetcher.java | 2 +- .../matrix/data/LibMatrixCuDNNRnnAlgorithm.java | 31 ++++++----- .../runtime/matrix/data/LibMatrixCuMatMult.java | 2 +- .../SinglePrecisionCudaSupportFunctions.java | 4 +- 11 files changed, 69 insertions(+), 63 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index ff14ec4..d9be663 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -299,7 +299,7 @@ public class CSRPointer { cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); //cudaDeviceSynchronize; // Do not increment the cudaCount of allocations on GPU - C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1)); + C.rowPtr = gCtx.allocate(null, getIntSizeOf((long) rowsC + 1)); } /** @@ -413,7 +413,7 @@ public class CSRPointer { } private Pointer allocate(long size) { - return getGPUContext().allocate(size); + return getGPUContext().allocate(null, size); } private GPUContext getGPUContext() { http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 4c0d1eb..2ac92a7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -183,16 +183,6 @@ public class GPUContext { } /** - * Convenience method for {@link #allocate(String, long)}. 
- * - * @param size size of data (in bytes) to allocate - * @return jcuda pointer - */ - public Pointer allocate(long size) { - return memoryManager.malloc(null, size); - } - - /** * Invokes memory manager's malloc method * * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index 35a31be..45611a4 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -28,6 +28,7 @@ import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.atomic.LongAdder; import java.util.stream.Collectors; @@ -50,6 +51,11 @@ import jcuda.Pointer; public class GPUMemoryManager { protected static final Log LOG = LogFactory.getLog(GPUMemoryManager.class.getName()); + // Developer flag: Use this flag to check for GPU memory leak in SystemML. + // This has an additional overhead of maintaining stack trace of all the allocated GPU pointers via PointerInfo class. + private static final boolean DEBUG_MEMORY_LEAK = false; + private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10}; // Avoids printing too much text while debugging + /*****************************************************************************************/ // GPU Memory is divided into three major sections: // 1. 
Matrix Memory: Memory allocated to matrices in SystemML and addressable by GPUObjects. @@ -109,7 +115,7 @@ public class GPUMemoryManager { private long sizeInBytes; private StackTraceElement[] stackTraceElements; public PointerInfo(long sizeInBytes) { - if(DMLScript.PRINT_GPU_MEMORY_INFO) { + if(DEBUG_MEMORY_LEAK) { this.stackTraceElements = Thread.currentThread().getStackTrace(); } this.sizeInBytes = sizeInBytes; @@ -196,6 +202,7 @@ public class GPUMemoryManager { } } + /** * Allocate pointer of the given size in bytes. * @@ -207,6 +214,10 @@ public class GPUMemoryManager { if(size < 0) { throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size)); } + if(DEBUG_MEMORY_LEAK) { + LOG.info("GPU Memory info during malloc:" + toString()); + } + long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; long mallocStart = 0; // Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory @@ -395,25 +406,23 @@ public class GPUMemoryManager { } // --------------- Developer Utilities to debug potential memory leaks ------------------------ - @SuppressWarnings("unused") - private void printPointers(List<PointerInfo> pointers) { - for(PointerInfo ptrInfo : pointers) { - System.out.println(">>" + - // getCallerInfo(ptrInfo.stackTraceElements, 5) + getCallerInfo(ptrInfo.stackTraceElements, 6) + getCallerInfo(ptrInfo.stackTraceElements, 7) + - getCallerInfo(ptrInfo.stackTraceElements, 8) + getCallerInfo(ptrInfo.stackTraceElements, 9) + getCallerInfo(ptrInfo.stackTraceElements, 10)); - } - } - - @SuppressWarnings("unused") private void printPointers(Set<Pointer> pointers, StringBuilder sb) { + HashMap<String, Integer> frequency = new HashMap<>(); for(Pointer ptr : pointers) { PointerInfo ptrInfo = allPointers.get(ptr); - sb.append(">>"); - // getCallerInfo(ptrInfo.stackTraceElements, 5) + getCallerInfo(ptrInfo.stackTraceElements, 6) + getCallerInfo(ptrInfo.stackTraceElements, 7) + - 
sb.append(getCallerInfo(ptrInfo.stackTraceElements, 8)); - sb.append(getCallerInfo(ptrInfo.stackTraceElements, 9)); - sb.append(getCallerInfo(ptrInfo.stackTraceElements, 10)); - sb.append("\n"); + String key = ""; + for(int index : DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH) { + key += getCallerInfo(ptrInfo.stackTraceElements, index); + } + if(frequency.containsKey(key)) { + frequency.put(key, frequency.get(key)+1); + } + else { + frequency.put(key, 1); + } + } + for(Entry<String, Integer> kv : frequency.entrySet()) { + sb.append(">>" + kv.getKey() + " => " + kv.getValue() + "\n"); } } // -------------------------------------------------------------------------------------------- @@ -566,6 +575,7 @@ public class GPUMemoryManager { /** * Print debugging information */ + @SuppressWarnings("unused") public String toString() { long sizeOfLockedGPUObjects = 0; int numLockedGPUObjects = 0; int numLockedPointers = 0; long sizeOfUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyPointers = 0; @@ -605,12 +615,10 @@ public class GPUMemoryManager { totalSizePotentiallyLeakyPointers += size; } StringBuilder ret = new StringBuilder(); - //if(DMLScript.PRINT_GPU_MEMORY_INFO) { - // if(potentiallyLeakyPointers.size() > 0) { - // ret.append("Non-matrix pointers were allocated by:\n"); - // printPointers(potentiallyLeakyPointers, ret); - // } - //} + if(DEBUG_MEMORY_LEAK && potentiallyLeakyPointers.size() > 0) { + ret.append("Non-matrix pointers were allocated by:\n"); + printPointers(potentiallyLeakyPointers, ret); + } ret.append("\n====================================================\n"); ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", "Num Objects", "Num Pointers", "Size")); http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 328d1d4..6125d15 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -183,7 +183,7 @@ public class GPUObject { } private Pointer allocate(long size) { - return getGPUContext().allocate(size); + return getGPUContext().allocate(null, size); } private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { @@ -212,7 +212,7 @@ public class GPUObject { Pointer alpha = LibMatrixCUDA.one(); Pointer beta = LibMatrixCUDA.zero(); Pointer A = densePtr; - Pointer C = gCtx.allocate(((long) m) * getDatatypeSizeOf(n)); + Pointer C = gCtx.allocate(null, ((long) m) * getDatatypeSizeOf(n)); // Transpose the matrix to get a dense matrix LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), @@ -240,8 +240,8 @@ public class GPUObject { Pointer nnzPerRowPtr = null; Pointer nnzTotalDevHostPtr = null; - nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows)); - nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1)); + nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows)); + nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1)); // Output is in dense vector format, convert it to CSR LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, @@ -532,8 +532,8 @@ public class GPUObject { int cols = toIntExact(mat.getNumColumns()); Pointer nnzPerRowPtr = null; Pointer nnzTotalDevHostPtr = null; - nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows)); - nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1)); + nnzPerRowPtr = gCtx.allocate(instName, getIntSizeOf(rows)); + nnzTotalDevHostPtr = gCtx.allocate(instName, getIntSizeOf(1)); 
LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows, nnzPerRowPtr, nnzTotalDevHostPtr); int[] nnzC = { -1 }; http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index e4fdc96..464c4c2 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -2454,7 +2454,7 @@ public class LibMatrixCUDA { // step 4: compute QR factorization Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType); Pointer tau = gCtx.allocate(instName, m * sizeOfDataType); - Pointer devInfo = gCtx.allocate(Sizeof.INT); + Pointer devInfo = gCtx.allocate(instName, Sizeof.INT); if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo); if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0); http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java index a692739..c6abbfe 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java @@ -436,7 +436,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { 
try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image); LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) { // Perform one-input conv2dBackwardFilter - Pointer tempdwPointer = gCtx.allocate(KCRS*sizeOfDataType); + Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType); for(int n = 0; n < N; n++) { long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType); @@ -754,7 +754,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { if(!isMaxPoolOutputProvided) { if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); long numBytes = N*C*P*Q*sizeOfDataType; - y = gCtx.allocate(numBytes); + y = gCtx.allocate(instName, numBytes); if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y); @@ -976,6 +976,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D), smlDx, cudnnDx, N, D, T*D, N*T*D); ec.releaseMatrixOutputForGPUInstruction(dxName); + gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE); // ------------------------------------------------------------------------------------------- Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType); http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java index c95c3b3..f70b453 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java @@ -141,7 +141,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); + ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; if (DMLScript.FINEGRAINED_STATISTICS) @@ -186,7 +186,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); + ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; @@ -239,7 +239,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) - ret.workSpace = gCtx.allocate(sizeInBytesArray[0]); + ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; if (DMLScript.FINEGRAINED_STATISTICS) 
http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java index 70c33d6..0130aa6 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java @@ -49,7 +49,7 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java numColumns = LibMatrixCUDA.toInt(image.getNumColumns()); isInputInSparseFormat = LibMatrixCUDA.isInSparseFormat(gCtx, image); inPointer = isInputInSparseFormat ? LibMatrixCUDA.getSparsePointer(gCtx, image, instName) : LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName); - outPointer = gCtx.allocate(numColumns*sizeOfDataType); + outPointer = gCtx.allocate(instName, numColumns*sizeOfDataType); } /** * Copy the nth row and return the dense pointer http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java index 68d308e..8ebc4e0 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java @@ -55,6 +55,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { cudnnFilterDescriptor dwDesc; long sizeInBytes; Pointer workSpace; long reserveSpaceSizeInBytes; Pointer reserveSpace; + 
long dropOutSizeInBytes; Pointer dropOutStateSpace; public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String instName, String rnnMode, int N, int T, int M, int D, boolean isTraining, Pointer w) throws DMLRuntimeException { this.gCtx = gCtx; @@ -83,12 +84,13 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { // Initial dropout descriptor dropoutDesc = new cudnnDropoutDescriptor(); JCudnn.cudnnCreateDropoutDescriptor(dropoutDesc); - long [] dropOutSizeInBytes = {-1}; - JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), dropOutSizeInBytes); - Pointer dropOutStateSpace = new Pointer(); - if (dropOutSizeInBytes[0] != 0) - dropOutStateSpace = gCtx.allocate(dropOutSizeInBytes[0]); - JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes[0], 12345); + long [] _dropOutSizeInBytes = {-1}; + JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes); + dropOutSizeInBytes = _dropOutSizeInBytes[0]; + dropOutStateSpace = new Pointer(); + if (dropOutSizeInBytes != 0) + dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes); + JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345); // Initialize RNN descriptor rnnDesc = new cudnnRNNDescriptor(); @@ -109,18 +111,14 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { workSpace = new Pointer(); reserveSpace = new Pointer(); sizeInBytes = getWorkspaceSize(T); if(sizeInBytes != 0) - workSpace = gCtx.allocate(sizeInBytes); + workSpace = gCtx.allocate(instName, sizeInBytes); reserveSpaceSizeInBytes = 0; if(isTraining) { reserveSpaceSizeInBytes = getReservespaceSize(T); if (reserveSpaceSizeInBytes != 0) { - reserveSpace = gCtx.allocate(reserveSpaceSizeInBytes); + reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes); } } - if (reserveSpaceSizeInBytes == 0) { - reserveSpace = 
gCtx.allocate(reserveSpaceSizeInBytes); - } - /* int numLinearLayers = getNumLinearLayers(rnnMode); for(int i = 0; i < numLinearLayers; i++) { @@ -308,6 +306,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { throw new RuntimeException(e); } } + workSpace = null; if(reserveSpaceSizeInBytes != 0) { try { gCtx.cudaFreeHelper(instName, reserveSpace, DMLScript.EAGER_CUDA_FREE); @@ -315,5 +314,13 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { throw new RuntimeException(e); } } + reserveSpace = null; + if(dropOutSizeInBytes != 0) { + try { + gCtx.cudaFreeHelper(instName, dropOutStateSpace, DMLScript.EAGER_CUDA_FREE); + } catch (DMLRuntimeException e) { + throw new RuntimeException(e); + } + } } } http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java index adbbcb8..18739a8 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java @@ -269,7 +269,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { // t(C) = t(B) %*% t(A) Pointer output = null; if (outRLen != 1 && outCLen != 1) { - output = gCtx.allocate(outRLen * outCLen * sizeOfDataType); + output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType); } else { // no transpose required for vector output output = C; http://git-wip-us.apache.org/repos/asf/systemml/blob/b56612f0/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java index 39371e6..942b56b 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java @@ -179,7 +179,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions // during eviction: `evict -> devictToHost -> float2double -> allocate -> ensureFreeSpace -> evict`. // To avoid this recursion, it is necessary to perform this conversion in host. if(PERFORM_CONVERSION_ON_DEVICE && !isEviction) { - Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE); + Pointer deviceDoubleData = gCtx.allocate(instName, ((long)dest.length)*Sizeof.DOUBLE); LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length); cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost); gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE); @@ -205,7 +205,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions // TODO: Perform conversion on GPU using double2float and float2double kernels long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; if(PERFORM_CONVERSION_ON_DEVICE) { - Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE); + Pointer deviceDoubleData = gCtx.allocate(instName, ((long)src.length)*Sizeof.DOUBLE); cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice); LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length); gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);