Repository: systemml
Updated Branches:
  refs/heads/master f86879bd0 -> ec5dfda57


[MINOR] gpu memory leak fix

- Changed the list of free pointers to a set of free pointers for GPU memory
- Changed thread-local CUDA handles to non-thread-local ones. This assumes
there will be one thread per GPU.

Closes #665


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ec5dfda5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ec5dfda5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ec5dfda5

Branch: refs/heads/master
Commit: ec5dfda57a42b172886dd5d42bfe3b034b30c7b7
Parents: f86879b
Author: Nakul Jindal <naku...@gmail.com>
Authored: Tue Sep 19 14:57:16 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Tue Sep 19 14:57:16 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/CMakeLists.txt                     |   4 +-
 .../instructions/gpu/context/GPUContext.java    | 313 +++++++++++--------
 2 files changed, 179 insertions(+), 138 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ec5dfda5/src/main/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 47555bf..04e12b4 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -29,6 +29,7 @@ option(USE_INTEL_MKL "Whether to use Intel MKL (Defaults to 
compiling with Intel
 
 # Build a shared libraray
 add_library(systemml SHARED libmatrixdnn.cpp  libmatrixmult.cpp  systemml.cpp)
+set_target_properties(systemml PROPERTIES MACOSX_RPATH 1)
 
 set(MATH_LIBRARIES "")
 
@@ -72,5 +73,6 @@ if (USE_OPEN_BLAS)
   find_package(OpenMP REQUIRED)
   set_target_properties(systemml PROPERTIES LINK_FLAGS "${OpenMP_CXX_FLAGS} 
${MATH_LIBRARIES}")
 elseif(USE_INTEL_MKL)
-  set_target_properties(systemml PROPERTIES LINK_FLAGS ${MATH_LIBRARIES}")
+  set_target_properties(systemml PROPERTIES LINK_FLAGS "${MATH_LIBRARIES}")
 endif()
+

http://git-wip-us.apache.org/repos/asf/systemml/blob/ec5dfda5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 271109d..8a823cc 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -41,9 +41,11 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
-import java.util.LinkedList;
+import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -88,83 +90,50 @@ public class GPUContext {
         * active device assigned to this GPUContext instance
         */
        private final int deviceNum;
-       // Invoke cudaMemGetInfo to get available memory information. Useful if 
GPU is shared among multiple application.
-       public double GPU_MEMORY_UTILIZATION_FACTOR = 
ConfigurationManager.getDMLConfig()
-                       
.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-       /**
-        * Map of free blocks allocate on GPU. maps size_of_block -> pointer on 
GPU
-        */
-       private LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new 
LRUCacheMap<>();
-       /**
-        * To record size of allocated blocks
-        */
-       private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
-       /**
-        * list of allocated {@link GPUObject} instances allocated on {@link 
GPUContext#deviceNum} GPU
-        * These are matrices allocated on the GPU on which rmvar hasn't been 
called yet.
-        * If a {@link GPUObject} has more than one lock on it, it cannot be 
freed
-        * If it has zero locks on it, it can be freed, but it is preferrable 
to keep it around
-        * so that an extraneous host to dev transfer can be avoided
-        */
-       private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
        /**
         * cudnnHandle for Deep Neural Network operations on the GPU
         */
-       private final ThreadLocal<cudnnHandle> cudnnHandle = new 
ThreadLocal<>();
+       private cudnnHandle cudnnHandle;
        /**
         * cublasHandle for BLAS operations on the GPU
         */
-       private final ThreadLocal<cublasHandle> cublasHandle = new 
ThreadLocal<>();
+       private cublasHandle cublasHandle;
        /**
         * cusparseHandle for certain sparse BLAS operations on the GPU
         */
-       private final ThreadLocal<cusparseHandle> cusparseHandle = new 
ThreadLocal<>();
+       private cusparseHandle cusparseHandle;
        /**
         * cusolverDnHandle for invoking solve() function on dense matrices on 
the GPU
         */
-       private final ThreadLocal<cusolverDnHandle> cusolverDnHandle = new 
ThreadLocal<>();
+       private cusolverDnHandle cusolverDnHandle;
        /**
         * cusolverSpHandle for invoking solve() function on sparse matrices on 
the GPU
         */
-       private final ThreadLocal<cusolverSpHandle> cusolverSpHandle = new 
ThreadLocal<>();
+       private cusolverSpHandle cusolverSpHandle;
        /**
         * to launch custom CUDA kernel, specific to the active GPU for this 
GPUContext
         */
-       private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>();
-       
+       private JCudaKernels kernels;
+
+       // Invoke cudaMemGetInfo to get available memory information. Useful if 
GPU is shared among multiple application.
+       public double GPU_MEMORY_UTILIZATION_FACTOR = 
ConfigurationManager.getDMLConfig()
+                       
.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
        /**
-        * Print information of memory usage. 
-        * 
-        * @param opcode opcode of caller
-        * @throws DMLRuntimeException if error 
+        * Map of free blocks allocate on GPU. maps size_of_block -> pointer on 
GPU
         */
-       public void printMemoryInfo(String opcode) throws DMLRuntimeException {
-               if(LOG.isDebugEnabled()) {
-                       long totalFreeCUDASpace = 0;
-                       for(Entry<Long, LinkedList<Pointer>> kv : 
freeCUDASpaceMap.entrySet()) {
-                               totalFreeCUDASpace += 
kv.getKey()*kv.getValue().size();
-                       }
-                       long readLockedAllocatedMemory = 0;
-                       long writeLockedAllocatedMemory = 0;
-                       long unlockedAllocatedMemory = 0;
-                       for(GPUObject gpuObj : allocatedGPUObjects) {
-                               if(gpuObj.readLocks.longValue() > 0)
-                                       readLockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
-                               else if(gpuObj.writeLock)
-                                       writeLockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
-                               else
-                                       unlockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
-                       }
-                       long free[] = { 0 };
-                       long total[] = { 0 };
-                       cudaMemGetInfo(free, total);
-                       long gpuFreeMemory =  (long) (free[0] * 
GPU_MEMORY_UTILIZATION_FACTOR);
-                       LOG.debug(opcode + ": Total memory: " + total[0] + ", 
Free memory: " + free[0] + " (with util factor: " + gpuFreeMemory + "), "
-                                       + "Lazy unfreed memory: " + 
totalFreeCUDASpace + ", Locked allocated memory (read/write): " 
-                                       + readLockedAllocatedMemory + "/" + 
writeLockedAllocatedMemory + ", "
-                                       + " Unlocked allocated memory: " + 
unlockedAllocatedMemory);
-               }
-       }
+       private LRUCacheMap<Long, Set<Pointer>> freeCUDASpaceMap = new 
LRUCacheMap<>();
+       /**
+        * To record size of allocated blocks
+        */
+       private HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<>();
+       /**
+        * list of allocated {@link GPUObject} instances allocated on {@link 
GPUContext#deviceNum} GPU
+        * These are matrices allocated on the GPU on which rmvar hasn't been 
called yet.
+        * If a {@link GPUObject} has more than one lock on it, it cannot be 
freed
+        * If it has zero locks on it, it can be freed, but it is preferrable 
to keep it around
+        * so that an extraneous host to dev transfer can be avoided
+        */
+       private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>();
 
        protected GPUContext(int deviceNum) throws DMLRuntimeException {
                this.deviceNum = deviceNum;
@@ -187,61 +156,97 @@ public class GPUContext {
                LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, 
Available: " + (free[0] * (1e-6)) + " MB on "
                                + this);
 
-               if(GPUContextPool.initialGPUMemBudget() > 
OptimizerUtils.getLocalMemBudget()) {
-                       LOG.warn("Potential under-utilization: GPU memory (" + 
GPUContextPool.initialGPUMemBudget() 
+               if (GPUContextPool.initialGPUMemBudget() > 
OptimizerUtils.getLocalMemBudget()) {
+                       LOG.warn("Potential under-utilization: GPU memory (" + 
GPUContextPool.initialGPUMemBudget()
                                        + ") > driver memory budget (" + 
OptimizerUtils.getLocalMemBudget() + "). "
                                        + "Consider increasing the driver 
memory budget.");
                }
        }
 
+       /**
+        * Returns which device is currently being used.
+        *
+        * @return the current device for the calling host thread
+        */
+       public static int cudaGetDevice() {
+               int[] device = new int[1];
+               JCuda.cudaGetDevice(device);
+               return device[0];
+       }
+
+       /**
+        * Print information of memory usage.
+        *
+        * @param opcode opcode of caller
+        * @throws DMLRuntimeException if error
+        */
+       public void printMemoryInfo(String opcode) throws DMLRuntimeException {
+               if (LOG.isDebugEnabled()) {
+                       long totalFreeCUDASpace = 0;
+                       for (Entry<Long, Set<Pointer>> kv : 
freeCUDASpaceMap.entrySet()) {
+                               totalFreeCUDASpace += kv.getKey() * 
kv.getValue().size();
+                       }
+                       long readLockedAllocatedMemory = 0;
+                       long writeLockedAllocatedMemory = 0;
+                       long unlockedAllocatedMemory = 0;
+                       for (GPUObject gpuObj : allocatedGPUObjects) {
+                               if (gpuObj.readLocks.longValue() > 0)
+                                       readLockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
+                               else if (gpuObj.writeLock)
+                                       writeLockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
+                               else
+                                       unlockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
+                       }
+                       long free[] = { 0 };
+                       long total[] = { 0 };
+                       cudaMemGetInfo(free, total);
+                       long gpuFreeMemory = (long) (free[0] * 
GPU_MEMORY_UTILIZATION_FACTOR);
+                       LOG.debug(opcode + ": Total memory: " + total[0] + ", 
Free memory: " + free[0] + " (with util factor: "
+                                       + gpuFreeMemory + "), " + "Lazy unfreed 
memory: " + totalFreeCUDASpace
+                                       + ", Locked allocated memory 
(read/write): " + readLockedAllocatedMemory + "/"
+                                       + writeLockedAllocatedMemory + ", " + " 
Unlocked allocated memory: " + unlockedAllocatedMemory);
+               }
+       }
+
        private void initializeCudaLibraryHandles() throws DMLRuntimeException {
-               if (cudnnHandle.get() == null) {
-                       cudnnHandle.set(new cudnnHandle());
-                       cudnnCreate(cudnnHandle.get());
+               deleteCudaLibraryHandles();
+
+               if (cudnnHandle == null) {
+                       cudnnHandle = new cudnnHandle();
+                       cudnnCreate(cudnnHandle);
                }
 
-               if (cublasHandle.get() == null) {
-                       cublasHandle.set(new cublasHandle());
-                       cublasCreate(cublasHandle.get());
+               if (cublasHandle == null) {
+                       cublasHandle = new cublasHandle();
+                       cublasCreate(cublasHandle);
                }
                // For cublas v2, cublasSetPointerMode tells Cublas whether to 
expect scalar arguments on device or on host
                // This applies to arguments like "alpha" in Dgemm, and "y" in 
Ddot.
                // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, 
cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
 
-               if (cusparseHandle.get() == null) {
-                       cusparseHandle.set(new cusparseHandle());
-                       cusparseCreate(cusparseHandle.get());
+               if (cusparseHandle == null) {
+                       cusparseHandle = new cusparseHandle();
+                       cusparseCreate(cusparseHandle);
                }
 
-               if (cusolverDnHandle.get() == null) {
-                       cusolverDnHandle.set(new cusolverDnHandle());
-                       cusolverDnCreate(cusolverDnHandle.get());
+               if (cusolverDnHandle == null) {
+                       cusolverDnHandle = new cusolverDnHandle();
+                       cusolverDnCreate(cusolverDnHandle);
                }
 
-               if (cusolverSpHandle.get() == null) {
-                       cusolverSpHandle.set(new cusolverSpHandle());
-                       cusolverSpCreate(cusolverSpHandle.get());
+               if (cusolverSpHandle == null) {
+                       cusolverSpHandle = new cusolverSpHandle();
+                       cusolverSpCreate(cusolverSpHandle);
                }
 
-               if (kernels.get() == null) {
-                       kernels.set(new JCudaKernels());
+               if (kernels == null) {
+                       kernels = new JCudaKernels();
                }
        }
 
        /**
-        * Returns which device is currently being used.
-        * 
-        * @return the current device for the calling host thread
-        */
-       public static int cudaGetDevice() {
-               int[] device = new int[1];
-               JCuda.cudaGetDevice(device);
-               return device[0];
-       }
-
-       /**
         * Returns which device is assigned to this GPUContext instance.
-        * 
+        *
         * @return active device assigned to this GPUContext instance
         */
        public int getDeviceNum() {
@@ -254,7 +259,7 @@ public class GPUContext {
         * {@link 
org.apache.sysml.runtime.controlprogram.context.ExecutionContext#getGPUContext(int)}
         * If in a multi-threaded environment like parfor, this method must be 
called when in the
         * appropriate thread.
-        * 
+        *
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void initializeThread() throws DMLRuntimeException {
@@ -300,24 +305,29 @@ public class GPUContext {
                long t0 = 0, t1 = 0, end = 0;
                Pointer A;
                if (freeCUDASpaceMap.containsKey(size)) {
-                       if(LOG.isTraceEnabled()) {
-                               LOG.trace("GPU : in allocate from instruction " 
+ instructionName + ", found free block of size " + (size
-                                       / 1024.0) + " Kbytes from previously 
allocated block on " + this);
+                       if (LOG.isTraceEnabled()) {
+                               LOG.trace(
+                                               "GPU : in allocate from 
instruction " + instructionName + ", found free block of size " + (size
+                                                               / 1024.0) + " 
Kbytes from previously allocated block on " + this);
                        }
                        if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
                                t0 = System.nanoTime();
-                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
-                       A = freeList.pop();
+                       Set<Pointer> freeList = freeCUDASpaceMap.get(size);
+
+                       Iterator<Pointer> it = freeList.iterator(); // at this 
point, freeList should have at least one element
+                       A = it.next();
+                       it.remove();
+
                        if (freeList.isEmpty())
                                freeCUDASpaceMap.remove(size);
                        if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
                                GPUStatistics
-                               .maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
+                                               
.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, 
System.nanoTime() - t0);
                } else {
-                       if(LOG.isTraceEnabled()) {
+                       if (LOG.isTraceEnabled()) {
                                LOG.trace(
-                                       "GPU : in allocate from instruction " + 
instructionName + ", allocating new block of size " + (size
-                                                       / 1024.0) + " Kbytes on 
" + this);
+                                               "GPU : in allocate from 
instruction " + instructionName + ", allocating new block of size " + (
+                                                               size / 1024.0) 
+ " Kbytes on " + this);
                        }
                        if (DMLScript.STATISTICS)
                                t0 = System.nanoTime();
@@ -335,9 +345,9 @@ public class GPUContext {
                // Set all elements to 0 since newly allocated space will 
contain garbage
                if (DMLScript.STATISTICS)
                        t1 = System.nanoTime();
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", setting block of size " + (size / 1024.0)
-                               + " Kbytes to zero on " + this);
+               if (LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", setting block of size " + (size
+                                       / 1024.0) + " Kbytes to zero on " + 
this);
                }
                cudaMemset(A, 0, size);
                if (DMLScript.STATISTICS)
@@ -394,13 +404,14 @@ public class GPUContext {
                if (toFree == dummy) // trying to free a null pointer
                        return;
                long t0 = 0;
-               if(!cudaBlockSizeMap.containsKey(toFree))
-                       throw new RuntimeException("ERROR : Internal state 
corrupted, cache block size map is not aware of a block it trying to free up");
+               if (!cudaBlockSizeMap.containsKey(toFree))
+                       throw new RuntimeException(
+                                       "ERROR : Internal state corrupted, 
cache block size map is not aware of a block it trying to free up");
                long size = cudaBlockSizeMap.get(toFree);
                if (eager) {
-                       if(LOG.isTraceEnabled()) {
-                               LOG.trace("GPU : eagerly freeing cuda memory [ 
" + toFree + " ] for instruction " + instructionName + " on "
-                                       + this);
+                       if (LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : eagerly freeing cuda memory [ 
" + toFree + " ] for instruction " + instructionName
+                                               + " on " + this);
                        }
                        if (DMLScript.STATISTICS)
                                t0 = System.nanoTime();
@@ -414,12 +425,12 @@ public class GPUContext {
                                
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
                                                System.nanoTime() - t0);
                } else {
-                       if(LOG.isTraceEnabled()) {
+                       if (LOG.isTraceEnabled()) {
                                LOG.trace("GPU : lazily freeing cuda memory for 
instruction " + instructionName + " on " + this);
                        }
-                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
+                       Set<Pointer> freeList = freeCUDASpaceMap.get(size);
                        if (freeList == null) {
-                               freeList = new LinkedList<Pointer>();
+                               freeList = new HashSet<>();
                                freeCUDASpaceMap.put(size, freeList);
                        }
                        if (freeList.contains(toFree))
@@ -446,7 +457,7 @@ public class GPUContext {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        void ensureFreeSpace(String instructionName, long size) throws 
DMLRuntimeException {
-               if(size < 0 )
+               if (size < 0)
                        throw new DMLRuntimeException("The size cannot be 
negative:" + size);
                else if (size >= getAvailableMemory())
                        evict(instructionName, size);
@@ -477,20 +488,24 @@ public class GPUContext {
         * @throws DMLRuntimeException If no reusable memory blocks to free up 
or if not enough matrix blocks with zero locks on them.
         */
        protected void evict(String instructionName, final long neededSize) 
throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
+               if (LOG.isTraceEnabled()) {
                        LOG.trace("GPU : evict called from " + instructionName 
+ " for size " + neededSize + " on " + this);
                }
                GPUStatistics.cudaEvictionCount.add(1);
                // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
                // to free up space
-               LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = 
freeCUDASpaceMap;
+               LRUCacheMap<Long, Set<Pointer>> lruCacheMap = freeCUDASpaceMap;
                while (lruCacheMap.size() > 0) {
                        if (neededSize <= getAvailableMemory())
                                break;
-                       Map.Entry<Long, LinkedList<Pointer>> toFreeListPair = 
lruCacheMap.removeAndGetLRUEntry();
-                       LinkedList<Pointer> toFreeList = 
toFreeListPair.getValue();
+                       Map.Entry<Long, Set<Pointer>> toFreeListPair = 
lruCacheMap.removeAndGetLRUEntry();
+                       Set<Pointer> toFreeList = toFreeListPair.getValue();
                        Long size = toFreeListPair.getKey();
-                       Pointer toFree = toFreeList.pop();
+
+                       Iterator<Pointer> it = toFreeList.iterator(); // at 
this point, freeList should have at least one element
+                       Pointer toFree = it.next();
+                       it.remove();
+
                        if (toFreeList.isEmpty())
                                lruCacheMap.remove(size);
                        cudaFreeHelper(instructionName, toFree, true);
@@ -548,7 +563,8 @@ public class GPUContext {
                        GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
                        if (toBeRemoved.isLocked()) {
                                throw new DMLRuntimeException(
-                                               "There is not enough memory on 
device for this matrix, request (" + neededSize + "). Allocated GPU objects:" + 
allocatedGPUObjects.toString());
+                                               "There is not enough memory on 
device for this matrix, request (" + neededSize
+                                                               + "). Allocated 
GPU objects:" + allocatedGPUObjects.toString());
                        }
                        if (toBeRemoved.dirty) {
                                toBeRemoved.copyFromDeviceToHost();
@@ -629,7 +645,7 @@ public class GPUContext {
 
        /**
         * Instantiates a new {@link GPUObject} initialized with the given 
{@link org.apache.sysml.runtime.controlprogram.caching.MatrixObject 
MatrixObject}.
-        * 
+        *
         * @param mo a {@link 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject MatrixObject} that 
represents a matrix
         * @return a new {@link GPUObject} instance
         */
@@ -693,56 +709,56 @@ public class GPUContext {
 
        /**
         * Returns the cudnnHandle for Deep Neural Network operations on the 
GPU.
-        * 
+        *
         * @return cudnnHandle for current thread
         */
        public cudnnHandle getCudnnHandle() {
-               return cudnnHandle.get();
+               return cudnnHandle;
        }
 
        /**
         * Returns cublasHandle for BLAS operations on the GPU.
-        * 
+        *
         * @return cublasHandle for current thread
         */
        public cublasHandle getCublasHandle() {
-               return cublasHandle.get();
+               return cublasHandle;
        }
 
        /**
         * Returns cusparseHandle for certain sparse BLAS operations on the GPU.
-        * 
+        *
         * @return cusparseHandle for current thread
         */
        public cusparseHandle getCusparseHandle() {
-               return cusparseHandle.get();
+               return cusparseHandle;
        }
 
        /**
         * Returns cusolverDnHandle for invoking solve() function on dense 
matrices on the GPU.
-        * 
+        *
         * @return cusolverDnHandle for current thread
         */
        public cusolverDnHandle getCusolverDnHandle() {
-               return cusolverDnHandle.get();
+               return cusolverDnHandle;
        }
 
        /**
         * Returns cusolverSpHandle for invoking solve() function on sparse 
matrices on the GPU.
-        * 
+        *
         * @return cusolverSpHandle for current thread
         */
        public cusolverSpHandle getCusolverSpHandle() {
-               return cusolverSpHandle.get();
+               return cusolverSpHandle;
        }
 
        /**
         * Returns utility class used to launch custom CUDA kernel, specific to 
the active GPU for this GPUContext.
-        * 
+        *
         * @return {@link JCudaKernels} for current thread
         */
        public JCudaKernels getKernels() {
-               return kernels.get();
+               return kernels;
        }
 
        /**
@@ -751,15 +767,38 @@ public class GPUContext {
         * @throws DMLRuntimeException if error
         */
        public void destroy() throws DMLRuntimeException {
-               if(LOG.isTraceEnabled()) {
+               if (LOG.isTraceEnabled()) {
                        LOG.trace("GPU : this context was destroyed, this = " + 
this.toString());
                }
                clearMemory();
-               cudnnDestroy(cudnnHandle.get());
-               cublasDestroy(cublasHandle.get());
-               cusparseDestroy(cusparseHandle.get());
-               cusolverDnDestroy(cusolverDnHandle.get());
-               cusolverSpDestroy(cusolverSpHandle.get());
+
+               deleteCudaLibraryHandles();
+       }
+
+       /**
+        *      Deletes CUDA library handles
+        */
+       private void deleteCudaLibraryHandles() {
+               if (cudnnHandle != null)
+                       cudnnDestroy(cudnnHandle);
+
+               if (cublasHandle != null)
+                       cublasDestroy(cublasHandle);
+
+               if (cusparseHandle != null)
+                       cusparseDestroy(cusparseHandle);
+
+               if (cusolverDnHandle != null)
+                       cusolverDnDestroy(cusolverDnHandle);
+
+               if (cusolverSpHandle != null)
+                       cusolverSpDestroy(cusolverSpHandle);
+
+               cudnnHandle = null;
+               cublasHandle = null;
+               cusparseHandle = null;
+               cusolverDnHandle = null;
+               cusolverSpHandle = null;
        }
 
        /**
@@ -817,7 +856,7 @@ public class GPUContext {
                }
 
                // garbage collect all temporarily allocated spaces
-               for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
+               for (Set<Pointer> l : freeCUDASpaceMap.values()) {
                        for (Pointer p : l) {
                                cudaFreeHelper(p, true);
                        }

Reply via email to