Repository: systemml
Updated Branches:
  refs/heads/master 0a984a43b -> c6d499d3e


[MINOR] Refactored the locks to separate out read and write locks

- Refactoring the locks will avoid future bugs where the developer tries
  to obtain 2 write locks or a read lock on a write-locked object, etc.
- I have also added a debugging utility to track potential memory leaks.

Closes #664.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c6d499d3
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c6d499d3
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c6d499d3

Branch: refs/heads/master
Commit: c6d499d3e27a1842ccf5987ab84c92eee72aa5c2
Parents: 0a984a4
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 14 13:20:46 2017 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 14 13:23:14 2017 -0700

----------------------------------------------------------------------
 .../context/ExecutionContext.java               |  2 +-
 .../instructions/gpu/GPUInstruction.java        | 12 +++
 .../instructions/gpu/context/GPUContext.java    | 52 ++++++++++--
 .../instructions/gpu/context/GPUObject.java     | 84 ++++++++++++++------
 4 files changed, 117 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index b74c0dd..3b2436e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -374,7 +374,7 @@ public class ExecutionContext {
                }
                // The lock is added here for an output block
                // so that any block currently in use is not deallocated by 
eviction on the GPU
-               mo.getGPUObject(getGPUContext(0)).addLock();
+               mo.getGPUObject(getGPUContext(0)).addWriteLock();
                return mo;
        }
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 9a6a3bb..2aa73b4 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysml.runtime.instructions.gpu;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.lops.runtime.RunMRJobs;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -26,6 +28,7 @@ import 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.GPUInstructionParser;
 import org.apache.sysml.runtime.instructions.Instruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
@@ -46,6 +49,8 @@ public abstract class GPUInstruction extends Instruction {
                Builtin,
                MatrixIndexing
        };
+       
+       private static final Log LOG = 
LogFactory.getLog(GPUInstruction.class.getName());
 
        // Memory/conversions
        public final static String MISC_TIMER_HOST_TO_DEVICE =          "H2D";  
// time spent in bringing data to gpu (from host)
@@ -191,6 +196,13 @@ public abstract class GPUInstruction extends Instruction {
                if(DMLScript.SYNCHRONIZE_GPU) {
                        jcuda.runtime.JCuda.cudaDeviceSynchronize();
                }
+               if(LOG.isDebugEnabled()) {
+                       for(GPUContext gpuCtx : ec.getGPUContexts()) {
+                               if(gpuCtx != null)
+                                       gpuCtx.printMemoryInfo(getOpcode());
+                       }
+               }
+                       
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index a31deab..271109d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -43,6 +43,7 @@ import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.Map;
+import java.util.Map.Entry;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -130,6 +131,40 @@ public class GPUContext {
         * to launch custom CUDA kernel, specific to the active GPU for this 
GPUContext
         */
        private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>();
+       
+       /**
+        * Print information of memory usage. 
+        * 
+        * @param opcode opcode of caller
+        * @throws DMLRuntimeException if error 
+        */
+       public void printMemoryInfo(String opcode) throws DMLRuntimeException {
+               if(LOG.isDebugEnabled()) {
+                       long totalFreeCUDASpace = 0;
+                       for(Entry<Long, LinkedList<Pointer>> kv : 
freeCUDASpaceMap.entrySet()) {
+                               totalFreeCUDASpace += 
kv.getKey()*kv.getValue().size();
+                       }
+                       long readLockedAllocatedMemory = 0;
+                       long writeLockedAllocatedMemory = 0;
+                       long unlockedAllocatedMemory = 0;
+                       for(GPUObject gpuObj : allocatedGPUObjects) {
+                               if(gpuObj.readLocks.longValue() > 0)
+                                       readLockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
+                               else if(gpuObj.writeLock)
+                                       writeLockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
+                               else
+                                       unlockedAllocatedMemory += 
gpuObj.getSizeOnDevice();
+                       }
+                       long free[] = { 0 };
+                       long total[] = { 0 };
+                       cudaMemGetInfo(free, total);
+                       long gpuFreeMemory =  (long) (free[0] * 
GPU_MEMORY_UTILIZATION_FACTOR);
+                       LOG.debug(opcode + ": Total memory: " + total[0] + ", 
Free memory: " + free[0] + " (with util factor: " + gpuFreeMemory + "), "
+                                       + "Lazy unfreed memory: " + 
totalFreeCUDASpace + ", Locked allocated memory (read/write): " 
+                                       + readLockedAllocatedMemory + "/" + 
writeLockedAllocatedMemory + ", "
+                                       + " Unlocked allocated memory: " + 
unlockedAllocatedMemory);
+               }
+       }
 
        protected GPUContext(int deviceNum) throws DMLRuntimeException {
                this.deviceNum = deviceNum;
@@ -472,18 +507,19 @@ public class GPUContext {
                Collections.sort(allocatedGPUObjects, new 
Comparator<GPUObject>() {
                        @Override
                        public int compare(GPUObject p1, GPUObject p2) {
-                               long p1Val = p1.locks.get();
-                               long p2Val = p2.locks.get();
-
-                               if (p1Val > 0 && p2Val > 0) {
+                               if (p1.isLocked() && p2.isLocked()) {
                                        // Both are locked, so don't sort
                                        return 0;
-                               } else if (p1Val > 0 || p2Val > 0) {
+                               } else if (p1.isLocked()) {
                                        // Put the unlocked one to RHS
-                                       return Long.compare(p2Val, p1Val);
+                                       // a value less than 0 if x < y; and a 
value greater than 0 if x > y
+                                       return -1;
+                               } else if (p2.isLocked()) {
+                                       // Put the unlocked one to RHS
+                                       // a value less than 0 if x < y; and a 
value greater than 0 if x > y
+                                       return 1;
                                } else {
                                        // Both are unlocked
-
                                        if (evictionPolicy == 
EvictionPolicy.MIN_EVICT) {
                                                long p1Size = 0;
                                                long p2Size = 0;
@@ -510,7 +546,7 @@ public class GPUContext {
 
                while (neededSize > getAvailableMemory() && 
allocatedGPUObjects.size() > 0) {
                        GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-                       if (toBeRemoved.locks.get() > 0) {
+                       if (toBeRemoved.isLocked()) {
                                throw new DMLRuntimeException(
                                                "There is not enough memory on 
device for this matrix, request (" + neededSize + "). Allocated GPU objects:" + 
allocatedGPUObjects.toString());
                        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 2642011..4bc983e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -34,6 +34,7 @@ import static 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.LongAdder;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -99,9 +100,14 @@ public class GPUObject {
        protected boolean dirty = false;
 
        /**
-        * number of read/write locks on this object (this GPUObject is being 
used in a current instruction)
+        * number of read locks on this object (this GPUObject is being used in 
a current instruction)
         */
-       protected AtomicLong locks = new AtomicLong();
+       protected LongAdder readLocks = new LongAdder();
+       
+       /**
+        * whether write lock on this object (this GPUObject is being used in a 
current instruction)
+        */
+       protected boolean writeLock = false;
 
        /**
         * Timestamp, needed by {@link GPUContext#evict(long)}
@@ -132,7 +138,11 @@ public class GPUObject {
                        that.allocateTensorDescriptor(me.tensorShape[0], 
me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
                }
                that.dirty = me.dirty;
-               that.locks = new AtomicLong(me.locks.get());
+               // TODO Nakul: Should the locks be cloned here ?
+               // The only place clone is getting called: LibMatrixCUDA's solve
+               that.readLocks.reset();
+               that.writeLock = false;
+               
                that.timestamp = new AtomicLong(me.timestamp.get());
                that.isSparse = me.isSparse;
 
@@ -618,7 +628,7 @@ public class GPUObject {
                        copyFromHostToDevice(opcode);
                        transferred = true;
                }
-               addLock();
+               addReadLock();
                if (!isAllocated())
                        throw new DMLRuntimeException("Expected device data to 
be allocated");
                return transferred;
@@ -664,10 +674,6 @@ public class GPUObject {
                return allocated;
        }
 
-       public void addLock() {
-               locks.addAndGet(1);
-       }
-
        /**
         * if the data is allocated on the GPU and is dirty, it is copied back 
to the host memory
         *
@@ -693,22 +699,51 @@ public class GPUObject {
                }
                return copied;
        }
+       
+       public boolean isLocked() {
+               return writeLock || readLocks.longValue() > 0;
+       }
+       
+       public void addReadLock() throws DMLRuntimeException {
+               if(writeLock)
+                       throw new DMLRuntimeException("Attempting to add a read 
lock when writeLock="+ writeLock);
+               else
+                       readLocks.increment();
+       }
+       
+       public void addWriteLock() throws DMLRuntimeException {
+               if(readLocks.longValue() > 0)
+                       throw new DMLRuntimeException("Attempting to add a 
write lock when readLocks="+ readLocks.longValue());
+               else if(writeLock)
+                       throw new DMLRuntimeException("Attempting to add a 
write lock when writeLock="+ writeLock);
+               else
+                       writeLock = true;
+       }
+       
+       public void releaseReadLock() throws DMLRuntimeException {
+               readLocks.decrement();
+               if(readLocks.longValue() < 0)
+                       throw new DMLRuntimeException("Attempting to release a 
read lock when readLocks="+ readLocks.longValue());
+       }
+       
+       public void releaseWriteLock() throws DMLRuntimeException {
+               if(writeLock)
+                       writeLock = false;
+               else
+                       throw new DMLRuntimeException("Internal state error : 
Attempting to release write lock on a GPUObject, which was already released");
+       }
+       
+       public void resetReadWriteLock() {
+               readLocks.reset();
+               writeLock = false;
+       }
 
        /**
         * Updates the locks depending on the eviction policy selected
         *
         * @throws DMLRuntimeException if there is no locked GPU Object or if 
could not obtain a {@link GPUContext}
         */
-       private void updateReleaseLocks(int l) throws DMLRuntimeException {
-               int newLocks = (int) locks.addAndGet(l);
-               if (newLocks < 0) {
-                       throw new CacheException("Internal state error : 
Invalid number of locks on a GPUObject");
-               }
-
-               if(LOG.isTraceEnabled()) {
-                       LOG.trace("GPU : updateReleaseLocks, new number of 
locks is " + newLocks + ", on " + this + ", GPUContext="
-                               + getGPUContext());
-               }
+       private void updateReleaseLocks() throws DMLRuntimeException {
                GPUContext.EvictionPolicy evictionPolicy = 
getGPUContext().evictionPolicy;
                switch (evictionPolicy) {
                case LRU:
@@ -730,8 +765,8 @@ public class GPUObject {
         * @throws DMLRuntimeException if data is not allocated or if there is 
no locked GPU Object or if could not obtain a {@link GPUContext}
         */
        public void releaseInput() throws DMLRuntimeException {
-               // A read lock is a positive quantity, therefor when the lock 
is freed, a negative 1 is added
-               updateReleaseLocks(-1);
+               releaseReadLock();
+               updateReleaseLocks();
                if (!isAllocated())
                        throw new CacheException("Attempting to release an 
input before allocating it");
        }
@@ -742,8 +777,8 @@ public class GPUObject {
         * @throws DMLRuntimeException if data is not allocated or if there is 
no locked GPU Object or if could not obtain a {@link GPUContext}
         */
        public void releaseOutput() throws DMLRuntimeException {
-               // A write lock is a negative quantity, therefore when the lock 
is freed, a positive number is added
-               updateReleaseLocks(1);
+               releaseWriteLock();
+               updateReleaseLocks();
                dirty = true;
                if (!isAllocated())
                        throw new CacheException("Attempting to release an 
output before allocating it");
@@ -798,7 +833,7 @@ public class GPUObject {
                        cudnnDestroyTensorDescriptor(tensorDescriptor);
                        tensorDescriptor = null;
                }
-               locks.set(0);
+               resetReadWriteLock();
                getGPUContext().removeRecordedUsage(this);
        }
 
@@ -1061,7 +1096,8 @@ public class GPUObject {
                final StringBuilder sb = new StringBuilder("GPUObject{");
                sb.append(", 
tensorShape=").append(Arrays.toString(tensorShape));
                sb.append(", dirty=").append(dirty);
-               sb.append(", locks=").append(locks);
+               sb.append(", readLocks=").append(readLocks.longValue());
+               sb.append(", writeLock=").append(writeLock);
                sb.append(", sparse? ").append(isSparse);
                sb.append(", 
dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
                sb.append('}');

Reply via email to