Repository: systemml Updated Branches: refs/heads/master 0a984a43b -> c6d499d3e
[MINOR] Refactored the locks to seperate out read and write lock - Refactoring the locks will avoid future bugs where the developer tries to obtain 2 write lock or a read lock on a write-locked objects, etc. - I have also added a debugging utility to track potential memory leaks. Closes #664. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c6d499d3 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c6d499d3 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c6d499d3 Branch: refs/heads/master Commit: c6d499d3e27a1842ccf5987ab84c92eee72aa5c2 Parents: 0a984a4 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Thu Sep 14 13:20:46 2017 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Thu Sep 14 13:23:14 2017 -0700 ---------------------------------------------------------------------- .../context/ExecutionContext.java | 2 +- .../instructions/gpu/GPUInstruction.java | 12 +++ .../instructions/gpu/context/GPUContext.java | 52 ++++++++++-- .../instructions/gpu/context/GPUObject.java | 84 ++++++++++++++------ 4 files changed, 117 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index b74c0dd..3b2436e 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -374,7 +374,7 @@ public class ExecutionContext { } // The lock is added here for an output block // so that any block currently in use is not deallocated by eviction on the GPU - mo.getGPUObject(getGPUContext(0)).addLock(); + mo.getGPUObject(getGPUContext(0)).addWriteLock(); return mo; } http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index 9a6a3bb..2aa73b4 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -19,6 +19,8 @@ package org.apache.sysml.runtime.instructions.gpu; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.lops.runtime.RunMRJobs; import org.apache.sysml.runtime.DMLRuntimeException; @@ -26,6 +28,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.GPUInstructionParser; import org.apache.sysml.runtime.instructions.Instruction; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.utils.GPUStatistics; @@ -46,6 +49,8 @@ public abstract class GPUInstruction extends Instruction { Builtin, MatrixIndexing }; + + private static final Log LOG = LogFactory.getLog(GPUInstruction.class.getName()); // Memory/conversions public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host) @@ -191,6 +196,13 @@ public abstract class GPUInstruction extends Instruction { if(DMLScript.SYNCHRONIZE_GPU) { jcuda.runtime.JCuda.cudaDeviceSynchronize(); } + if(LOG.isDebugEnabled()) { + for(GPUContext gpuCtx : ec.getGPUContexts()) { + if(gpuCtx != null) + gpuCtx.printMemoryInfo(getOpcode()); + } + } + } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index a31deab..271109d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -43,6 +43,7 @@ import java.util.Comparator; import java.util.HashMap; import java.util.LinkedList; import java.util.Map; +import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -130,6 +131,40 @@ public class GPUContext { * to launch custom CUDA kernel, specific to the active GPU for this GPUContext */ private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>(); + + /** + * Print information of memory usage. + * + * @param opcode opcode of caller + * @throws DMLRuntimeException if error + */ + public void printMemoryInfo(String opcode) throws DMLRuntimeException { + if(LOG.isDebugEnabled()) { + long totalFreeCUDASpace = 0; + for(Entry<Long, LinkedList<Pointer>> kv : freeCUDASpaceMap.entrySet()) { + totalFreeCUDASpace += kv.getKey()*kv.getValue().size(); + } + long readLockedAllocatedMemory = 0; + long writeLockedAllocatedMemory = 0; + long unlockedAllocatedMemory = 0; + for(GPUObject gpuObj : allocatedGPUObjects) { + if(gpuObj.readLocks.longValue() > 0) + readLockedAllocatedMemory += gpuObj.getSizeOnDevice(); + else if(gpuObj.writeLock) + writeLockedAllocatedMemory += gpuObj.getSizeOnDevice(); + else + unlockedAllocatedMemory += gpuObj.getSizeOnDevice(); + } + long free[] = { 0 }; + long total[] = { 0 }; + cudaMemGetInfo(free, total); + long gpuFreeMemory = (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR); + LOG.debug(opcode + ": Total memory: " + total[0] + ", Free memory: " + free[0] + " (with util factor: " + gpuFreeMemory + "), " + + "Lazy unfreed memory: " + totalFreeCUDASpace + ", Locked allocated memory (read/write): " + + readLockedAllocatedMemory + "/" + writeLockedAllocatedMemory + ", " + + " Unlocked allocated memory: " + unlockedAllocatedMemory); + } + } protected GPUContext(int deviceNum) throws DMLRuntimeException { this.deviceNum = deviceNum; @@ -472,18 +507,19 @@ public class GPUContext { Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() { @Override public int compare(GPUObject p1, GPUObject p2) { - long p1Val = p1.locks.get(); - long p2Val = p2.locks.get(); - - if (p1Val > 0 && p2Val > 0) { + if (p1.isLocked() && p2.isLocked()) { // Both are locked, so don't sort return 0; - } else if (p1Val > 0 || p2Val > 0) { + } else if (p1.isLocked()) { // Put the unlocked one to RHS - return Long.compare(p2Val, p1Val); + // a value less than 0 if x < y; and a value greater than 0 if x > y + return -1; + } else if (p2.isLocked()) { + // Put the unlocked one to RHS + // a value less than 0 if x < y; and a value greater than 0 if x > y + return 1; } else { // Both are unlocked - if (evictionPolicy == EvictionPolicy.MIN_EVICT) { long p1Size = 0; long p2Size = 0; @@ -510,7 +546,7 @@ public class GPUContext { while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) { GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); - if (toBeRemoved.locks.get() > 0) { + if (toBeRemoved.isLocked()) { throw new DMLRuntimeException( "There is not enough memory on device for this matrix, request (" + neededSize + "). Allocated GPU objects:" + allocatedGPUObjects.toString()); } http://git-wip-us.apache.org/repos/asf/systemml/blob/c6d499d3/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 2642011..4bc983e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -34,6 +34,7 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAdder; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -99,9 +100,14 @@ public class GPUObject { protected boolean dirty = false; /** - * number of read/write locks on this object (this GPUObject is being used in a current instruction) + * number of read locks on this object (this GPUObject is being used in a current instruction) */ - protected AtomicLong locks = new AtomicLong(); + protected LongAdder readLocks = new LongAdder(); + + /** + * whether write lock on this object (this GPUObject is being used in a current instruction) + */ + protected boolean writeLock = false; /** * Timestamp, needed by {@link GPUContext#evict(long)} @@ -132,7 +138,11 @@ public class GPUObject { that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]); } that.dirty = me.dirty; - that.locks = new AtomicLong(me.locks.get()); + // TODO Nakul: Should the locks be cloned here ? + // The only place clone is getting called: LibMatrixCUDA's solve + that.readLocks.reset(); + that.writeLock = false; + that.timestamp = new AtomicLong(me.timestamp.get()); that.isSparse = me.isSparse; @@ -618,7 +628,7 @@ public class GPUObject { copyFromHostToDevice(opcode); transferred = true; } - addLock(); + addReadLock(); if (!isAllocated()) throw new DMLRuntimeException("Expected device data to be allocated"); return transferred; @@ -664,10 +674,6 @@ public class GPUObject { return allocated; } - public void addLock() { - locks.addAndGet(1); - } - /** * if the data is allocated on the GPU and is dirty, it is copied back to the host memory * @@ -693,22 +699,51 @@ public class GPUObject { } return copied; } + + public boolean isLocked() { + return writeLock || readLocks.longValue() > 0; + } + + public void addReadLock() throws DMLRuntimeException { + if(writeLock) + throw new DMLRuntimeException("Attempting to add a read lock when writeLock="+ writeLock); + else + readLocks.increment(); + } + + public void addWriteLock() throws DMLRuntimeException { + if(readLocks.longValue() > 0) + throw new DMLRuntimeException("Attempting to add a write lock when readLocks="+ readLocks.longValue()); + else if(writeLock) + throw new DMLRuntimeException("Attempting to add a write lock when writeLock="+ writeLock); + else + writeLock = true; + } + + public void releaseReadLock() throws DMLRuntimeException { + readLocks.decrement(); + if(readLocks.longValue() < 0) + throw new DMLRuntimeException("Attempting to release a read lock when readLocks="+ readLocks.longValue()); + } + + public void releaseWriteLock() throws DMLRuntimeException { + if(writeLock) + writeLock = false; + else + throw new DMLRuntimeException("Internal state error : Attempting to release write lock on a GPUObject, which was already released"); + } + + public void resetReadWriteLock() { + readLocks.reset(); + writeLock = false; + } /** * Updates the locks depending on the eviction policy selected * * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ - private void updateReleaseLocks(int l) throws DMLRuntimeException { - int newLocks = (int) locks.addAndGet(l); - if (newLocks < 0) { - throw new CacheException("Internal state error : Invalid number of locks on a GPUObject"); - } - - if(LOG.isTraceEnabled()) { - LOG.trace("GPU : updateReleaseLocks, new number of locks is " + newLocks + ", on " + this + ", GPUContext=" - + getGPUContext()); - } + private void updateReleaseLocks() throws DMLRuntimeException { GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy; switch (evictionPolicy) { case LRU: @@ -730,8 +765,8 @@ public class GPUObject { * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ public void releaseInput() throws DMLRuntimeException { - // A read lock is a positive quantity, therefor when the lock is freed, a negative 1 is added - updateReleaseLocks(-1); + releaseReadLock(); + updateReleaseLocks(); if (!isAllocated()) throw new CacheException("Attempting to release an input before allocating it"); } @@ -742,8 +777,8 @@ public class GPUObject { * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ public void releaseOutput() throws DMLRuntimeException { - // A write lock is a negative quantity, therefore when the lock is freed, a positive number is added - updateReleaseLocks(1); + releaseWriteLock(); + updateReleaseLocks(); dirty = true; if (!isAllocated()) throw new CacheException("Attempting to release an output before allocating it"); @@ -798,7 +833,7 @@ public class GPUObject { cudnnDestroyTensorDescriptor(tensorDescriptor); tensorDescriptor = null; } - locks.set(0); + resetReadWriteLock(); getGPUContext().removeRecordedUsage(this); } @@ -1061,7 +1096,8 @@ public class GPUObject { final StringBuilder sb = new StringBuilder("GPUObject{"); sb.append(", tensorShape=").append(Arrays.toString(tensorShape)); sb.append(", dirty=").append(dirty); - sb.append(", locks=").append(locks); + sb.append(", readLocks=").append(readLocks.longValue()); + sb.append(", writeLock=").append(writeLock); sb.append(", sparse? ").append(isSparse); sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]"); sb.append('}');