Repository: systemml Updated Branches: refs/heads/master beb1a1d19 -> c022f1a5a
[SYSTEMML-1325] Fix a performance bug when using GPU backend with JMLC This commit simplifies the clearTemporaryMemory logic in GPU memory manager and performs aggressive cleanup to reduce the memory pressure. Closes #842. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c022f1a5 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c022f1a5 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c022f1a5 Branch: refs/heads/master Commit: c022f1a5a4479c7cfd380190d20d81b7747c7b92 Parents: beb1a1d Author: Niketan Pansare <npan...@us.ibm.com> Authored: Mon Nov 12 18:50:20 2018 +0530 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Mon Nov 12 18:50:20 2018 +0530 ---------------------------------------------------------------------- .../apache/sysml/api/ScriptExecutorUtils.java | 4 ++- .../instructions/gpu/context/GPUContext.java | 6 ++-- .../gpu/context/GPUMatrixMemoryManager.java | 38 ++++---------------- .../gpu/context/GPUMemoryManager.java | 38 ++++++++++++-------- .../instructions/gpu/context/GPUObject.java | 24 +++++++++++++ 5 files changed, 61 insertions(+), 49 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index e2e76cb..0d072e5 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -287,6 +287,7 @@ public class ScriptExecutorUtils { } finally { // ensure cleanup/shutdown if (ConfigurationManager.isGPU() && !ec.getGPUContexts().isEmpty()) { try { + HashSet<MatrixObject> outputMatrixObjects = new HashSet<>(); // ----------------------------------------------------------------- // The below code pulls the output variables on the GPU to the host. This is required especially when: // The output variable was generated as part of a MLContext session with GPU enabled @@ -302,12 +303,13 @@ public class ScriptExecutorUtils { gpuObj.acquireHostRead(null); } } + outputMatrixObjects.add(((MatrixObject)data)); } } } // ----------------------------------------------------------------- for(GPUContext gCtx : ec.getGPUContexts()) { - gCtx.clearTemporaryMemory(); + gCtx.clearTemporaryMemory(outputMatrixObjects); } } catch (Exception e1) { exceptionThrown = true; http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 3275099..20587b4 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -31,6 +31,8 @@ import static jcuda.runtime.JCuda.cudaGetDeviceCount; import static jcuda.runtime.JCuda.cudaSetDevice; import static jcuda.runtime.JCuda.cudaSetDeviceFlags; +import java.util.HashSet; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript.EvictionPolicy; @@ -424,8 +426,8 @@ public class GPUContext { memoryManager.clearMemory(); } - public void clearTemporaryMemory() { - memoryManager.clearTemporaryMemory(); + public void clearTemporaryMemory(HashSet<MatrixObject> outputMatrixObjects) { + memoryManager.clearTemporaryMemory(outputMatrixObjects); } @Override http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java index f69d340..029af19 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java @@ -20,7 +20,6 @@ package org.apache.sysml.runtime.instructions.gpu.context; import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -46,33 +45,6 @@ public class GPUMatrixMemoryManager { gpuObjects.add(gpuObj); } - /** - * Get list of all Pointers in a GPUObject - * @param gObj gpu object - * @return set of pointers - */ - Set<Pointer> getPointers(GPUObject gObj) { - Set<Pointer> ret = new HashSet<>(); - if(!gObj.isDensePointerNull() && gObj.getSparseMatrixCudaPointer() != null) { - LOG.warn("Matrix allocated in both dense and sparse format"); - } - if(!gObj.isDensePointerNull()) { - // && gObj.evictedDenseArr == null - Ignore evicted array - ret.add(gObj.getDensePointer()); - } - if(gObj.getSparseMatrixCudaPointer() != null) { - CSRPointer sparsePtr = gObj.getSparseMatrixCudaPointer(); - if(sparsePtr != null) { - if(sparsePtr.rowPtr != null) - ret.add(sparsePtr.rowPtr); - else if(sparsePtr.colInd != null) - ret.add(sparsePtr.colInd); - else if(sparsePtr.val != null) - ret.add(sparsePtr.val); - } - } - return ret; - } /** * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU @@ -91,18 +63,22 @@ public class GPUMatrixMemoryManager { Set<GPUObject> getGpuObjects(Set<Pointer> pointers) { Set<GPUObject> gObjs = new HashSet<>(); for (GPUObject g : gpuObjects) { - if (!Collections.disjoint(getPointers(g), pointers)) + if (!Collections.disjoint(g.getPointers(), pointers)) gObjs.add(g); } return gObjs; } + Set<GPUObject> getGpuObjects() { + return gpuObjects; + } + /** * Return all pointers in the first section * @return all pointers in this section */ Set<Pointer> getPointers() { - return gpuObjects.stream().flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet()); + return gpuObjects.stream().flatMap(gObj -> gObj.getPointers().stream()).collect(Collectors.toSet()); } /** @@ -116,7 +92,7 @@ public class GPUMatrixMemoryManager { return gpuObjects.stream().filter( gObj -> (gObj.isLocked() == locked && gObj.isDirty() == dirty) || (gObj.mat.isCleanupEnabled() == isCleanupEnabled)).flatMap( - gObj -> getPointers(gObj).stream()).collect(Collectors.toSet()); + gObj -> gObj.getPointers().stream()).collect(Collectors.toSet()); } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index d403ca7..ce22a7e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -39,6 +39,7 @@ import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.utils.GPUStatistics; @@ -518,21 +519,28 @@ public class GPUMemoryManager { /** * Clears up the memory used by non-dirty pointers. */ - public void clearTemporaryMemory() { - // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up. - Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true, false); - Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyPointers); + public void clearTemporaryMemory(HashSet<MatrixObject> outputMatrixObjects) { + Set<Pointer> donotClearPointers = new HashSet<>(); + // First clean up all GPU objects except: + // 1. Output matrix objects + // 2. GPU objects that are currently being used (i.e. locked) + // 3. Matrix object are + Set<GPUObject> allGPUObjects = new HashSet<>(matrixMemoryManager.getGpuObjects()); + for (GPUObject gpuObj : allGPUObjects) { + boolean isOutput = outputMatrixObjects.contains(gpuObj.mat); + if(!isOutput && !gpuObj.isLocked() && gpuObj.mat.isCleanupEnabled()) { + gpuObj.clearData(null, gpuObj.getGPUContext().EAGER_CUDA_FREE); + } + else { + donotClearPointers.addAll(gpuObj.getPointers()); + } + } + + // Next, cleanup workspace and other temporary pointers + Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), donotClearPointers); for (Pointer tmpPtr : temporaryPointers) { guardedCudaFree(tmpPtr); } - - // Also set the pointer(s) to null in the corresponding GPU objects to avoid double freeing pointers - Set<GPUObject> gObjs = matrixMemoryManager.getGpuObjects(temporaryPointers); - for (GPUObject g : gObjs) { - g.jcudaDenseMatrixPtr = null; - g.jcudaSparseMatrixPtr = null; - removeGPUObject(g); - } } /** @@ -579,18 +587,18 @@ public class GPUMemoryManager { if(gpuObj.isLocked()) { numLockedGPUObjects++; sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice(); - numLockedPointers += matrixMemoryManager.getPointers(gpuObj).size(); + numLockedPointers += gpuObj.getPointers().size(); } else { if(gpuObj.isDirty()) { numUnlockedDirtyGPUObjects++; sizeOfUnlockedDirtyGPUObjects += gpuObj.getSizeOnDevice(); - numUnlockedDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size(); + numUnlockedDirtyPointers += gpuObj.getPointers().size(); } else { numUnlockedNonDirtyGPUObjects++; sizeOfUnlockedNonDirtyGPUObjects += gpuObj.getSizeOnDevice(); - numUnlockedNonDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size(); + numUnlockedNonDirtyPointers += gpuObj.getPointers().size(); } } } http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index edcd683..04af229 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -24,6 +24,8 @@ import static jcuda.runtime.JCuda.cudaMemset; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.LongAdder; @@ -151,6 +153,28 @@ public class GPUObject { jcudaSparseMatrixPtr = null; } + Set<Pointer> getPointers() { + Set<Pointer> ret = new HashSet<>(); + if(!isDensePointerNull() && getSparseMatrixCudaPointer() != null) { + LOG.warn("Matrix allocated in both dense and sparse format"); + } + if(!isDensePointerNull()) { + // && evictedDenseArr == null - Ignore evicted array + ret.add(getDensePointer()); + } + if(getSparseMatrixCudaPointer() != null) { + CSRPointer sparsePtr = getSparseMatrixCudaPointer(); + if(sparsePtr != null) { + if(sparsePtr.rowPtr != null) + ret.add(sparsePtr.rowPtr); + else if(sparsePtr.colInd != null) + ret.add(sparsePtr.colInd); + else if(sparsePtr.val != null) + ret.add(sparsePtr.val); + } + } + return ret; + } /** * Convenience method to directly set the dense matrix pointer on GPU