systemml git commit: [SYSTEMML-1325] Fix a performance bug when using GPU backend with JMLC

niketanpansare Mon, 12 Nov 2018 05:23:08 -0800

Repository: systemml
Updated Branches:
  refs/heads/master beb1a1d19 -> c022f1a5a



[SYSTEMML-1325] Fix a performance bug when using GPU backend with JMLC

This commit simplifies the clearTemporaryMemory logic in GPU memory manager and 
performs aggressive cleanup to reduce the memory pressure.

Closes #842.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c022f1a5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c022f1a5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c022f1a5

Branch: refs/heads/master
Commit: c022f1a5a4479c7cfd380190d20d81b7747c7b92
Parents: beb1a1d
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Mon Nov 12 18:50:20 2018 +0530
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Mon Nov 12 18:50:20 2018 +0530

----------------------------------------------------------------------
 .../apache/sysml/api/ScriptExecutorUtils.java   |  4 ++-
 .../instructions/gpu/context/GPUContext.java    |  6 ++--
 .../gpu/context/GPUMatrixMemoryManager.java     | 38 ++++----------------
 .../gpu/context/GPUMemoryManager.java           | 38 ++++++++++++--------
 .../instructions/gpu/context/GPUObject.java     | 24 +++++++++++++
 5 files changed, 61 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index e2e76cb..0d072e5 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -287,6 +287,7 @@ public class ScriptExecutorUtils {
                } finally { // ensure cleanup/shutdown
                        if (ConfigurationManager.isGPU() && 
!ec.getGPUContexts().isEmpty()) {
                                try {
+                                       HashSet<MatrixObject> 
outputMatrixObjects = new HashSet<>();
                                        // 
-----------------------------------------------------------------
                                        // The below code pulls the output 
variables on the GPU to the host. This is required especially when:
                                        // The output variable was generated as 
part of a MLContext session with GPU enabled
@@ -302,12 +303,13 @@ public class ScriptExecutorUtils {
                                                                                
gpuObj.acquireHostRead(null);
                                                                        }
                                                                }
+                                                               
outputMatrixObjects.add(((MatrixObject)data));
                                                        }
                                                }
                                        }
                                        // 
-----------------------------------------------------------------
                                        for(GPUContext gCtx : 
ec.getGPUContexts()) {
-                                               gCtx.clearTemporaryMemory();
+                                               
gCtx.clearTemporaryMemory(outputMatrixObjects);
                                        }
                                } catch (Exception e1) {
                                        exceptionThrown = true;

http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 3275099..20587b4 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -31,6 +31,8 @@ import static jcuda.runtime.JCuda.cudaGetDeviceCount;
 import static jcuda.runtime.JCuda.cudaSetDevice;
 import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
 
+import java.util.HashSet;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript.EvictionPolicy;
@@ -424,8 +426,8 @@ public class GPUContext {
                memoryManager.clearMemory();
        }
        
-       public void clearTemporaryMemory() {
-               memoryManager.clearTemporaryMemory();
+       public void clearTemporaryMemory(HashSet<MatrixObject> 
outputMatrixObjects) {
+               memoryManager.clearTemporaryMemory(outputMatrixObjects);
        }
 
        @Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
index f69d340..029af19 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -20,7 +20,6 @@ package org.apache.sysml.runtime.instructions.gpu.context;
 
 import java.util.Collections;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -46,33 +45,6 @@ public class GPUMatrixMemoryManager {
                gpuObjects.add(gpuObj);
        }
 
-       /**
-        * Get list of all Pointers in a GPUObject 
-        * @param gObj gpu object 
-        * @return set of pointers
-        */
-       Set<Pointer> getPointers(GPUObject gObj) {
-               Set<Pointer> ret = new HashSet<>();
-               if(!gObj.isDensePointerNull() && 
gObj.getSparseMatrixCudaPointer() != null) {
-                       LOG.warn("Matrix allocated in both dense and sparse 
format");
-               }
-               if(!gObj.isDensePointerNull()) {
-                       // && gObj.evictedDenseArr == null - Ignore evicted 
array
-                       ret.add(gObj.getDensePointer());
-               }
-               if(gObj.getSparseMatrixCudaPointer() != null) {
-                       CSRPointer sparsePtr = 
gObj.getSparseMatrixCudaPointer();
-                       if(sparsePtr != null) {
-                               if(sparsePtr.rowPtr != null)
-                                       ret.add(sparsePtr.rowPtr);
-                               else if(sparsePtr.colInd != null)
-                                       ret.add(sparsePtr.colInd);
-                               else if(sparsePtr.val != null)
-                                       ret.add(sparsePtr.val);
-                       }
-               }
-               return ret;
-       }
        
        /**
         * list of allocated {@link GPUObject} instances allocated on {@link 
GPUContext#deviceNum} GPU
@@ -91,18 +63,22 @@ public class GPUMatrixMemoryManager {
        Set<GPUObject> getGpuObjects(Set<Pointer> pointers) {
                Set<GPUObject> gObjs = new HashSet<>();
                for (GPUObject g : gpuObjects) {
-                       if (!Collections.disjoint(getPointers(g), pointers))
+                       if (!Collections.disjoint(g.getPointers(), pointers))
                                gObjs.add(g);
                }
                return gObjs;
        }
        
+       Set<GPUObject> getGpuObjects() {
+               return gpuObjects;
+       }
+       
        /**
         * Return all pointers in the first section
         * @return all pointers in this section
         */
        Set<Pointer> getPointers() {
-               return gpuObjects.stream().flatMap(gObj -> 
getPointers(gObj).stream()).collect(Collectors.toSet());
+               return gpuObjects.stream().flatMap(gObj -> 
gObj.getPointers().stream()).collect(Collectors.toSet());
        }
        
        /**
@@ -116,7 +92,7 @@ public class GPUMatrixMemoryManager {
                return gpuObjects.stream().filter(
                                gObj -> (gObj.isLocked() == locked && 
gObj.isDirty() == dirty) ||
                                                (gObj.mat.isCleanupEnabled() == 
isCleanupEnabled)).flatMap(
-                                               gObj -> 
getPointers(gObj).stream()).collect(Collectors.toSet());
+                                               gObj -> 
gObj.getPointers().stream()).collect(Collectors.toSet());
        }
        
        /**

http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index d403ca7..ce22a7e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -39,6 +39,7 @@ import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
 
@@ -518,21 +519,28 @@ public class GPUMemoryManager {
        /**
         * Clears up the memory used by non-dirty pointers.
         */
-       public void clearTemporaryMemory() {
-               // To record the cuda block sizes needed by 
allocatedGPUObjects, others are cleared up.
-               Set<Pointer> unlockedDirtyPointers = 
matrixMemoryManager.getPointers(false, true, false);
-               Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), 
unlockedDirtyPointers);
+       public void clearTemporaryMemory(HashSet<MatrixObject> 
outputMatrixObjects) {
+               Set<Pointer> donotClearPointers =  new HashSet<>();
+               // First clean up all GPU objects except:
+               // 1. Output matrix objects
+               // 2. GPU objects that are currently being used (i.e. locked)
+               // 3. Matrix object are 
+               Set<GPUObject> allGPUObjects = new 
HashSet<>(matrixMemoryManager.getGpuObjects());
+               for (GPUObject gpuObj : allGPUObjects) {
+                       boolean isOutput = 
outputMatrixObjects.contains(gpuObj.mat);
+                       if(!isOutput && !gpuObj.isLocked() && 
gpuObj.mat.isCleanupEnabled()) {
+                               gpuObj.clearData(null, 
gpuObj.getGPUContext().EAGER_CUDA_FREE);
+                       }
+                       else {
+                               donotClearPointers.addAll(gpuObj.getPointers());
+                       }
+               }
+               
+               // Next, cleanup workspace and other temporary pointers
+               Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), 
donotClearPointers);
                for (Pointer tmpPtr : temporaryPointers) {
                        guardedCudaFree(tmpPtr);
                }
-
-               // Also set the pointer(s) to null in the corresponding GPU 
objects to avoid double freeing pointers
-               Set<GPUObject> gObjs = 
matrixMemoryManager.getGpuObjects(temporaryPointers);
-               for (GPUObject g : gObjs) {
-                       g.jcudaDenseMatrixPtr = null;
-                       g.jcudaSparseMatrixPtr = null;
-                       removeGPUObject(g);
-               }
        }
        
        /**
@@ -579,18 +587,18 @@ public class GPUMemoryManager {
                        if(gpuObj.isLocked()) {
                                numLockedGPUObjects++;
                                sizeOfLockedGPUObjects += 
gpuObj.getSizeOnDevice();
-                               numLockedPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
+                               numLockedPointers += 
gpuObj.getPointers().size();
                        }
                        else {
                                if(gpuObj.isDirty()) {
                                        numUnlockedDirtyGPUObjects++;
                                        sizeOfUnlockedDirtyGPUObjects += 
gpuObj.getSizeOnDevice();
-                                       numUnlockedDirtyPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
+                                       numUnlockedDirtyPointers += 
gpuObj.getPointers().size();
                                }
                                else {
                                        numUnlockedNonDirtyGPUObjects++;
                                        sizeOfUnlockedNonDirtyGPUObjects += 
gpuObj.getSizeOnDevice();
-                                       numUnlockedNonDirtyPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
+                                       numUnlockedNonDirtyPointers += 
gpuObj.getPointers().size();
                                }
                        }
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/c022f1a5/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index edcd683..04af229 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -24,6 +24,8 @@ import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 
+import java.util.HashSet;
+import java.util.Set;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -151,6 +153,28 @@ public class GPUObject {
                jcudaSparseMatrixPtr = null;
        }
        
+       Set<Pointer> getPointers() {
+               Set<Pointer> ret = new HashSet<>();
+               if(!isDensePointerNull() && getSparseMatrixCudaPointer() != 
null) {
+                       LOG.warn("Matrix allocated in both dense and sparse 
format");
+               }
+               if(!isDensePointerNull()) {
+                       // && evictedDenseArr == null - Ignore evicted array
+                       ret.add(getDensePointer());
+               }
+               if(getSparseMatrixCudaPointer() != null) {
+                       CSRPointer sparsePtr = getSparseMatrixCudaPointer();
+                       if(sparsePtr != null) {
+                               if(sparsePtr.rowPtr != null)
+                                       ret.add(sparsePtr.rowPtr);
+                               else if(sparsePtr.colInd != null)
+                                       ret.add(sparsePtr.colInd);
+                               else if(sparsePtr.val != null)
+                                       ret.add(sparsePtr.val);
+                       }
+               }
+               return ret;
+       }
        
        /**
         * Convenience method to directly set the dense matrix pointer on GPU

systemml git commit: [SYSTEMML-1325] Fix a performance bug when using GPU backend with JMLC

Reply via email to