[systemds] branch main updated: [SYSTEMDS-3510] Recycling pointers from GPU lineage cache

arnabp20 Sun, 26 Mar 2023 07:39:15 -0700

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 3c0fe25e8f [SYSTEMDS-3510] Recycling pointers from GPU lineage cache
3c0fe25e8f is described below

commit 3c0fe25e8fce49ecd091e92070e605e42b01826c
Author: Arnab Phani <[email protected]>
AuthorDate: Sun Mar 26 16:38:11 2023 +0200

    [SYSTEMDS-3510] Recycling pointers from GPU lineage cache
    
    This patch reworks the GPU cache eviction (deletion). The free pointer
    cache stays empty when the lineage cache is enabled. This patch allows
    recycling the cached pointers. We maintain two lists in the cache, one
    for live pointers and one for free pointers. When the GPU memory is
    full, we poll the first entry from the free list (a weighted queue) and
    recycle the pointer if its size matches the requested size; otherwise,
    we deallocate cached pointers until enough space is available.
    
    Closes #1796
---
 .../controlprogram/caching/CacheableData.java      |   3 -
 .../controlprogram/context/ExecutionContext.java   |   6 +
 .../instructions/gpu/context/GPUContext.java       |   9 --
 .../gpu/context/GPUMatrixMemoryManager.java        |   7 +-
 .../gpu/context/GPUMemoryEviction.java             |   4 +-
 .../instructions/gpu/context/GPUMemoryManager.java | 122 ++++++---------------
 .../instructions/gpu/context/GPUObject.java        |  59 ++--------
 .../apache/sysds/runtime/lineage/LineageCache.java |  28 +++--
 .../sysds/runtime/lineage/LineageCacheEntry.java   |  27 ++---
 .../runtime/lineage/LineageGPUCacheEviction.java   |  79 +++++++++++--
 .../lineage/GPULineageCacheEvictionTest.java       |   2 +-
 11 files changed, 158 insertions(+), 188 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
index 59c9be2639..6fd2c605ed 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
@@ -790,9 +790,6 @@ public abstract class CacheableData<T extends 
CacheBlock<?>> extends Data
                        for (GPUObject gObj : _gpuObjects.values())
                                if (gObj != null) {
                                        gObj.clearData(null, 
DMLScript.EAGER_CUDA_FREE);
-                                       if (gObj.isLinCached())
-                                               // set rmVarPending which helps 
detecting liveness
-                                               gObj.setrmVarPending(true);
                                }
                }
                
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
index 278f23c765..cdedfb9e45 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
@@ -51,7 +51,9 @@ import 
org.apache.sysds.runtime.instructions.gpu.context.CSRPointer;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.Lineage;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig;
 import org.apache.sysds.runtime.lineage.LineageDebugger;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.runtime.lineage.LineageItem;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.data.Pair;
@@ -183,8 +185,12 @@ public class ExecutionContext {
         */
        public void setGPUContexts(List<GPUContext> gpuContexts){
                _gpuContexts = gpuContexts;
+               // Set the single-GPU context in the lineage cache
+               if (!LineageCacheConfig.ReuseCacheType.isNone())
+                       
LineageGPUCacheEviction.setGPUContext(gpuContexts.get(0));
        }
 
+
        /**
         * Gets the list of GPUContexts
         * @return a list of GPUContexts
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
index 5ad9bc52a3..3ffbedea3f 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUContext.java
@@ -283,15 +283,6 @@ public class GPUContext {
                GPUObject ret = new GPUObject(this, source, mo);
                
getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret);
 
-               // Maintain the linked list of GPUObjects that point to same 
memory region
-               if (!LineageCacheConfig.ReuseCacheType.isNone()) {
-                       if (source.lineageCachedChainHead == null)
-                               source.lineageCachedChainHead = source;
-                       if (source.nextLineageCachedEntry != null)
-                               ret.nextLineageCachedEntry = 
source.nextLineageCachedEntry;
-                       source.nextLineageCachedEntry = ret;
-                       ret.lineageCachedChainHead = source;
-               }
                return ret;
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
index 23c1be9c0e..4a2f2a5cc4 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java
@@ -120,12 +120,11 @@ public class GPUMatrixMemoryManager {
         * Get pointers from the first memory sections "Matrix Memory"
         * @param locked return locked pointers if true
         * @param dirty return dirty pointers if true
-        * @param lineageCached return cached pointer if true
         * @return set of pointers
         */
-       Set<Pointer> getPointers(boolean locked, boolean dirty, boolean 
lineageCached) {
+       Set<Pointer> getPointers(boolean locked, boolean dirty) {
                return gpuObjects.stream()
-                       .filter(gObj -> gObj.isLocked() == locked && 
gObj.isDirty() == dirty || gObj.isLinCached() == lineageCached)
+                       .filter(gObj -> gObj.isLocked() == locked && 
gObj.isDirty() == dirty)
                        .flatMap(gObj -> 
getPointers(gObj).stream()).collect(Collectors.toSet());
        }
        
@@ -137,7 +136,7 @@ public class GPUMatrixMemoryManager {
         */
        void clearAllUnlocked(String opcode) throws DMLRuntimeException {
                Set<GPUObject> unlockedGPUObjects = gpuObjects.stream()
-                               .filter(gpuObj -> !gpuObj.isLocked() && 
!gpuObj.isLinCached()).collect(Collectors.toSet());
+                               .filter(gpuObj -> 
!gpuObj.isLocked()).collect(Collectors.toSet());
                if(unlockedGPUObjects.size() > 0) {
                        if(LOG.isWarnEnabled())
                                LOG.warn("Clearing all unlocked matrices 
(count=" + unlockedGPUObjects.size() + ").");
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
index 026449722d..3676d337ab 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -50,7 +50,7 @@ public class GPUMemoryEviction implements Runnable
                // Stop if 1) Evicted the request number of entries, 2) The 
parallel
                // CPU instruction is ended, and 3) No non-live entries left in 
the cache.
                long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
-               while (!LineageGPUCacheEviction.isGPUCacheEmpty()) 
+               /*while (!LineageGPUCacheEviction.isGPUCacheEmpty())
                {
                        if (LineageCacheConfig.STOPBACKGROUNDEVICTION)
                                // This logic reduces #evictions if the cpu 
instructions is so small
@@ -135,7 +135,7 @@ public class GPUMemoryEviction implements Runnable
                                
LineageCacheStatistics.incrementGpuAsyncEvicts();
                        }
                        count++;
-               }
+               }*/
 
                // Add the locked entries back to the eviction queue
                if (!lockedOrLiveEntries.isEmpty())
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 4aed4a7943..4b0a67cbb4 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -89,6 +89,7 @@ public class GPUMemoryManager {
        private Set<Pointer> getNonMatrixLockedPointers() {
                Set<Pointer> managedPointers = 
matrixMemoryManager.getPointers();
                
managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
+               
managedPointers.addAll(LineageGPUCacheEviction.getAllCachedPointers());
                return nonIn(allPointers.keySet(), managedPointers);
        }
        
@@ -298,80 +299,35 @@ public class GPUMemoryManager {
                        long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
                        while (A == null && 
!LineageGPUCacheEviction.isGPUCacheEmpty()) {
                                LineageCacheEntry le = 
LineageGPUCacheEviction.pollFirstEntry();
-                               GPUObject cachedGpuObj = le.getGPUObject();
-                               GPUObject headGpuObj = 
cachedGpuObj.lineageCachedChainHead != null
-                                               ? 
cachedGpuObj.lineageCachedChainHead : cachedGpuObj;
-                               // Check and continue if any object in the 
linked list is locked
-                               boolean locked = false;
-                               GPUObject nextgpuObj = headGpuObj;
-                               while (nextgpuObj!= null) {
-                                       if (nextgpuObj.isLocked())
-                                               locked = true;
-                                       nextgpuObj = 
nextgpuObj.nextLineageCachedEntry;
-                               }
-                               if (locked) {
-                                       lockedAndLiveList.add(le);
-                                       continue;
-                               }
 
                                // First remove the gpuobj chains that don't 
contain any live and dirty objects.
-                               // Continue if any object is live
-                               boolean copied = false;
-                               boolean live = false;
-                               nextgpuObj = headGpuObj;
-                               while (nextgpuObj!= null) {
-                                       // Keeping isLinCached as True here 
will save data deletion by copyFromDeviceToHost
-                                       if (!nextgpuObj.isrmVarPending()) { 
//live
-                                               
//nextgpuObj.copyFromDeviceToHost(opcode, true, true);
-                                               //copied = true;
-                                               live = true;
-                                       }
-                                       //nextgpuObj.setIsLinCached(false);
-                                       nextgpuObj = 
nextgpuObj.nextLineageCachedEntry;
-                               }
-                               if (live) {
+                               // TODO: Handle dirty objects separately. Copy 
them back to the host
+
+                               // Check and continue if the pointer is live
+                               // Note: all locked entries are live
+                               Pointer ptr = le.getGPUPointer();
+                               if 
(LineageGPUCacheEviction.probeLiveCachedPointers(ptr)) {
                                        lockedAndLiveList.add(le);
                                        continue;
                                }
-                               // TODO: Handle dirty objects separately. Copy 
them back to the host
-
-                               currentAvailableMemory += 
headGpuObj.getSizeOnDevice();
+                               currentAvailableMemory += 
getSizeAllocatedGPUPointer(ptr);
 
-                               if (!LineageCacheConfig.GPU2HOSTEVICTION)
-                                       
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, copied);
-                               else {
+                               if (!LineageCacheConfig.GPU2HOSTEVICTION) {
+                                       
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, false);
+                                       // Recycle the pointer if matches the 
required size
+                                       if (getSizeAllocatedGPUPointer(ptr) == 
size) {
+                                               A = ptr;
+                                               continue;
+                                       }
+                                       else
+                                               free(opcode, ptr, true);
+                               }
+                               /*else {
                                        // Copy from device cache to CPU 
lineage cache if not already copied
                                        
LineageGPUCacheEviction.copyToHostCache(le, opcode, copied);
                                        if(DMLScript.STATISTICS)
                                                
LineageCacheStatistics.incrementGpuSyncEvicts();
-                               }
-
-                               // For all the other objects, remove and clear 
data (only once)
-                               nextgpuObj = headGpuObj;
-                               boolean freed = false;
-                               while (nextgpuObj!= null) {
-                                       // If not live or live but not dirty
-                                       if (nextgpuObj.isrmVarPending() || 
!nextgpuObj.isDirty()) {
-                                               if (!freed) {
-                                                       
nextgpuObj.setIsLinCached(false);
-                                                       
nextgpuObj.clearData(opcode, true);
-                                                       freed = true;
-                                               }
-                                               else
-                                                       
nextgpuObj.clearGPUObject();
-                                       }
-                                       nextgpuObj = 
nextgpuObj.nextLineageCachedEntry;
-                               }
-
-                               // Clear the GPUOjects chain
-                               GPUObject currgpuObj = headGpuObj;
-                               while (currgpuObj.nextLineageCachedEntry != 
null) {
-                                       nextgpuObj = 
currgpuObj.nextLineageCachedEntry;
-                                       currgpuObj.lineageCachedChainHead = 
null;
-                                       currgpuObj.nextLineageCachedEntry = 
null;
-                                       nextgpuObj.lineageCachedChainHead = 
null;
-                                       currgpuObj = nextgpuObj;
-                               }
+                               }*/
 
                                if(currentAvailableMemory >= size)
                                        // This doesn't guarantee allocation 
due to fragmented freed memory
@@ -393,7 +349,7 @@ public class GPUMemoryManager {
                        long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
                        synchronized (matrixMemoryManager.gpuObjects) {
                                Optional<GPUObject> sizeBasedUnlockedGPUObjects 
= matrixMemoryManager.gpuObjects.stream()
-                                       .filter(gpuObj -> !gpuObj.isLocked() && 
!gpuObj.isLinCached()
+                                       .filter(gpuObj -> !gpuObj.isLocked()
                                                && 
matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
                                        .min((o1, o2) -> 
worstCaseContiguousMemorySizeCompare(o1, o2));
                                if(sizeBasedUnlockedGPUObjects.isPresent()) {
@@ -421,7 +377,7 @@ public class GPUMemoryManager {
                        // Evict unlocked GPU objects one-by-one and try malloc
                        synchronized(matrixMemoryManager.gpuObjects) {
                                List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
-                                               .filter(gpuObj -> 
!gpuObj.isLocked() && !gpuObj.isLinCached()).collect(Collectors.toList());
+                                               .filter(gpuObj -> 
!gpuObj.isLocked()).collect(Collectors.toList());
                                Collections.sort(unlockedGPUObjects, new 
EvictionPolicyBasedComparator(size));
                                while(A == null && unlockedGPUObjects.size() > 
0) {
                                        GPUObject evictedGPUObject = 
unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
@@ -511,7 +467,7 @@ public class GPUMemoryManager {
         * 
         * @param toFree pointer to call cudaFree method on
         */
-       void guardedCudaFree(Pointer toFree) {
+       public void guardedCudaFree(Pointer toFree) {
                synchronized(allPointers) {
                        if(allPointers.containsKey(toFree)) {
                                long size = 
allPointers.get(toFree).getSizeInBytes();
@@ -541,6 +497,11 @@ public class GPUMemoryManager {
         * @throws DMLRuntimeException if error occurs
         */
        public void free(String opcode, Pointer toFree, boolean eager) throws 
DMLRuntimeException {
+               if (!LineageCacheConfig.ReuseCacheType.isNone()
+                       && 
LineageGPUCacheEviction.probeLiveCachedPointers(toFree)) {
+                       LineageGPUCacheEviction.decrementLiveCount(toFree);
+                       return;
+               }
                if(LOG.isTraceEnabled())
                        LOG.trace("Free-ing the pointer with eager=" + eager);
                if (eager) {
@@ -626,7 +587,7 @@ public class GPUMemoryManager {
         */
        public void clearTemporaryMemory() {
                // To record the cuda block sizes needed by 
allocatedGPUObjects, others are cleared up.
-               Set<Pointer> unlockedDirtyOrCachedPointers = 
matrixMemoryManager.getPointers(false, true, true);
+               Set<Pointer> unlockedDirtyOrCachedPointers = 
matrixMemoryManager.getPointers(false, true);
                Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), 
unlockedDirtyOrCachedPointers);
                for(Pointer tmpPtr : temporaryPointers) {
                        guardedCudaFree(tmpPtr);
@@ -659,18 +620,11 @@ public class GPUMemoryManager {
                long sizeOfLockedGPUObjects = 0; int numLockedGPUObjects = 0; 
int numLockedPointers = 0;
                long sizeOfUnlockedDirtyGPUObjects = 0; int 
numUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyPointers = 0;
                long sizeOfUnlockedNonDirtyGPUObjects = 0; int 
numUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyPointers = 0;
-               long sizeOfLockedCachedGPUObjects = 0; int 
numLockedCachedGPUObjects = 0; int numLockedCachedPointers = 0;
-               long sizeOfUnlockedCachedGPUObjects = 0; int 
numUnlockedCachedGPUObjects = 0; int numUnlockedCachedPointers = 0;
                for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) {
                        if(gpuObj.isLocked()) {
                                numLockedGPUObjects++;
                                sizeOfLockedGPUObjects += 
gpuObj.getSizeOnDevice();
                                numLockedPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
-                               if (gpuObj.isLinCached()) {
-                                       numLockedCachedGPUObjects++;
-                                       sizeOfLockedCachedGPUObjects += 
gpuObj.getSizeOnDevice();
-                                       numLockedCachedPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
-                               }
                        }
                        else {
                                if(gpuObj.isDirty()) {
@@ -683,11 +637,6 @@ public class GPUMemoryManager {
                                        sizeOfUnlockedNonDirtyGPUObjects += 
gpuObj.getSizeOnDevice();
                                        numUnlockedNonDirtyPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
                                }
-                               if (gpuObj.isLinCached()) {
-                                       numUnlockedCachedGPUObjects++;
-                                       sizeOfUnlockedCachedGPUObjects += 
gpuObj.getSizeOnDevice();
-                                       numUnlockedCachedPointers += 
matrixMemoryManager.getPointers(gpuObj).size();
-                               }
                        }
                }
                
@@ -696,7 +645,10 @@ public class GPUMemoryManager {
                for(PointerInfo ptrInfo : allPointers.values()) {
                        totalMemoryAllocated += ptrInfo.getSizeInBytes();
                }
-               
+
+               int numCachedPointers = 
LineageGPUCacheEviction.numPointersCached();
+               long totalMemoryCached = 
LineageGPUCacheEviction.totalMemoryCached();
+
                
                Set<Pointer> potentiallyLeakyPointers = 
getNonMatrixLockedPointers();
                List<Long> sizePotentiallyLeakyPointers = 
potentiallyLeakyPointers.stream().
@@ -719,11 +671,9 @@ public class GPUMemoryManager {
                                numUnlockedNonDirtyGPUObjects, 
numUnlockedNonDirtyPointers, 
byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects)));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU 
objects", 
                                numLockedGPUObjects, numLockedPointers, 
byteCountToDisplaySize(sizeOfLockedGPUObjects)));
-               ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked 
Cached GPU objects", 
-                               numLockedCachedGPUObjects, 
numLockedCachedPointers, byteCountToDisplaySize(sizeOfLockedCachedGPUObjects)));
-               ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked 
Cached GPU objects", 
-                               numUnlockedCachedGPUObjects, 
numUnlockedCachedPointers, 
byteCountToDisplaySize(sizeOfUnlockedCachedGPUObjects)));
-               ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached 
rmvar-ed pointers", 
+               ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All Cached 
Pointers",
+                       "-", numCachedPointers, 
byteCountToDisplaySize(totalMemoryCached)));
+               ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached 
rmvar-ed pointers",
                                "-", 
lazyCudaFreeMemoryManager.getNumPointers(), 
byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated())));
                ret.append(String.format("%-35s%-15s%-15s%-15s\n", 
"Non-matrix/non-cached pointers", 
                                "-", potentiallyLeakyPointers.size(), 
byteCountToDisplaySize(totalSizePotentiallyLeakyPointers)));
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 043243f5ae..6dccee983d 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -102,27 +102,6 @@ public class GPUObject {
         */
        final ShadowBuffer shadowBuffer;
        
-       /**
-        * whether cached in lineage cache
-        */
-       private boolean isLineageCached = false;
-       
-       /**
-        * whether remove variable is called on this object.
-        * True -> live; False -> not-live
-        */
-       private boolean rmVarPending = false;
-       
-       /**
-        * Next GPUObject that points to the same lineage cached GPU pointer
-        */
-       public GPUObject lineageCachedChainHead = null;
-       
-       /**
-        * Head of the linked list of GPUObjects that point to the same lineage 
cached GPU pointer
-        */
-       public GPUObject nextLineageCachedEntry = null;
-       
        // 
----------------------------------------------------------------------
        // Methods used to access, set and check jcudaDenseMatrixPtr
        
@@ -466,13 +445,20 @@ public class GPUObject {
                this.shadowBuffer = new ShadowBuffer(this);
        }
 
+       public GPUObject(GPUContext gCtx, MatrixObject mat, Pointer ptr) {
+               gpuContext = gCtx;
+               this.mat = mat;
+               setDensePointer(ptr);
+               isSparse = false;
+               this.shadowBuffer = new ShadowBuffer(this);
+       }
+
        public GPUObject(GPUContext gCtx, GPUObject that, MatrixObject mat) {
                dirty = that.dirty;
                readLocks.reset();
                writeLock = false;
                timestamp = new AtomicLong(that.timestamp.get());
                isSparse = that.isSparse;
-               isLineageCached = that.isLineageCached;
                if (!that.isDensePointerNull())
                        setDensePointer(that.getDensePointer());
                if (that.getJcudaSparseMatrixPtr() != null)
@@ -991,7 +977,7 @@ public class GPUObject {
                        tmp.allocateDenseBlock();
                        
LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(),
                                                getDensePointer(), 
tmp.getDenseBlockValues(), instName, isEviction);
-                       if(eagerDelete && !isLinCached())
+                       if(eagerDelete)
                                clearData(instName, true);
                        tmp.recomputeNonZeros();
                } else {
@@ -1003,7 +989,7 @@ public class GPUObject {
                        int[] rowPtr = new int[rows + 1];
                        int[] colInd = new int[nnz];
                        CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), 
rows, nnz, rowPtr, colInd);
-                       if(eagerDelete && !isLinCached())
+                       if(eagerDelete)
                                clearData(instName, true);
                        SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, 
colInd, values, nnz);
                        tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
@@ -1064,11 +1050,6 @@ public class GPUObject {
         * @throws DMLRuntimeException if error occurs
         */
        synchronized public void clearData(String opcode, boolean eager) throws 
DMLRuntimeException {
-               if (isLineageCached) {
-                       setDirty(false);
-                       return;
-               }
-
                if(LOG.isTraceEnabled()) {
                        LOG.trace("GPU : clearData on " + this + ", 
GPUContext=" + getGPUContext());
                }
@@ -1082,13 +1063,10 @@ public class GPUObject {
                shadowBuffer.clearShadowPointer();
                jcudaSparseMatrixPtr = null;
                resetReadWriteLock();
-               setrmVarPending(false);
                getGPUContext().getMemoryManager().removeGPUObject(this);
        }
        
        public void clearGPUObject() {
-               if (isLineageCached)
-                       return;
                if(LOG.isTraceEnabled())
                        LOG.trace("GPU : clearData on " + this + ", 
GPUContext=" + getGPUContext());
 
@@ -1096,7 +1074,6 @@ public class GPUObject {
                shadowBuffer.clearShadowPointer();
                jcudaSparseMatrixPtr = null;
                resetReadWriteLock();
-               setrmVarPending(false);
                getGPUContext().getMemoryManager().removeGPUObject(this);
        }
 
@@ -1118,22 +1095,6 @@ public class GPUObject {
                return dirty;
        }
        
-       public void setIsLinCached(boolean val) {
-               isLineageCached = val;
-       }
-
-       public boolean isLinCached() {
-               return isLineageCached;
-       }
-       
-       public void setrmVarPending(boolean val) {
-               rmVarPending = val;
-       }
-       
-       public boolean isrmVarPending() {
-               return rmVarPending;
-       }
-
        @Override
        public String toString() {
                final StringBuilder sb = new StringBuilder("GPUObject{");
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index fb4f579986..c38132a5a3 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysds.runtime.lineage;
 
+import jcuda.Pointer;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.tuple.MutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -151,10 +152,14 @@ public class LineageCache
                                        }
                                        else { //TODO handle locks on gpu 
objects
                                                //shallow copy the cached 
GPUObj to the output MatrixObject
-                                               
ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), 
-                                                               
ec.getGPUContext(0).shallowCopyGPUObject(e._gpuObject, 
ec.getMatrixObject(outName)));
+                                               //Create a GPUObject with the 
cached pointer
+                                               GPUObject gpuObj = new 
GPUObject(ec.getGPUContext(0),
+                                                       
ec.getMatrixObject(outName), e.getGPUPointer());
+                                               
ec.getMatrixObject(outName).setGPUObject(ec.getGPUContext(0), gpuObj);
                                                //Set dirty to true, so that it 
is later copied to the host for write
                                                
ec.getMatrixObject(outName).getGPUObject(ec.getGPUContext(0)).setDirty(true);
+                                               //Increment the live count for 
this pointer
+                                               
LineageGPUCacheEviction.incrementLiveCount(e.getGPUPointer());
                                        }
                                }
                                maintainReuseStatistics(ec, inst, 
liList.get(0).getValue());
@@ -474,7 +479,7 @@ public class LineageCache
                if (LineageCacheConfig.isReusable(inst, ec) ) {
                        //if (!isMarkedForCaching(inst, ec)) return;
                        List<Pair<LineageItem, Data>> liData = null;
-                       GPUObject liGpuObj = null;
+                       Pointer gpuPtr = null;
                        LineageItem instLI = ((LineageTraceable) 
inst).getLineageItem(ec).getValue();
                        if (inst instanceof MultiReturnBuiltinCPInstruction) {
                                liData = new ArrayList<>();
@@ -489,12 +494,13 @@ public class LineageCache
                        else if (inst instanceof GPUInstruction) {
                                // TODO: gpu multiretrun instructions
                                Data gpudata = ec.getVariable(((GPUInstruction) 
inst)._output);
-                               liGpuObj = gpudata instanceof MatrixObject ? 
-                                               
ec.getMatrixObject(((GPUInstruction)inst)._output).getGPUObject(ec.getGPUContext(0))
 : null;
+                               gpuPtr = gpudata instanceof MatrixObject ?
+                                               
ec.getMatrixObject(((GPUInstruction)inst)._output).
+                                                       
getGPUObject(ec.getGPUContext(0)).getDensePointer() : null;
 
                                // Scalar gpu intermediates is always copied 
back to host. 
                                // No need to cache the GPUobj for scalar 
intermediates.
-                               if (liGpuObj == null)
+                               if (gpuPtr == null)
                                        liData = Arrays.asList(Pair.of(instLI, 
ec.getVariable(((GPUInstruction)inst)._output)));
                        }
                        else if (inst instanceof ComputationSPInstruction
@@ -511,10 +517,10 @@ public class LineageCache
                                else if (inst instanceof 
ComputationSPInstruction) //collects or prefetches
                                        liData = Arrays.asList(Pair.of(instLI, 
ec.getVariable(((ComputationSPInstruction) inst).output)));
 
-                       if (liGpuObj == null)
+                       if (gpuPtr == null)
                                putValueCPU(inst, liData, computetime);
                        else
-                               putValueGPU(liGpuObj, instLI, computetime);
+                               putValueGPU(gpuPtr, instLI, computetime);
                }
        }
        
@@ -588,14 +594,14 @@ public class LineageCache
                }
        }
        
-       private static void putValueGPU(GPUObject gpuObj, LineageItem instLI, 
long computetime) {
+       private static void putValueGPU(Pointer gpuPtr, LineageItem instLI, 
long computetime) {
                synchronized( _cache ) {
                        LineageCacheEntry centry = _cache.get(instLI);
                        // Update the total size of lineage cached gpu objects
                        // The eviction is handled by the unified gpu memory 
manager
-                       
LineageGPUCacheEviction.updateSize(gpuObj.getSizeOnDevice(), true);
+                       
LineageGPUCacheEviction.updateSize(LineageGPUCacheEviction.getPointerSize(gpuPtr),
 true);
                        // Set the GPUOject in the cache
-                       centry.setGPUValue(gpuObj, computetime);
+                       centry.setGPUValue(gpuPtr, computetime);
                        // Maintain order for eviction
                        LineageGPUCacheEviction.addEntry(centry);
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index 8efe57a162..e0b595e29f 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -21,6 +21,7 @@ package org.apache.sysds.runtime.lineage;
 
 import java.util.Map;
 
+import jcuda.Pointer;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
@@ -42,7 +43,8 @@ public class LineageCacheEntry {
        protected LineageItem _origItem;
        private String _outfile = null;
        protected double score;
-       protected GPUObject _gpuObject;
+       protected Pointer _gpuPointer;
+
        protected RDDObject _rddObject;
        
        public LineageCacheEntry(LineageItem key, DataType dt, MatrixBlock 
Mval, ScalarObject Sval, long computetime) {
@@ -55,7 +57,7 @@ public class LineageCacheEntry {
                _nextEntry = null;
                _origItem = null;
                _outfile = null;
-               _gpuObject = null;
+               _gpuPointer = null;
        }
        
        protected synchronized void setCacheStatus(LineageCacheStatus st) {
@@ -140,13 +142,13 @@ public class LineageCacheEntry {
                        size += _MBval.getInMemorySize();
                if (_SOval != null)
                        size += _SOval.getSize();
-               if (_gpuObject != null)
-                       size += _gpuObject.getSizeOnDevice();
+               if (_gpuPointer!= null)
+                       size += 
LineageGPUCacheEviction.getPointerSize(_gpuPointer);
                return size;
        }
        
        public boolean isNullVal() {
-               return(_MBval == null && _SOval == null && _gpuObject == null 
&& _serialBytes == null && _rddObject == null);
+               return(_MBval == null && _SOval == null && _gpuPointer == null 
&& _serialBytes == null && _rddObject == null);
        }
        
        public boolean isMatrixValue() {
@@ -162,7 +164,7 @@ public class LineageCacheEntry {
        }
 
        public boolean isGPUObject() {
-               return _gpuObject != null;
+               return _gpuPointer != null;
        }
 
        public boolean isSerializedBytes() {
@@ -171,7 +173,7 @@ public class LineageCacheEntry {
 
        public synchronized void setValue(MatrixBlock val, long computetime) {
                _MBval = val;
-               _gpuObject = null;  //Matrix block and gpu object cannot coexist
+               _gpuPointer = null;  //Matrix block and gpu pointer cannot 
coexist
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                //resume all threads waiting for val
@@ -184,16 +186,15 @@ public class LineageCacheEntry {
 
        public synchronized void setValue(ScalarObject val, long computetime) {
                _SOval = val;
-               _gpuObject = null;  //scalar and gpu object cannot coexist
+               _gpuPointer = null;  //scalar and gpu pointer cannot coexist
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                //resume all threads waiting for val
                notifyAll();
        }
        
-       public synchronized void setGPUValue(GPUObject gpuObj, long 
computetime) {
-               gpuObj.setIsLinCached(true);
-               _gpuObject = gpuObj;
+       public synchronized void setGPUValue(Pointer ptr, long computetime) {
+               _gpuPointer = ptr;
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.GPUCACHED;
                //resume all threads waiting for val
@@ -216,8 +217,8 @@ public class LineageCacheEntry {
                notifyAll();
        }
        
-       public synchronized GPUObject getGPUObject() {
-               return _gpuObject;
+       public synchronized Pointer getGPUPointer() {
+               return _gpuPointer;
        }
        
        protected synchronized void setNullValues() {
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index b302359545..ed0d85a712 100644
--- 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -19,32 +19,66 @@
 
 package org.apache.sysds.runtime.lineage;
 
+import java.util.HashMap;
 import java.util.List;
+import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.ExecutorService;
+import java.util.stream.Collectors;
 
+import jcuda.Pointer;
 import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 public class LineageGPUCacheEviction 
 {
        private static long _currentCacheSize = 0;
        private static long GPU_CACHE_LIMIT; //limit in bytes
+       private static GPUContext _gpuContext = null;
        private static long _startTimestamp = 0;
        public static ExecutorService gpuEvictionThread = null;
+
+       // Weighted queue of freed pointers.
        private static TreeSet<LineageCacheEntry> weightedQueue = new 
TreeSet<>(LineageCacheConfig.LineageCacheComparator);
+       private static HashMap<Pointer, Integer> livePointers = new HashMap<>();
+       private static HashMap<Pointer, LineageCacheEntry> GPUCacheEntries = 
new HashMap<>();
 
        protected static void resetEviction() {
-               while(!weightedQueue.isEmpty()) {
-                       LineageCacheEntry e = weightedQueue.pollFirst();
-                       e._gpuObject.setIsLinCached(false);
-                       e._gpuObject.clearData(null, true);
-               }
                _currentCacheSize = 0;
                gpuEvictionThread = null;
                //LineageCacheConfig.CONCURRENTGPUEVICTION = false;
                weightedQueue.clear();
+               livePointers.clear();
+               GPUCacheEntries.clear();
+       }
+
+       public static void setGPUContext(GPUContext gpuCtx) {
+               _gpuContext = gpuCtx;
+       }
+
+       protected static GPUContext getGPUContext() {
+               return _gpuContext;
+       }
+
+       protected static long getPointerSize(Pointer ptr) {
+               return 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
+       }
+
+       protected static void incrementLiveCount(Pointer ptr) {
+               //TODO: move from free list to live list
+               if(livePointers.merge(ptr, 1, Integer::sum) == 1)
+                       weightedQueue.remove(GPUCacheEntries.get(ptr));
+       }
+
+       public static void decrementLiveCount(Pointer ptr) {
+               // Decrement and remove if the live count becomes 0
+               if(livePointers.compute(ptr, (k, v) -> v==1 ? null : v-1) == 
null)
+                       weightedQueue.add(GPUCacheEntries.get(ptr));
+       }
+
+       public static boolean probeLiveCachedPointers(Pointer ptr) {
+               return livePointers.containsKey(ptr);
        }
 
        //---------------- COSTING RELATED METHODS -----------------
@@ -88,7 +122,9 @@ public class LineageGPUCacheEviction
 
                // TODO: Separate removelist, starttimestamp, score and weights 
from CPU cache
                entry.computeScore(LineageCacheEviction._removelist);
-               weightedQueue.add(entry);
+               //weightedQueue.add(entry);
+               livePointers.put(entry.getGPUPointer(), 1);
+               GPUCacheEntries.put(entry.getGPUPointer(), entry);
        }
        
        public static boolean isGPUCacheEmpty() {
@@ -127,8 +163,29 @@ public class LineageGPUCacheEviction
        protected static long getGPUCacheLimit() {
                return GPU_CACHE_LIMIT;
        }
+
+       public static int numPointersCached() {
+               return livePointers.size() + weightedQueue.size();
+       }
+
+       public static long totalMemoryCached() {
+               long totLive = livePointers.keySet().stream()
+                       .mapToLong(ptr -> 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(ptr)).sum();
+               long totFree = weightedQueue.stream()
+                       .mapToLong(en -> 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(en.getGPUPointer())).sum();
+               return totLive + totFree;
+       }
+
+       public static Set<Pointer> getAllCachedPointers() {
+               //livePointers.keySet() + weightedQueue.stream().map()
+               Set<Pointer> cachedPointers = weightedQueue.stream()
+                       .map(LineageCacheEntry::getGPUPointer)
+                       .collect(Collectors.toSet());
+               cachedPointers.addAll(livePointers.keySet());
+               return cachedPointers;
+       }
        
-       public static void copyToHostCache(LineageCacheEntry entry, String 
instName, boolean alreadyCopied) {
+       /*public static void copyToHostCache(LineageCacheEntry entry, String 
instName, boolean alreadyCopied) {
                // TODO: move to the shadow buffer. Convert to double precision 
only when reused.
                long t0 = System.nanoTime();
                MatrixBlock mb = alreadyCopied ? 
entry._gpuObject.getMatrixObject().acquireReadAndRelease()
@@ -150,12 +207,14 @@ public class LineageGPUCacheEviction
                LineageCacheEviction.addEntry(entry);
                // manage space in gpu cache
                updateSize(size, false);
-       }
+       }*/
 
        public static void removeFromDeviceCache(LineageCacheEntry entry, 
String instName, boolean alreadyCopied) {
-               long size = entry.getGPUObject().getSizeOnDevice();
+               //long size = entry.getGPUObject().getSizeOnDevice();
+               long size = 
_gpuContext.getMemoryManager().getSizeAllocatedGPUPointer(entry.getGPUPointer());
                LineageCache.removeEntry(entry._key);
                updateSize(size, false);
+               GPUCacheEntries.remove(entry.getGPUPointer());
        }
 
 }
\ No newline at end of file
diff --git 
a/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
index 7536173ce0..7cfbbd9dc3 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
@@ -81,7 +81,7 @@ public class GPULineageCacheEvictionTest extends 
AutomatedTestBase{
                // reset clears the lineage cache held memory from the last run
                Lineage.resetInternalState();
                boolean gpu2Mem = LineageCacheConfig.GPU2HOSTEVICTION;
-               LineageCacheConfig.GPU2HOSTEVICTION = true;
+               //LineageCacheConfig.GPU2HOSTEVICTION = true;
                //run the test
                runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
                HashMap<MatrixValue.CellIndex, Double> R_orig = 
readDMLMatrixFromOutputDir("R");

[systemds] branch main updated: [SYSTEMDS-3510] Recycling pointers from GPU lineage cache

Reply via email to