[systemds] branch master updated: [SYSTEMDS-2947] Background thread for lineage cache eviction in GPU

arnabp20 Sat, 15 May 2021 10:39:47 -0700

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/master by this push:
     new e75c865  [SYSTEMDS-2947] Background thread for lineage cache eviction 
in GPU
e75c865 is described below

commit e75c86547ae8d2733a1e08a0befa6fa04d933bda
Author: arnabp <[email protected]>
AuthorDate: Sat May 15 19:34:30 2021 +0200

    [SYSTEMDS-2947] Background thread for lineage cache eviction in GPU
    
    This patch introduces a background thread for lineage cache eviction
    in GPU. The idea is to evict a few non-live variables when the current
    instruction is running in CPU. We submit a task in the pre-processing
    phase of a CPU instruction. The eviction loop breaks if 1) done evicting
    the requested number of entries or 2) the parallel CPU instruction
    is ended (set a flag in postprocess), or 3) no entries left for eviction.
    
    In addition to that, we introduce a logic to measure the DtoH eviction
    bandwidth in real time and apply exponential smoothing to predict
    the future IO. However, the facts that the measured bandwidth PCIe3 x16
    can vary greatly and a memcpy lazily execute the kernels make the
    estimated bandwidth flaky. We need this to decide the worth of eviction
    Both of these are disabled for now.
---
 .../org/apache/sysds/api/ScriptExecutorUtils.java  |   3 +
 .../runtime/instructions/cp/CPInstruction.java     |  23 ++++
 .../gpu/context/GPULazyCudaFreeMemoryManager.java  |  12 +-
 .../gpu/context/GPUMemoryEviction.java             | 137 ++++++++++++++++++++
 .../instructions/gpu/context/GPUMemoryManager.java | 138 +++++++++++++--------
 .../instructions/gpu/context/GPUObject.java        |   3 +
 .../sysds/runtime/lineage/LineageCacheConfig.java  |   4 +
 .../sysds/runtime/lineage/LineageCacheEntry.java   |   2 +
 .../runtime/lineage/LineageCacheEviction.java      |   5 +-
 .../runtime/lineage/LineageGPUCacheEviction.java   |  41 ++++--
 10 files changed, 296 insertions(+), 72 deletions(-)

diff --git a/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
index 970a800..fd344e4 100644
--- a/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
@@ -35,6 +35,7 @@ import 
org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageEstimatorStatistics;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.utils.Statistics;
 
 public class ScriptExecutorUtils {
@@ -113,6 +114,8 @@ public class ScriptExecutorUtils {
                                        gCtx.clearTemporaryMemory();
                                }
                                GPUContextPool.freeAllGPUContexts();
+                               if (LineageGPUCacheEviction.gpuEvictionThread 
!= null)
+                                       
LineageGPUCacheEviction.gpuEvictionThread.shutdown();
                        }
                        if( ConfigurationManager.isCodegenEnabled() )
                                SpoofCompiler.cleanupCodeGenerator();
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
index b2a14ce..014b854 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysds.runtime.instructions.cp;
 
+import java.util.concurrent.Executors;
+
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.lops.Lop;
@@ -29,6 +31,9 @@ import 
org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.CPInstructionParser;
 import org.apache.sysds.runtime.instructions.Instruction;
 import org.apache.sysds.runtime.instructions.fed.FEDInstructionUtils;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUMemoryEviction;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
 import org.apache.sysds.runtime.matrix.operators.Operator;
 import org.apache.sysds.runtime.privacy.propagation.PrivacyPropagator;
 
@@ -100,6 +105,17 @@ public abstract class CPInstruction extends Instruction
 
                tmp = PrivacyPropagator.preprocessInstruction(tmp, ec);
                
+               //Submit a task for the eviction thread. The stopping criteria 
are a passed
+               //eviction count and STOPBACKGROUNDEVICTION flag. 
STOPBACKGROUNDEVICTION flag
+               //is set to true in the post processing of CPU instruction to 
stop eviction.
+               if (!LineageCacheConfig.ReuseCacheType.isNone() && 
DMLScript.USE_ACCELERATOR
+                       && LineageCacheConfig.CONCURRENTGPUEVICTION && !(tmp 
instanceof VariableCPInstruction)) {
+                       if (LineageGPUCacheEviction.gpuEvictionThread == null)
+                               LineageGPUCacheEviction.gpuEvictionThread = 
Executors.newSingleThreadExecutor();
+                       LineageCacheConfig.STOPBACKGROUNDEVICTION = false;
+                       LineageGPUCacheEviction.gpuEvictionThread.submit(new 
GPUMemoryEviction(1));
+               }
+               
                return tmp;
        }
 
@@ -130,6 +146,13 @@ public abstract class CPInstruction extends Instruction
                }
                return updateInstList.toString();
        }
+       @Override
+       public void postprocessInstruction(ExecutionContext ec) {
+               //Stop the eviction thread if not done yet evicting the given 
count.
+               if (!LineageCacheConfig.ReuseCacheType.isNone() && 
DMLScript.USE_ACCELERATOR
+                       && LineageCacheConfig.CONCURRENTGPUEVICTION)
+                       LineageCacheConfig.STOPBACKGROUNDEVICTION = true;
+       }
        
        /** 
         * Replaces ALL placeholder strings (such as ##mVar2## and ##Var5##) in 
a single instruction.
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
index 9369026..332f115 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java
@@ -51,7 +51,7 @@ public class GPULazyCudaFreeMemoryManager {
         * @param size size in bytes
         * @return pointer
         */
-       public Pointer getRmvarPointer(String opcode, long size) {
+       synchronized public Pointer getRmvarPointer(String opcode, long size) {
                if (rmvarGPUPointers.containsKey(size)) {
                        if(LOG.isTraceEnabled())
                                LOG.trace("Getting rmvar-ed pointers for size:" 
+ size);
@@ -65,11 +65,11 @@ public class GPULazyCudaFreeMemoryManager {
                }
        }
        
-       public Set<Pointer> getAllPointers() {
+       synchronized public Set<Pointer> getAllPointers() {
                return rmvarGPUPointers.values().stream().flatMap(ptrs -> 
ptrs.stream()).collect(Collectors.toSet());
        }
        
-       public void clearAll() {
+       synchronized public void clearAll() {
                Set<Pointer> toFree = new HashSet<>();
                for(Set<Pointer> ptrs : rmvarGPUPointers.values()) {
                        toFree.addAll(ptrs);
@@ -80,7 +80,7 @@ public class GPULazyCudaFreeMemoryManager {
                }
        }
        
-       public Pointer getRmvarPointerMinSize(String opcode, long minSize) 
throws DMLRuntimeException {
+       synchronized public Pointer getRmvarPointerMinSize(String opcode, long 
minSize) throws DMLRuntimeException {
                Optional<Long> toClear = 
rmvarGPUPointers.entrySet().stream().filter(e -> e.getValue().size() > 0).map(e 
-> e.getKey())
                                .filter(size -> size >= minSize).min((s1, s2) 
-> s1 < s2 ? -1 : 1);
                if(toClear.isPresent()) {
@@ -145,7 +145,7 @@ public class GPULazyCudaFreeMemoryManager {
         * @param size size of the pointer
         * @param toFree pointer
         */
-       public void add(long size, Pointer toFree) {
+       synchronized public void add(long size, Pointer toFree) {
                Set<Pointer> freeList = rmvarGPUPointers.get(size);
                if (freeList == null) {
                        freeList = new HashSet<>();
@@ -162,7 +162,7 @@ public class GPULazyCudaFreeMemoryManager {
         * @param size size in bytes
         * @param ptr pointer to be removed
         */
-       public void removeIfPresent(long size, Pointer ptr) {
+       synchronized public void removeIfPresent(long size, Pointer ptr) {
                if(rmvarGPUPointers.containsKey(size) && 
rmvarGPUPointers.get(size).contains(ptr)) {
                        rmvarGPUPointers.get(size).remove(ptr);
                        if (rmvarGPUPointers.get(size).isEmpty())
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
new file mode 100644
index 0000000..5fd1474
--- /dev/null
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.instructions.gpu.context;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig;
+import org.apache.sysds.runtime.lineage.LineageCacheEntry;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
+import org.apache.sysds.utils.GPUStatistics;
+
+public class GPUMemoryEviction implements Runnable 
+{
+       int numEvicts = 0;
+       
+       public GPUMemoryEviction(int num) {
+               numEvicts = num;
+       }
+
+       @Override
+       public void run() {
+               //long currentAvailableMemory = allocator.getAvailableMemory();
+               List<LineageCacheEntry> lockedOrLiveEntries = new ArrayList<>();
+               int count = 0;
+
+               // Stop if 1) Evicted the request number of entries, 2) The 
parallel
+               // CPU instruction is ended, and 3) No non-live entries left in 
the cache.
+               long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
+               while (!LineageGPUCacheEviction.isGPUCacheEmpty() && count < 
numEvicts) 
+               {
+                       if (LineageCacheConfig.STOPBACKGROUNDEVICTION)
+                               break;
+                       
+                       LineageCacheEntry le = 
LineageGPUCacheEviction.pollFirstEntry();
+                       GPUObject cachedGpuObj = le.getGPUObject();
+                       GPUObject headGpuObj = 
cachedGpuObj.lineageCachedChainHead != null
+                                       ? cachedGpuObj.lineageCachedChainHead : 
cachedGpuObj;
+                       // Check and continue if any object in the linked list 
is locked
+                       boolean lockedOrLive = false;
+                       GPUObject nextgpuObj = headGpuObj;
+                       while (nextgpuObj!= null) {
+                               if (!nextgpuObj.isrmVarPending() || 
nextgpuObj.isLocked()) // live or locked
+                                       lockedOrLive = true;
+                               nextgpuObj = nextgpuObj.nextLineageCachedEntry;
+                       }
+                       if (lockedOrLive) {
+                               lockedOrLiveEntries.add(le);
+                               continue;
+                       }
+
+                       // TODO: First remove the gobj chains that don't 
contain any live and dirty objects.
+                       //currentAvailableMemory += 
headGpuObj.getSizeOnDevice();
+
+                       // Copy from device to host for all live and dirty 
objects
+                       boolean copied = false;
+                       nextgpuObj = headGpuObj;
+                       while (nextgpuObj!= null) {
+                               // Keeping isLinCached as True here will save 
data deletion by copyFromDeviceToHost
+                               if (!nextgpuObj.isrmVarPending() && 
nextgpuObj.isDirty()) { //live and dirty
+                                       nextgpuObj.copyFromDeviceToHost(null, 
true, true);
+                                       copied = true;
+                               }
+                               nextgpuObj.setIsLinCached(false);
+                               nextgpuObj = nextgpuObj.nextLineageCachedEntry;
+                       }
+
+                       // Copy from device cache to CPU lineage cache if not 
already copied
+                       LineageGPUCacheEviction.copyToHostCache(le, null, 
copied);
+
+                       // For all the other objects, remove and clear data 
(only once)
+                       nextgpuObj = headGpuObj;
+                       boolean freed = false;
+                       synchronized 
(nextgpuObj.getGPUContext().getMemoryManager().getGPUMatrixMemoryManager().gpuObjects)
 {
+
+                       while (nextgpuObj!= null) {
+                               // If not live or live but not dirty
+                               if (nextgpuObj.isrmVarPending() || 
!nextgpuObj.isDirty()) {
+                                       if (!freed) {
+                                               nextgpuObj.clearData(null, 
true);
+                                               //FIXME: adding to rmVar cache 
causes multiple failures due to concurrent
+                                               //access to the rmVar cache and 
other data structures. VariableCP instruction
+                                               //and other instruction free 
memory and add to rmVar cache in parallel to
+                                               //the background eviction task, 
which needs to be synchronized.
+                                               freed = true;
+                                       }
+                                       else
+                                               nextgpuObj.clearGPUObject();
+                               }
+                               nextgpuObj = nextgpuObj.nextLineageCachedEntry;
+                       }
+                       }
+                       // Clear the GPUOjects chain
+                       GPUObject currgpuObj = headGpuObj;
+                       while (currgpuObj.nextLineageCachedEntry != null) {
+                               nextgpuObj = currgpuObj.nextLineageCachedEntry;
+                               currgpuObj.lineageCachedChainHead = null;
+                               currgpuObj.nextLineageCachedEntry = null;
+                               nextgpuObj.lineageCachedChainHead = null;
+                               currgpuObj = nextgpuObj;
+                       }
+
+                       //if(currentAvailableMemory >= size)
+                               // This doesn't guarantee allocation due to 
fragmented freed memory
+                       //      A = cudaMallocNoWarn(tmpA, size, null); 
+                       if (DMLScript.STATISTICS) {
+                               GPUStatistics.cudaEvictCount.increment();
+                       }
+                       count++;
+               }
+
+               // Add the locked entries back to the eviction queue
+               if (!lockedOrLiveEntries.isEmpty())
+                       
LineageGPUCacheEviction.addEntryList(lockedOrLiveEntries);
+               
+               if (DMLScript.STATISTICS) //TODO: dedicated statistics for 
lineage
+                       GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
+       }
+}
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 4e1d577..a9c0a57 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -106,8 +106,10 @@ public class GPUMemoryManager {
         * @return either the size or -1 if no such pointer exists
         */
        public long getSizeAllocatedGPUPointer(Pointer ptr) {
-               if(allPointers.containsKey(ptr)) {
-                       return allPointers.get(ptr).getSizeInBytes();
+               synchronized(allPointers) {
+                       if(allPointers.containsKey(ptr)) {
+                               return allPointers.get(ptr).getSizeInBytes();
+                       }
                }
                return -1;
        }
@@ -175,7 +177,9 @@ public class GPUMemoryManager {
                long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                try {
                        allocator.allocate(A, size);
-                       allPointers.put(A, new PointerInfo(size));
+                       synchronized(allPointers) {
+                               allPointers.put(A, new PointerInfo(size));
+                       }
                        if(DMLScript.STATISTICS) {
                                long totalTime = System.nanoTime() - t0;
                                
GPUStatistics.cudaAllocSuccessTime.add(totalTime);
@@ -286,20 +290,23 @@ public class GPUMemoryManager {
                // Step 5: Try eviction/clearing exactly one with size 
restriction
                if(A == null) {
                        long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
-                       Optional<GPUObject> sizeBasedUnlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
-                                               .filter(gpuObj -> 
!gpuObj.isLocked() && !gpuObj.isLinCached() && 
matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
-                                               .min((o1, o2) -> 
worstCaseContiguousMemorySizeCompare(o1, o2));
-                       if(sizeBasedUnlockedGPUObjects.isPresent()) {
-                               evictOrClear(sizeBasedUnlockedGPUObjects.get(), 
opcode);
-                               A = cudaMallocNoWarn(tmpA, size, null);
-                               if(A == null)
-                                       LOG.warn("cudaMalloc failed after 
clearing/evicting based on size.");
-                               if(DMLScript.STATISTICS) {
-                                       long totalTime = System.nanoTime() - t0;
-                                       
GPUStatistics.cudaEvictTime.add(totalTime);
-                                       
GPUStatistics.cudaEvictSizeTime.add(totalTime);
-                                       
GPUStatistics.cudaEvictCount.increment();
-                                       
GPUStatistics.cudaEvictSizeCount.increment();
+                       synchronized (matrixMemoryManager.gpuObjects) {
+                               Optional<GPUObject> sizeBasedUnlockedGPUObjects 
= matrixMemoryManager.gpuObjects.stream()
+                                                       .filter(gpuObj -> 
!gpuObj.isLocked() && !gpuObj.isLinCached() 
+                                                       && 
matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
+                                                       .min((o1, o2) -> 
worstCaseContiguousMemorySizeCompare(o1, o2));
+                               if(sizeBasedUnlockedGPUObjects.isPresent()) {
+                                       
evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
+                                       A = cudaMallocNoWarn(tmpA, size, null);
+                                       if(A == null)
+                                               LOG.warn("cudaMalloc failed 
after clearing/evicting based on size.");
+                                       if(DMLScript.STATISTICS) {
+                                               long totalTime = 
System.nanoTime() - t0;
+                                               
GPUStatistics.cudaEvictTime.add(totalTime);
+                                               
GPUStatistics.cudaEvictSizeTime.add(totalTime);
+                                               
GPUStatistics.cudaEvictCount.increment();
+                                               
GPUStatistics.cudaEvictSizeCount.increment();
+                                       }
                                }
                        }
                }
@@ -310,6 +317,7 @@ public class GPUMemoryManager {
                if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
                        long currentAvailableMemory = 
allocator.getAvailableMemory();
                        List<LineageCacheEntry> lockedEntries = new 
ArrayList<>();
+                       long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
                        while (A == null && 
!LineageGPUCacheEviction.isGPUCacheEmpty()) {
                                LineageCacheEntry le = 
LineageGPUCacheEviction.pollFirstEntry();
                                GPUObject cachedGpuObj = le.getGPUObject();
@@ -330,17 +338,25 @@ public class GPUMemoryManager {
 
                                // TODO: First remove the gobj chains that 
don't contain any live and dirty objects.
                                currentAvailableMemory += 
headGpuObj.getSizeOnDevice();
-                               LineageGPUCacheEviction.copyToHostCache(le, 
opcode, true);
 
                                // Copy from device to host for all live and 
dirty objects
+                               boolean copied = false;
                                nextgpuObj = headGpuObj;
                                while (nextgpuObj!= null) {
                                        // Keeping isLinCached as True here 
will save data deletion by copyFromDeviceToHost
-                                       if (!nextgpuObj.isrmVarPending() && 
nextgpuObj.isDirty()) //live and dirty
+                                       if (!nextgpuObj.isrmVarPending() && 
nextgpuObj.isDirty()) { //live and dirty
                                                
nextgpuObj.copyFromDeviceToHost(opcode, true, true);
+                                               copied = true;
+                                       }
                                        nextgpuObj.setIsLinCached(false);
                                        nextgpuObj = 
nextgpuObj.nextLineageCachedEntry;
                                }
+
+                               // Copy from device cache to CPU lineage cache 
if not already copied
+                               LineageGPUCacheEviction.copyToHostCache(le, 
opcode, copied);
+                               if (DMLScript.STATISTICS)
+                                       
GPUStatistics.cudaEvictCount.increment();
+
                                // For all the other objects, remove and clear 
data (only once)
                                nextgpuObj = headGpuObj;
                                boolean freed = false;
@@ -374,6 +390,8 @@ public class GPUMemoryManager {
                        // Add the locked entries back to the eviction queue
                        if (!lockedEntries.isEmpty())
                                
LineageGPUCacheEviction.addEntryList(lockedEntries);
+                       if (DMLScript.STATISTICS) //TODO: dedicated statistics 
for lineage
+                               
GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
 
                        if (A == null)
                                LOG.warn("cudaMalloc failed after Lineage GPU 
cache eviction.");
@@ -386,24 +404,26 @@ public class GPUMemoryManager {
                        boolean canFit = false;
                        // 
---------------------------------------------------------------
                        // Evict unlocked GPU objects one-by-one and try malloc
-                       List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
+                       synchronized(matrixMemoryManager.gpuObjects) {
+                               List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
                                                .filter(gpuObj -> 
!gpuObj.isLocked() && !gpuObj.isLinCached()).collect(Collectors.toList());
-                       Collections.sort(unlockedGPUObjects, new 
EvictionPolicyBasedComparator(size));
-                       while(A == null && unlockedGPUObjects.size() > 0) {
-                               GPUObject evictedGPUObject = 
unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
-                               evictOrClear(evictedGPUObject, opcode);
-                               if(!canFit) {
-                                       currentAvailableMemory += 
evictedGPUObject.getSizeOnDevice();
-                                       if(currentAvailableMemory >= size)
-                                               canFit = true;
-                               }
-                               if(canFit) {
-                                       // Checking before invoking cudaMalloc 
reduces the time spent in unnecessary cudaMalloc.
-                                       // This was the bottleneck for 
ResNet200 experiments with batch size > 32 on P100+Intel
-                                       A = cudaMallocNoWarn(tmpA, size, null); 
+                               Collections.sort(unlockedGPUObjects, new 
EvictionPolicyBasedComparator(size));
+                               while(A == null && unlockedGPUObjects.size() > 
0) {
+                                       GPUObject evictedGPUObject = 
unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+                                       evictOrClear(evictedGPUObject, opcode);
+                                       if(!canFit) {
+                                               currentAvailableMemory += 
evictedGPUObject.getSizeOnDevice();
+                                               if(currentAvailableMemory >= 
size)
+                                                       canFit = true;
+                                       }
+                                       if(canFit) {
+                                               // Checking before invoking 
cudaMalloc reduces the time spent in unnecessary cudaMalloc.
+                                               // This was the bottleneck for 
ResNet200 experiments with batch size > 32 on P100+Intel
+                                               A = cudaMallocNoWarn(tmpA, 
size, null); 
+                                       }
+                                       if(DMLScript.STATISTICS) 
+                                               
GPUStatistics.cudaEvictCount.increment();
                                }
-                               if(DMLScript.STATISTICS) 
-                                       
GPUStatistics.cudaEvictCount.increment();
                        }
                        if(DMLScript.STATISTICS) {
                                long totalTime = System.nanoTime() - t0;
@@ -477,19 +497,22 @@ public class GPUMemoryManager {
         * @param toFree pointer to call cudaFree method on
         */
        void guardedCudaFree(Pointer toFree) {
-               if(allPointers.containsKey(toFree)) {
-                       long size = allPointers.get(toFree).getSizeInBytes();
-                       if(LOG.isTraceEnabled()) {
-                               LOG.trace("Free-ing up the pointer of size " +  
byteCountToDisplaySize(size));
+               synchronized(allPointers) {
+                       if(allPointers.containsKey(toFree)) {
+                               long size = 
allPointers.get(toFree).getSizeInBytes();
+                               if(LOG.isTraceEnabled()) {
+                                       LOG.trace("Free-ing up the pointer of 
size " +  byteCountToDisplaySize(size));
+                               }
+                               allPointers.remove(toFree);
+                               lazyCudaFreeMemoryManager.removeIfPresent(size, 
toFree);
+                               allocator.free(toFree);
+                               if(DMLScript.SYNCHRONIZE_GPU)
+                                       // Force a device synchronize after 
free-ing the pointer for debugging
+                                       
jcuda.runtime.JCuda.cudaDeviceSynchronize(); 
+                       }
+                       else {
+                               throw new RuntimeException("Attempting to free 
an unaccounted pointer:" + toFree);
                        }
-                       allPointers.remove(toFree);
-                       lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
-                       allocator.free(toFree);
-                       if(DMLScript.SYNCHRONIZE_GPU)
-                               jcuda.runtime.JCuda.cudaDeviceSynchronize(); // 
Force a device synchronize after free-ing the pointer for debugging
-               }
-               else {
-                       throw new RuntimeException("Attempting to free an 
unaccounted pointer:" + toFree);
                }
 
        }
@@ -511,11 +534,14 @@ public class GPUMemoryManager {
                        addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, 
GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
                }
                else {
-                       if (!allPointers.containsKey(toFree)) {
-                               LOG.info("GPU memory info before failure:" + 
toString());
-                               throw new RuntimeException("ERROR : Internal 
state corrupted, cache block size map is not aware of a block it trying to free 
up");
+                       long size = 0;
+                       synchronized(allPointers) {
+                               if (!allPointers.containsKey(toFree)) {
+                                       LOG.info("GPU memory info before 
failure:" + toString());
+                                       throw new RuntimeException("ERROR : 
Internal state corrupted, cache block size map is not aware of a block it 
trying to free up");
+                               }
+                               size = allPointers.get(toFree).getSizeInBytes();
                        }
-                       long size = allPointers.get(toFree).getSizeInBytes();
                        lazyCudaFreeMemoryManager.add(size, toFree);
                }
        }
@@ -554,11 +580,13 @@ public class GPUMemoryManager {
                matrixMemoryManager.gpuObjects.clear();
                
                // Then clean up remaining allocated GPU pointers 
-               Set<Pointer> remainingPtr = new HashSet<>(allPointers.keySet());
-               for(Pointer toFree : remainingPtr) {
-                       guardedCudaFree(toFree); // cleans up 
allocatedGPUPointers and rmvarGPUPointers as well
+               synchronized(allPointers) {
+                       Set<Pointer> remainingPtr = new 
HashSet<>(allPointers.keySet());
+                       for(Pointer toFree : remainingPtr) {
+                               guardedCudaFree(toFree); // cleans up 
allocatedGPUPointers and rmvarGPUPointers as well
+                       }
+                       allPointers.clear();
                }
-               allPointers.clear();
        }
                
        /**
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index d346d56..291cb07 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -155,6 +155,9 @@ public class GPUObject {
                shadowBuffer.clearShadowPointer();
        }
        
+       public MatrixObject getMatrixObject() {
+               return mat;
+       }
        
        /**
         * Convenience method to directly set the dense matrix pointer on GPU
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 98d7c08..b7b66a7 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -95,6 +95,8 @@ public class LineageCacheConfig
        public static double FSREAD_SPARSE = 400;
        public static double FSWRITE_DENSE = 450;
        public static double FSWRITE_SPARSE = 225;
+       public static double D2HCOPY = 1500;
+       public static double D2HMAXBANDWIDTH = 8192;
        
        private enum CachedItemHead {
                TSMM,
@@ -113,6 +115,8 @@ public class LineageCacheConfig
        private static LineageCachePolicy _cachepolicy = null;
        // Weights for scoring components (computeTime/size, LRU timestamp, DAG 
height)
        protected static double[] WEIGHTS = {1, 0, 0};
+       public static boolean CONCURRENTGPUEVICTION = false;
+       public static volatile boolean STOPBACKGROUNDEVICTION = false;
 
        protected enum LineageCacheStatus {
                EMPTY,     //Placeholder with no data. Cannot be evicted.
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
index 1a32e4b..ea99cf8 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEntry.java
@@ -126,6 +126,7 @@ public class LineageCacheEntry {
        
        public synchronized void setValue(MatrixBlock val, long computetime) {
                _MBval = val;
+               _gpuObject = null;  //Matrix block and gpu object cannot coexist
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                //resume all threads waiting for val
@@ -138,6 +139,7 @@ public class LineageCacheEntry {
 
        public synchronized void setValue(ScalarObject val, long computetime) {
                _SOval = val;
+               _gpuObject = null;  //scalar and gpu object cannot coexist
                _computeTime = computetime;
                _status = isNullVal() ? LineageCacheStatus.EMPTY : 
LineageCacheStatus.CACHED;
                //resume all threads waiting for val
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
index 70a08d9..3fca108 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheEviction.java
@@ -238,7 +238,7 @@ public class LineageCacheEviction
                        }
 
                        if (spilltime < 
LineageCacheConfig.MIN_SPILL_TIME_ESTIMATE) {
-                               // Can't trust the estimate if less than 100ms.
+                               // Can't trust the estimate if less than 10ms.
                                // Spill if it takes longer to recompute.
                                removeOrSpillEntry(cache, e, //spill or delete
                                        exectime >= 
LineageCacheConfig.MIN_SPILL_TIME_ESTIMATE);
@@ -290,7 +290,7 @@ public class LineageCacheEviction
                        return; 
                
                double newIOSpeed = size / IOtime; // MB per second 
-               // Adjust the read/write speed taking into account the last 
read/write.
+               // Adjust the read/write speed using exponential smoothing 
(alpha = 0.5)
                // These constants will eventually converge to the real speed.
                if (read) {
                        if (isSparse(e))
@@ -304,6 +304,7 @@ public class LineageCacheEviction
                        else
                                LineageCacheConfig.FSWRITE_DENSE= 
(LineageCacheConfig.FSWRITE_DENSE+ newIOSpeed) / 2;
                }
+               // TODO: exponential smoothing with arbitrary smoothing factor
        }
        
        private static boolean isSparse(LineageCacheEntry e) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 156147b..412db39 100644
--- 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -21,6 +21,7 @@ package org.apache.sysds.runtime.lineage;
 
 import java.util.List;
 import java.util.TreeSet;
+import java.util.concurrent.ExecutorService;
 
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
@@ -31,6 +32,7 @@ public class LineageGPUCacheEviction
        private static long _currentCacheSize = 0;
        private static long GPU_CACHE_LIMIT; //limit in bytes
        private static long _startTimestamp = 0;
+       public static ExecutorService gpuEvictionThread = null;
        private static TreeSet<LineageCacheEntry> weightedQueue = new 
TreeSet<>(LineageCacheConfig.LineageCacheComparator);
 
        protected static void resetEviction() {
@@ -40,6 +42,8 @@ public class LineageGPUCacheEviction
                        e._gpuObject.clearData(null, true);
                }
                _currentCacheSize = 0;
+               gpuEvictionThread = null;
+               //LineageCacheConfig.CONCURRENTGPUEVICTION = false;
                weightedQueue.clear();
        }
 
@@ -59,6 +63,19 @@ public class LineageGPUCacheEviction
        protected static long getStartTimestamp() {
                return _startTimestamp;
        }
+       
+       private static void adjustD2HTransferSpeed(double sizeByte, double 
copyTime) {
+               double sizeMB = sizeByte / (1024*1024);
+               double newTSpeed = sizeMB / copyTime;  //bandwidth (MB/sec) + 
java overhead
+
+               // FIXME: A D2H copy lazily executes previous kernels
+               if (newTSpeed > LineageCacheConfig.D2HMAXBANDWIDTH)
+                       return;  //filter out errorneous measurements (~ 
>8GB/sec)
+               // Perform exponential smoothing.
+               double smFactor = 0.5;  //smoothing factor
+               LineageCacheConfig.D2HCOPY = (smFactor * newTSpeed) + 
((1-smFactor) * LineageCacheConfig.D2HCOPY);
+               //System.out.println("size_t: "+sizeMB+ " speed_t: "+newTSpeed 
+ " estimate_t+1: "+LineageCacheConfig.D2HCOPY);
+       }
 
        //--------------- CACHE MAINTENANCE & LOOKUP FUNCTIONS --------------//
 
@@ -111,19 +128,25 @@ public class LineageGPUCacheEviction
                return GPU_CACHE_LIMIT;
        }
        
-       public static void copyToHostCache(LineageCacheEntry entry, String 
instName, boolean eagerDelete) {
+       public static void copyToHostCache(LineageCacheEntry entry, String 
instName, boolean alreadyCopied) {
                // TODO: move to the shadow buffer. Convert to double precision 
only when reused.
-               // FIXME: Remove double copying (copyFromDeviceToHost, 
evictFromDeviceToHostMB)
-               MatrixBlock mb = 
entry._gpuObject.evictFromDeviceToHostMB(instName, eagerDelete);
+               long t0 = System.nanoTime();
+               MatrixBlock mb = alreadyCopied ? 
entry._gpuObject.getMatrixObject().acquireReadAndRelease()
+                               : 
entry._gpuObject.evictFromDeviceToHostMB(instName, false);
+               long t1 = System.nanoTime();
+               
adjustD2HTransferSpeed(((double)entry._gpuObject.getSizeOnDevice()), 
((double)(t1-t0))/1000000000);
                long size = mb.getInMemorySize();
-               // make space in the host memory for the data
-               if (!LineageCacheEviction.isBelowThreshold(size))
-                       
LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+               // make space in the host memory for the data TODO: synchronize
+               if (!LineageCacheEviction.isBelowThreshold(size)) {
+                       synchronized (LineageCache.getLineageCache()) {
+                               
LineageCacheEviction.makeSpace(LineageCache.getLineageCache(), size);
+                       }
+               }
+               // FIXME: updateSize outside of synchronized is problematic, 
but eliminates waiting for background eviction
                LineageCacheEviction.updateSize(size, true);
-               // place the data
+               // place the data and set gpu object to null in the cache entry
                entry.setValue(mb);
-               entry._gpuObject = null;
-               // maintain order for eviction of host cache
+               // maintain order for eviction of host cache. FIXME: synchronize
                LineageCacheEviction.addEntry(entry);
                // manage space in gpu cache
                updateSize(size, false);

[systemds] branch master updated: [SYSTEMDS-2947] Background thread for lineage cache eviction in GPU

Reply via email to