This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new acd5865f0b [SYSTEMDS-2947] Remove background eviction of GPU lineage
cache
acd5865f0b is described below
commit acd5865f0b117e899f732f796da327ccf486c926
Author: Arnab Phani <[email protected]>
AuthorDate: Mon Mar 20 17:02:01 2023 +0100
[SYSTEMDS-2947] Remove background eviction of GPU lineage cache
This patch removes the asynchronous eviction of lineage cache entries
from the GPU to main memory. Background eviction needs a compiler-assisted
approach. A full runtime asynchronous eviction leads to synchronization
issues. This patch also adds a flag to delete cache entries in GPU instead
of coping to host.
Currently, for a mini-batch workload with limited reuse opportunities,
enabling reuse in GPU slows down the execution by 10x with eviction (copy
to host) and 4x with delete.
Closes #1794
---
.../runtime/instructions/cp/CPInstruction.java | 40 ++------
.../instructions/gpu/context/GPUMemoryManager.java | 103 ++++++++++++---------
.../instructions/gpu/context/GPUObject.java | 4 +-
.../sysds/runtime/lineage/LineageCacheConfig.java | 1 +
.../runtime/lineage/LineageCacheStatistics.java | 13 +++
.../runtime/lineage/LineageGPUCacheEviction.java | 6 ++
.../java/org/apache/sysds/utils/Statistics.java | 1 +
.../lineage/GPULineageCacheEvictionTest.java | 4 +
.../functions/lineage/GPUCacheEviction1.dml | 6 +-
.../functions/lineage/GPUCacheEviction2.dml | 2 +-
10 files changed, 97 insertions(+), 83 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
index 144760b3d9..aa17fa2cab 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
@@ -19,7 +19,6 @@
package org.apache.sysds.runtime.instructions.cp;
-import java.util.concurrent.Executors;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types.DataType;
@@ -32,10 +31,7 @@ import
org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.instructions.CPInstructionParser;
import org.apache.sysds.runtime.instructions.Instruction;
import org.apache.sysds.runtime.instructions.fed.FEDInstructionUtils;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUMemoryEviction;
import org.apache.sysds.runtime.lineage.LineageCacheConfig;
-import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.privacy.propagation.PrivacyPropagator;
@@ -108,29 +104,17 @@ public abstract class CPInstruction extends Instruction
}
tmp = PrivacyPropagator.preprocessInstruction(tmp, ec);
-
- //Submit a task for the eviction thread. The stopping criteria
are a passed
- //eviction count and STOPBACKGROUNDEVICTION flag.
STOPBACKGROUNDEVICTION flag
- //is set to true in the post processing of CPU instruction to
stop eviction.
- if (!LineageCacheConfig.ReuseCacheType.isNone() &&
DMLScript.USE_ACCELERATOR
- && LineageCacheConfig.CONCURRENTGPUEVICTION &&
ec.getNumGPUContexts()>0
- && !(tmp instanceof VariableCPInstruction) && !(tmp
instanceof FunctionCallCPInstruction)) {
- long availableMem =
ec.getGPUContext(0).getAvailableMemory(); //TODO: multi-gpu
- long almostFull = (long) (0.2 *
GPUContextPool.initialGPUMemBudget());
-
- if (availableMem < almostFull) { //80% full
- if (LineageGPUCacheEviction.gpuEvictionThread
== null)
-
LineageGPUCacheEviction.gpuEvictionThread = Executors.newSingleThreadExecutor();
- LineageCacheConfig.STOPBACKGROUNDEVICTION =
false;
-
LineageGPUCacheEviction.gpuEvictionThread.submit(new GPUMemoryEviction());
- }
- }
-
return tmp;
}
@Override
public abstract void processInstruction(ExecutionContext ec);
+
+ @Override
+ public void postprocessInstruction(ExecutionContext ec) {
+ if (DMLScript.LINEAGE_DEBUGGER)
+ ec.maintainLineageDebuggerInfo(this);
+ }
/**
* Takes a delimited string of instructions, and replaces ALL
placeholder labels
@@ -156,17 +140,7 @@ public abstract class CPInstruction extends Instruction
}
return updateInstList.toString();
}
- @Override
- public void postprocessInstruction(ExecutionContext ec) {
- //Stop the eviction thread if not done yet evicting the given
count.
- if (!LineageCacheConfig.ReuseCacheType.isNone() &&
DMLScript.USE_ACCELERATOR
- && LineageCacheConfig.CONCURRENTGPUEVICTION)
- LineageCacheConfig.STOPBACKGROUNDEVICTION = true;
-
- if (DMLScript.LINEAGE_DEBUGGER)
- ec.maintainLineageDebuggerInfo(this);
- }
-
+
/**
* Replaces ALL placeholder strings (such as ##mVar2## and ##Var5##) in
a single instruction.
*
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index d6b62ad890..4aed4a7943 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -268,7 +268,7 @@ public class GPUMemoryManager {
}
// Step 3: Try reusing non-exact match entry of rmvarGPUPointers
- if(A == null) {
+ if(A == null) {
A =
lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
if(A != null) {
guardedCudaFree(A);
@@ -280,7 +280,7 @@ public class GPUMemoryManager {
// Step 4: Eagerly free-up rmvarGPUPointers and check if memory
is available on GPU
// Evictions of matrix blocks are expensive (as they might lead
them to be written to disk in case of smaller CPU budget)
- // than doing cuda free/malloc/memset. So, rmvar-ing every
blocks (step 4) is preferred over eviction (step 5, 6, 7).
+ // than doing cuda free/malloc/memset. So, rmvar-ing every
blocks (step 4) is preferred over eviction (step 6, 7, 8).
if(A == null) {
lazyCudaFreeMemoryManager.clearAll();
if(allocator.canAllocate(size)) {
@@ -289,36 +289,12 @@ public class GPUMemoryManager {
}
}
- // Step 5: Try eviction/clearing exactly one with size
restriction
- if(A == null) {
- long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
- synchronized (matrixMemoryManager.gpuObjects) {
- Optional<GPUObject> sizeBasedUnlockedGPUObjects
= matrixMemoryManager.gpuObjects.stream()
- .filter(gpuObj ->
!gpuObj.isLocked() && !gpuObj.isLinCached()
- &&
matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
- .min((o1, o2) ->
worstCaseContiguousMemorySizeCompare(o1, o2));
- if(sizeBasedUnlockedGPUObjects.isPresent()) {
-
evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
- A = cudaMallocNoWarn(tmpA, size, null);
- if(A == null)
- LOG.warn("cudaMalloc failed
after clearing/evicting based on size.");
- if(DMLScript.STATISTICS) {
- long totalTime =
System.nanoTime() - t0;
-
GPUStatistics.cudaEvictTime.add(totalTime);
-
GPUStatistics.cudaEvictSizeTime.add(totalTime);
-
GPUStatistics.cudaEvictCount.increment();
-
GPUStatistics.cudaEvictSizeCount.increment();
- }
- }
- }
- }
-
- // Step 6: Evict gpu intermediates from lineage cache
+ // Step 5: Evict gpu intermediates from lineage cache
// This can create holes. However, evicting rmVarpending
objects might right away make the required space
// TODO: Size dependent eviction logic (CostNSize is one)
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long currentAvailableMemory =
allocator.getAvailableMemory();
- List<LineageCacheEntry> lockedEntries = new
ArrayList<>();
+ List<LineageCacheEntry> lockedAndLiveList = new
ArrayList<>();
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
while (A == null &&
!LineageGPUCacheEviction.isGPUCacheEmpty()) {
LineageCacheEntry le =
LineageGPUCacheEviction.pollFirstEntry();
@@ -334,30 +310,41 @@ public class GPUMemoryManager {
nextgpuObj =
nextgpuObj.nextLineageCachedEntry;
}
if (locked) {
- lockedEntries.add(le);
+ lockedAndLiveList.add(le);
continue;
}
- // TODO: First remove the gobj chains that
don't contain any live and dirty objects.
- currentAvailableMemory +=
headGpuObj.getSizeOnDevice();
-
- // Copy from device to host for all live and
dirty objects
+ // First remove the gpuobj chains that don't
contain any live and dirty objects.
+ // Continue if any object is live
boolean copied = false;
+ boolean live = false;
nextgpuObj = headGpuObj;
while (nextgpuObj!= null) {
// Keeping isLinCached as True here
will save data deletion by copyFromDeviceToHost
- if (!nextgpuObj.isrmVarPending() &&
nextgpuObj.isDirty()) { //live and dirty
-
nextgpuObj.copyFromDeviceToHost(opcode, true, true);
- copied = true;
+ if (!nextgpuObj.isrmVarPending()) {
//live
+
//nextgpuObj.copyFromDeviceToHost(opcode, true, true);
+ //copied = true;
+ live = true;
}
- nextgpuObj.setIsLinCached(false);
+ //nextgpuObj.setIsLinCached(false);
nextgpuObj =
nextgpuObj.nextLineageCachedEntry;
}
+ if (live) {
+ lockedAndLiveList.add(le);
+ continue;
+ }
+ // TODO: Handle dirty objects separately. Copy
them back to the host
- // Copy from device cache to CPU lineage cache
if not already copied
- LineageGPUCacheEviction.copyToHostCache(le,
opcode, copied);
- if (DMLScript.STATISTICS)
-
LineageCacheStatistics.incrementGpuSyncEvicts();
+ currentAvailableMemory +=
headGpuObj.getSizeOnDevice();
+
+ if (!LineageCacheConfig.GPU2HOSTEVICTION)
+
LineageGPUCacheEviction.removeFromDeviceCache(le, opcode, copied);
+ else {
+ // Copy from device cache to CPU
lineage cache if not already copied
+
LineageGPUCacheEviction.copyToHostCache(le, opcode, copied);
+ if(DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementGpuSyncEvicts();
+ }
// For all the other objects, remove and clear
data (only once)
nextgpuObj = headGpuObj;
@@ -366,6 +353,7 @@ public class GPUMemoryManager {
// If not live or live but not dirty
if (nextgpuObj.isrmVarPending() ||
!nextgpuObj.isDirty()) {
if (!freed) {
+
nextgpuObj.setIsLinCached(false);
nextgpuObj.clearData(opcode, true);
freed = true;
}
@@ -374,6 +362,7 @@ public class GPUMemoryManager {
}
nextgpuObj =
nextgpuObj.nextLineageCachedEntry;
}
+
// Clear the GPUOjects chain
GPUObject currgpuObj = headGpuObj;
while (currgpuObj.nextLineageCachedEntry !=
null) {
@@ -390,14 +379,38 @@ public class GPUMemoryManager {
}
// Add the locked entries back to the eviction queue
- if (!lockedEntries.isEmpty())
-
LineageGPUCacheEviction.addEntryList(lockedEntries);
- if (DMLScript.STATISTICS) //TODO: dedicated statistics
for lineage
-
GPUStatistics.cudaEvictTime.add(System.nanoTime() - t0);
+ if (!lockedAndLiveList.isEmpty())
+
LineageGPUCacheEviction.addEntryList(lockedAndLiveList);
+ if (DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
if (A == null)
LOG.warn("cudaMalloc failed after Lineage GPU
cache eviction.");
}
+
+ // Step 6: Try eviction/clearing exactly one with size
restriction
+ if(A == null) {
+ long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+ synchronized (matrixMemoryManager.gpuObjects) {
+ Optional<GPUObject> sizeBasedUnlockedGPUObjects
= matrixMemoryManager.gpuObjects.stream()
+ .filter(gpuObj -> !gpuObj.isLocked() &&
!gpuObj.isLinCached()
+ &&
matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj) >= size)
+ .min((o1, o2) ->
worstCaseContiguousMemorySizeCompare(o1, o2));
+ if(sizeBasedUnlockedGPUObjects.isPresent()) {
+
evictOrClear(sizeBasedUnlockedGPUObjects.get(), opcode);
+ A = cudaMallocNoWarn(tmpA, size, null);
+ if(A == null)
+ LOG.warn("cudaMalloc failed
after clearing/evicting based on size.");
+ if(DMLScript.STATISTICS) {
+ long totalTime =
System.nanoTime() - t0;
+
GPUStatistics.cudaEvictTime.add(totalTime);
+
GPUStatistics.cudaEvictSizeTime.add(totalTime);
+
GPUStatistics.cudaEvictCount.increment();
+
GPUStatistics.cudaEvictSizeCount.increment();
+ }
+ }
+ }
+ }
// Step 7: Try eviction/clearing one-by-one based on the given
policy without size restriction
if(A == null) {
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 5ea922b99f..043243f5ae 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -1064,8 +1064,10 @@ public class GPUObject {
* @throws DMLRuntimeException if error occurs
*/
synchronized public void clearData(String opcode, boolean eager) throws
DMLRuntimeException {
- if (isLineageCached)
+ if (isLineageCached) {
+ setDirty(false);
return;
+ }
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : clearData on " + this + ",
GPUContext=" + getGPUContext());
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 253825a5cd..5bb4d97853 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -126,6 +126,7 @@ public class LineageCacheConfig
private static LineageCachePolicy _cachepolicy = null;
// Weights for scoring components (computeTime/size, LRU timestamp, DAG
height)
protected static double[] WEIGHTS = {1, 0, 0};
+ public static boolean GPU2HOSTEVICTION = false;
public static boolean CONCURRENTGPUEVICTION = false;
public static volatile boolean STOPBACKGROUNDEVICTION = false;
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
index fd708517e8..8ae0831f41 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheStatistics.java
@@ -45,6 +45,7 @@ public class LineageCacheStatistics {
private static final LongAdder _numHitsGpu = new LongAdder();
private static final LongAdder _numAsyncEvictGpu= new LongAdder();
private static final LongAdder _numSyncEvictGpu = new LongAdder();
+ private static final LongAdder _evtimeGpu = new LongAdder();
// Below entries are specific to Spark instructions
private static final LongAdder _numHitsRdd = new LongAdder();
private static final LongAdder _numHitsSparkActions = new LongAdder();
@@ -65,6 +66,7 @@ public class LineageCacheStatistics {
_ctimeFSWrite.reset();
_ctimeSaved.reset();
_ctimeMissed.reset();
+ _evtimeGpu.reset();
_numHitsGpu.reset();
_numAsyncEvictGpu.reset();
_numSyncEvictGpu.reset();
@@ -204,6 +206,11 @@ public class LineageCacheStatistics {
_numSyncEvictGpu.increment();
}
+ public static void incrementEvictTimeGpu(long delta) {
+ // Total time spent on evicting from GPU to main memory or
deleting from GPU lineage cache
+ _evtimeGpu.add(delta);
+ }
+
public static void incrementRDDHits() {
// Number of times a locally cached (but not persisted) RDD are
reused.
_numHitsRdd.increment();
@@ -281,6 +288,12 @@ public class LineageCacheStatistics {
return sb.toString();
}
+ public static String displayGpuEvictTime() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(String.format("%.3f",
((double)_evtimeGpu.longValue())/1000000000)); //in sec
+ return sb.toString();
+ }
+
public static String displaySparkStats() {
StringBuilder sb = new StringBuilder();
sb.append(_numHitsSparkActions.longValue());
diff --git
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 412db39926..b302359545 100644
---
a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++
b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -152,4 +152,10 @@ public class LineageGPUCacheEviction
updateSize(size, false);
}
+ public static void removeFromDeviceCache(LineageCacheEntry entry,
String instName, boolean alreadyCopied) {
+ long size = entry.getGPUObject().getSizeOnDevice();
+ LineageCache.removeEntry(entry._key);
+ updateSize(size, false);
+ }
+
}
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/utils/Statistics.java
b/src/main/java/org/apache/sysds/utils/Statistics.java
index da8c9fb444..89d8088734 100644
--- a/src/main/java/org/apache/sysds/utils/Statistics.java
+++ b/src/main/java/org/apache/sysds/utils/Statistics.java
@@ -639,6 +639,7 @@ public class Statistics
sb.append("LinCache hits (Mem/FS/Del): \t" +
LineageCacheStatistics.displayHits() + ".\n");
sb.append("LinCache MultiLevel (Ins/SB/Fn):" +
LineageCacheStatistics.displayMultiLevelHits() + ".\n");
sb.append("LinCache GPU (Hit/Async/Sync): \t" +
LineageCacheStatistics.displayGpuStats() + ".\n");
+ sb.append("LinCache GPU evict time: \t" +
LineageCacheStatistics.displayGpuEvictTime() + " sec.\n");
sb.append("LinCache Spark (Col/Loc/Dist): \t" +
LineageCacheStatistics.displaySparkStats() + ".\n");
sb.append("LinCache writes (Mem/FS/Del): \t" +
LineageCacheStatistics.displayWtrites() + ".\n");
sb.append("LinCache FStimes (Rd/Wr): \t" +
LineageCacheStatistics.displayFSTime() + " sec.\n");
diff --git
a/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
b/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
index f50cb9fe2f..7536173ce0 100644
---
a/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/lineage/GPULineageCacheEvictionTest.java
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.List;
import org.apache.sysds.runtime.lineage.Lineage;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
@@ -79,6 +80,8 @@ public class GPULineageCacheEvictionTest extends
AutomatedTestBase{
// reset clears the lineage cache held memory from the last run
Lineage.resetInternalState();
+ boolean gpu2Mem = LineageCacheConfig.GPU2HOSTEVICTION;
+ LineageCacheConfig.GPU2HOSTEVICTION = true;
//run the test
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
HashMap<MatrixValue.CellIndex, Double> R_orig =
readDMLMatrixFromOutputDir("R");
@@ -95,6 +98,7 @@ public class GPULineageCacheEvictionTest extends
AutomatedTestBase{
//run the test
runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
AutomatedTestBase.TEST_GPU = false;
+ LineageCacheConfig.GPU2HOSTEVICTION = gpu2Mem;
HashMap<MatrixValue.CellIndex, Double> R_reused =
readDMLMatrixFromOutputDir("R");
//compare results
diff --git a/src/test/scripts/functions/lineage/GPUCacheEviction1.dml
b/src/test/scripts/functions/lineage/GPUCacheEviction1.dml
index a65894d546..330ef6742c 100644
--- a/src/test/scripts/functions/lineage/GPUCacheEviction1.dml
+++ b/src/test/scripts/functions/lineage/GPUCacheEviction1.dml
@@ -27,7 +27,7 @@ X1 = X;
y1 = y;
S1 = 0;
# fill half of the cache
-for (i in 1:15) {
+for (i in 1:20) {
R = X1 * y1;
X1 = cbind(X1, rand(rows=10000, cols=1, seed=42));
y1 = cbind(y1, rand(rows=10000, cols=1, seed=42));
@@ -40,7 +40,7 @@ X2 = X;
y2 = y;
S2 = 0;
# reuse (saves cache pollution)
-for (i in 1:15) {
+for (i in 1:20) {
R = X2 * y2;
X2 = cbind(X2, rand(rows=10000, cols=1, seed=42));
y2 = cbind(y2, rand(rows=10000, cols=1, seed=42));
@@ -50,7 +50,7 @@ for (i in 1:15) {
S[,2] = S2;
# generate eviction
-for (i in 1:15) {
+for (i in 1:20) {
R = X1 * y1;
X1 = cbind(X1, rand(rows=10000, cols=1, seed=42));
y1 = cbind(y1, rand(rows=10000, cols=1, seed=42));
diff --git a/src/test/scripts/functions/lineage/GPUCacheEviction2.dml
b/src/test/scripts/functions/lineage/GPUCacheEviction2.dml
index 623e87d7fc..12fb6875e3 100644
--- a/src/test/scripts/functions/lineage/GPUCacheEviction2.dml
+++ b/src/test/scripts/functions/lineage/GPUCacheEviction2.dml
@@ -25,7 +25,7 @@ S = matrix(0, rows=1, cols=1);
S1 = 0;
# fill the cache and generate eviction
-for (i in 1:30) {
+for (i in 1:40) {
R = X * y;
X = cbind(X, rand(rows=10000, cols=1, seed=42));
y = cbind(y, rand(rows=10000, cols=1, seed=42));