This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new e5a3665 [SYSTEMDS-2947] Fix synchronization issues with GPU evictions
e5a3665 is described below
commit e5a366560cda832b38a82b7cb2631e002f49cc22
Author: arnabp <[email protected]>
AuthorDate: Thu Jul 22 13:23:23 2021 +0200
[SYSTEMDS-2947] Fix synchronization issues with GPU evictions
This patch adds new logic to the GPU background eviction so that
eviction starts only when the GPU memory is 80% full. In addition,
this patch fixes a couple of synchronization bugs in asynchronous
eviction and a bug in sparsity handling in the host-to-device (H2D) copy.
TODO: Fix the remaining synchronization bugs (gpuObjects list, rmVar cache).
---
.../runtime/instructions/cp/CPInstruction.java | 17 ++++++---
.../gpu/context/GPUMemoryEviction.java | 43 +++++++++++++---------
.../instructions/gpu/context/GPUObject.java | 24 +++++++++++-
3 files changed, 60 insertions(+), 24 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
index 84b2332..e048add 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
@@ -31,6 +31,7 @@ import
org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysds.runtime.instructions.CPInstructionParser;
import org.apache.sysds.runtime.instructions.Instruction;
import org.apache.sysds.runtime.instructions.fed.FEDInstructionUtils;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
import org.apache.sysds.runtime.instructions.gpu.context.GPUMemoryEviction;
import org.apache.sysds.runtime.lineage.LineageCacheConfig;
import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
@@ -109,11 +110,17 @@ public abstract class CPInstruction extends Instruction
//eviction count and STOPBACKGROUNDEVICTION flag.
STOPBACKGROUNDEVICTION flag
//is set to true in the post processing of CPU instruction to
stop eviction.
if (!LineageCacheConfig.ReuseCacheType.isNone() &&
DMLScript.USE_ACCELERATOR
- && LineageCacheConfig.CONCURRENTGPUEVICTION && !(tmp
instanceof VariableCPInstruction)) {
- if (LineageGPUCacheEviction.gpuEvictionThread == null)
- LineageGPUCacheEviction.gpuEvictionThread =
Executors.newSingleThreadExecutor();
- LineageCacheConfig.STOPBACKGROUNDEVICTION = false;
- LineageGPUCacheEviction.gpuEvictionThread.submit(new
GPUMemoryEviction(1));
+ && LineageCacheConfig.CONCURRENTGPUEVICTION &&
ec.getNumGPUContexts()>0
+ && !(tmp instanceof VariableCPInstruction) && !(tmp
instanceof FunctionCallCPInstruction)) {
+ long availableMem =
ec.getGPUContext(0).getAvailableMemory(); //TODO: multi-gpu
+ long almostFull = (long) (0.2 *
GPUContextPool.initialGPUMemBudget());
+
+ if (availableMem < almostFull) { //80% full
+ if (LineageGPUCacheEviction.gpuEvictionThread
== null)
+
LineageGPUCacheEviction.gpuEvictionThread = Executors.newSingleThreadExecutor();
+ LineageCacheConfig.STOPBACKGROUNDEVICTION =
false;
+
LineageGPUCacheEviction.gpuEvictionThread.submit(new GPUMemoryEviction());
+ }
}
return tmp;
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
index cb7787c..0264497 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryEviction.java
@@ -31,11 +31,15 @@ import org.apache.sysds.utils.GPUStatistics;
public class GPUMemoryEviction implements Runnable
{
- int numEvicts = 0;
+ int numEvicts;
public GPUMemoryEviction(int num) {
numEvicts = num;
}
+
+ public GPUMemoryEviction() {
+ numEvicts = 0;
+ }
@Override
public void run() {
@@ -46,9 +50,15 @@ public class GPUMemoryEviction implements Runnable
// Stop if 1) Evicted the request number of entries, 2) The
parallel
// CPU instruction is ended, and 3) No non-live entries left in
the cache.
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
- while (!LineageGPUCacheEviction.isGPUCacheEmpty() && count <
numEvicts)
+ while (!LineageGPUCacheEviction.isGPUCacheEmpty())
{
if (LineageCacheConfig.STOPBACKGROUNDEVICTION)
+ // This logic reduces #evictions if the cpu
instructions is so small
+ // that it ends before the background thread
reaches this condition.
+ // However, this check decreases race
conditions.
+ break;
+
+ if (numEvicts > 0 && count > numEvicts)
break;
LineageCacheEntry le =
LineageGPUCacheEviction.pollFirstEntry();
@@ -91,23 +101,22 @@ public class GPUMemoryEviction implements Runnable
nextgpuObj = headGpuObj;
boolean freed = false;
synchronized
(nextgpuObj.getGPUContext().getMemoryManager().getGPUMatrixMemoryManager().gpuObjects)
{
-
- while (nextgpuObj!= null) {
- // If not live or live but not dirty
- if (nextgpuObj.isrmVarPending() ||
!nextgpuObj.isDirty()) {
- if (!freed) {
- nextgpuObj.clearData(null,
true);
- //FIXME: adding to rmVar cache
causes multiple failures due to concurrent
- //access to the rmVar cache and
other data structures. VariableCP instruction
- //and other instruction free
memory and add to rmVar cache in parallel to
- //the background eviction task,
which needs to be synchronized.
- freed = true;
+ while (nextgpuObj!= null) {
+ // If not live or live but not dirty
+ if (nextgpuObj.isrmVarPending() ||
!nextgpuObj.isDirty()) {
+ if (!freed) {
+
nextgpuObj.clearData(null, true);
+ //FIXME: adding to
rmVar cache causes multiple failures due to concurrent
+ //access to the rmVar
cache and other data structures. VariableCP instruction
+ //and other instruction
free memory and add to rmVar cache in parallel to
+ //the background
eviction task, which needs to be synchronized.
+ freed = true;
+ }
+ else
+
nextgpuObj.clearGPUObject();
}
- else
- nextgpuObj.clearGPUObject();
+ nextgpuObj =
nextgpuObj.nextLineageCachedEntry;
}
- nextgpuObj = nextgpuObj.nextLineageCachedEntry;
- }
}
// Clear the GPUOjects chain
GPUObject currgpuObj = headGpuObj;
diff --git
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index 291cb07..1c1cb2b 100644
---
a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++
b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -786,6 +786,23 @@ public class GPUObject {
setSparseMatrixCudaPointer(tmp);
}
+ void allocateSparseMatrixOnDevice(long numVals) {
+ // This method is called when #values > nnz
+ if(LOG.isTraceEnabled()) {
+ LOG.trace("GPU : allocateSparseMatrixOnDevice, on " +
this + ", GPUContext=" + getGPUContext());
+ }
+ if(isAllocated())
+ throw new DMLRuntimeException("Internal error - trying
to allocated sparse matrix to a GPUObject that is already allocated");
+ long rows = mat.getNumRows();
+ long nnz = mat.getNnz();
+ if(rows <= 0)
+ throw new DMLRuntimeException("Internal error - invalid
number of rows when allocating sparse matrix");
+ if(nnz < 0)
+ throw new DMLRuntimeException("Internal error - invalid
number of non zeroes when allocating a sparse matrix");
+ CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(),
numVals, rows);
+ setSparseMatrixCudaPointer(tmp);
+ }
+
public long getSizeOnDevice() {
long GPUSize = 0;
long rlen = mat.getNumRows();
@@ -863,7 +880,10 @@ public class GPUObject {
values = csrBlock.values();
}
- allocateSparseMatrixOnDevice();
+ if (values.length > tmp.getNonZeros())
+ allocateSparseMatrixOnDevice(values.length);
+ else
+ allocateSparseMatrixOnDevice();
if (copyToDevice) {
CSRPointer.copyToDevice(getGPUContext(),
getJcudaSparseMatrixPtr(),
@@ -1037,7 +1057,7 @@ public class GPUObject {
* @param eager whether to be done synchronously or asynchronously
* @throws DMLRuntimeException if error occurs
*/
- public void clearData(String opcode, boolean eager) throws
DMLRuntimeException {
+ synchronized public void clearData(String opcode, boolean eager) throws
DMLRuntimeException {
if (isLineageCached)
return;