This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 8a38477a08 [SYSTEMDS-3567] Update allocation logic in GPU
8a38477a08 is described below
commit 8a38477a0860c1631d21947dae62857f776f0cdb
Author: Arnab Phani <[email protected]>
AuthorDate: Wed Nov 8 23:39:37 2023 +0100
[SYSTEMDS-3567] Update allocation logic in GPU
This patch updates the order of the steps for GPU allocation. We now
recycle cached pointers before freeing any pointers (inside or outside
of the cache). The patch also adds a method to clear the full GPU
cache, fixes bugs, and extends lineage tracing to missing DNN operators.
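For orientation, the reordered allocation path can be read as the sketch
below. This is a simplified illustration only: the helper names on the
right-hand side are placeholders, and only the step order mirrors the
comments in the GPUMemoryManager hunk further down.

    // Hedged sketch of the new GPUMemoryManager.malloc step order (not the actual code)
    Pointer A = null;                                 // Step 1: unchanged, not shown in this patch
    if (A == null && allocator.canAllocate(size))
        A = cudaMallocNoWarn(tmpA, size, "new");      // Step 2: allocate a fresh pointer
    if (A == null)
        A = recycleExactLineageCachePointer(size);    // Step 3: recycle a cached pointer (moved up)
    if (A == null)
        A = reuseNonExactRmvarPointer(size);          // Step 4: free + re-malloc a larger rmvar pointer
    if (A == null)
        A = clearAllRmvarPointersAndMalloc(size);     // Step 5: eagerly free all rmvar pointers
    if (A == null)
        A = freeLineageCacheIntermediates(size);      // Step 6: free cached GPU intermediates
    if (A == null)
        A = evictMatrixObjects(size);                 // Steps 7-8: evict with, then without, a size restriction
    if (A == null)
        A = defragmentAndRetry(size);                 // Step 9: clear all unlocked pointers and retry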
---
.../instructions/cp/UnaryScalarCPInstruction.java | 3 +-
.../instructions/gpu/DnnGPUInstruction.java | 15 ++--
.../instructions/gpu/context/GPUMemoryManager.java | 80 ++++++++++++----------
.../apache/sysds/runtime/lineage/LineageCache.java | 2 +
.../sysds/runtime/lineage/LineageCacheConfig.java | 5 +-
.../runtime/lineage/LineageGPUCacheEviction.java | 16 +++++
6 files changed, 77 insertions(+), 44 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java
index ecced1f13d..ba87869e37 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java
@@ -23,6 +23,7 @@ import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLScriptException;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
@@ -44,7 +45,7 @@ public class UnaryScalarCPInstruction extends UnaryCPInstruction {
//core execution
if ( opcode.equalsIgnoreCase("print") ) {
String outString = so.getLanguageSpecificStringValue();
-
+
// print to stdout only when suppress flag in DMLScript is not set.
// The flag will be set, for example, when SystemDS is invoked in fenced mode from Jaql.
if (!DMLScript.suppressPrint2Stdout())
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
index 599d361490..96b0310e03 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -23,6 +23,7 @@ import java.util.ArrayList;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -323,7 +324,7 @@ public class DnnGPUInstruction extends GPUInstruction {
CPOperand in6 = new CPOperand(parts[6]); // mode
CPOperand in7 = new CPOperand(parts[7]); // epsilon
CPOperand in8 = new CPOperand(parts[8]); // exponentialAverageFactor
- CPOperand out = new CPOperand(parts[9]); // ret
+ CPOperand out = new CPOperand(parts[9], Types.ValueType.FP64, Types.DataType.MATRIX); // ret
CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
@@ -902,10 +903,14 @@ public class DnnGPUInstruction extends GPUInstruction {
inputs.add(_input6);
inputs.add(_input7);
inputs.add(_input8);
- inputs.addAll(_input_shape);
- inputs.addAll(_filter_shape);
- inputs.addAll(_stride);
- inputs.addAll(_padding);
+ if (_input_shape != null && !_input_shape.isEmpty())
+ inputs.addAll(_input_shape);
+ if (_filter_shape != null && !_filter_shape.isEmpty())
+ inputs.addAll(_filter_shape);
+ if (_stride != null && !_stride.isEmpty())
+ inputs.addAll(_stride);
+ if (_padding != null && !_padding.isEmpty())
+ inputs.addAll(_padding);
return Pair.of(_output.getName(),
new LineageItem(getOpcode(),
LineageItemUtils.getLineage(ec, inputs.toArray(new CPOperand[0]))));
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 39cff7623c..3d83d7f119 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -261,35 +261,13 @@ public class GPUMemoryManager {
Pointer tmpA = (A == null) ? new Pointer() : null;
// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
- // Step 3 has potential to create holes as well as limit future reuse, hence perform this step before step 3.
+ // Step 4 has potential to create holes as well as limit future reuse, hence perform this step before step 4.
if(A == null && allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
}
-
- // Step 3: Try reusing non-exact match entry of rmvarGPUPointers
- if(A == null) {
- A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
- if(A != null) {
- guardedCudaFree(A);
- A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
- if(A == null)
- LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
- }
- }
-
- // Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
- // Evictions of matrix blocks are expensive (as they might lead them to be written to disk in case of smaller CPU budget)
- // than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred over eviction (step 6, 7, 8).
- if(A == null) {
- lazyCudaFreeMemoryManager.clearAll();
- if(allocator.canAllocate(size)) {
- // This can fail in case of fragmented memory, so don't issue any warning
- A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
- }
- }
-
- // Step 5.1: Recycle, delete or evict gpu intermediates from lineage cache
+
+ // Step 3: Recycle gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
// Recycle a cached pointer if exactly matches the required size
@@ -316,8 +294,30 @@ public class GPUMemoryManager {
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
}
-
- // Step 5.2: Use a non-exact sized pointer
+
+ // Step 4: Try reusing non-exact match entry of rmvarGPUPointers
+ if(A == null) {
+ A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
+ if(A != null) {
+ guardedCudaFree(A);
+ A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
+ if(A == null)
+ LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
+ }
+ }
+
+ // Step 5: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
+ // Evictions of matrix blocks are more expensive (as they might lead them to be written to disk in case of a smaller CPU budget)
+ // than doing cuda free/malloc/memset. So, rmvar-ing all blocks (step 5) is preferred over eviction (steps 6, 7, 8).
+ if(A == null) {
+ lazyCudaFreeMemoryManager.clearAll();
+ if(allocator.canAllocate(size)) {
+ // This can fail in case of fragmented memory, so don't issue any warning
+ A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
+ }
+ }
+
+ // Step 6: Free gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long freedSize = 0;
@@ -340,7 +340,7 @@ public class GPUMemoryManager {
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}
- if (freedSize > size)
+ if (freedSize >= size)
A = cudaMallocNoWarn(tmpA, size, "recycle non-exact match of lineage cache");
// Else, deallocate another free pointer. We are calling pollFistFreeNotExact with
// the same size (not with freedSize-size) to reduce potentials for creating holes
@@ -353,7 +353,7 @@ public class GPUMemoryManager {
LOG.warn("cudaMalloc failed after Lineage GPU
cache eviction.");
}
- // Step 6: Try eviction/clearing exactly one with size restriction
+ // Step 7: Try eviction/clearing exactly one with size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
synchronized (matrixMemoryManager.gpuObjects) {
@@ -377,7 +377,7 @@ public class GPUMemoryManager {
}
}
- // Step 7: Try eviction/clearing one-by-one based on the given policy without size restriction
+ // Step 8: Try eviction/clearing one-by-one based on the given policy without size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long currentAvailableMemory = allocator.getAvailableMemory();
@@ -411,7 +411,7 @@ public class GPUMemoryManager {
}
}
- // Step 8: Handle defragmentation
+ // Step 9: Handle defragmentation
if(A == null) {
LOG.warn("Potential fragmentation of the GPU memory.
Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" +
toString());
@@ -477,7 +477,8 @@ public class GPUMemoryManager {
*
* @param toFree pointer to call cudaFree method on
*/
- public void guardedCudaFree(Pointer toFree) {
+ public void guardedCudaFree(Pointer toFree, boolean noStats) {
+ long t0 = (!noStats && DMLScript.STATISTICS) ? System.nanoTime() : 0;
synchronized(allPointers) {
if(allPointers.containsKey(toFree)) {
long size = allPointers.get(toFree).getSizeInBytes();
@@ -495,9 +496,16 @@ public class GPUMemoryManager {
throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
}
}
+ if(DMLScript.STATISTICS && !noStats) {
+ GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
+ GPUStatistics.cudaDeAllocCount.add(1);
+ }
+ }
+ public void guardedCudaFree(Pointer toFree) {
+ guardedCudaFree(toFree, false);
}
-
+
/**
* Deallocate the pointer
*
@@ -517,9 +525,9 @@ public class GPUMemoryManager {
if(LOG.isTraceEnabled())
LOG.trace("Free-ing the pointer with eager=" + eager);
if (eager) {
- long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+ //long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
guardedCudaFree(toFree);
- addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
+ //addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
}
else {
long size = 0;
@@ -602,7 +610,7 @@ public class GPUMemoryManager {
Set<Pointer> unlockedDirtyOrCachedPointers =
matrixMemoryManager.getPointers(false, true);
Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(),
unlockedDirtyOrCachedPointers);
for(Pointer tmpPtr : temporaryPointers) {
- guardedCudaFree(tmpPtr);
+ guardedCudaFree(tmpPtr, true);
}
}
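As an aside on the hunk above: the new boolean parameter lets bulk clears
skip the per-pointer deallocation statistics, while the one-argument
overload keeps the old accounting. A hedged usage sketch (somePointer is
illustrative, not part of the patch):

    // Bulk clearing: avoid timing every individual cudaFree
    for (Pointer tmpPtr : temporaryPointers)
        guardedCudaFree(tmpPtr, true);   // noStats = true
    // Regular path: records cudaDeAllocTime/cudaDeAllocCount
    guardedCudaFree(somePointer);        // delegates to guardedCudaFree(somePointer, false)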
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index 2d1e209100..e2dc6f6b0e 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -189,6 +189,7 @@ public class LineageCache
return false; //the executing thread removed this entry from cache
if (e.getCacheStatus() == LineageCacheStatus.TOCACHEGPU) { //second hit
//Cannot reuse as already garbage collected
+ if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
}
@@ -322,6 +323,7 @@ public class LineageCache
case TOCACHEGPU:
//Cannot reuse as already garbage collected putValue method
// will save the pointer while caching the original instruction
+ if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
return false;
case GPUCACHED:
//Increment the live count for this pointer
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 9ad6a33bb7..04f41e5ce5 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -56,7 +56,8 @@ public class LineageCacheConfig
"^2", "*2", "uack+", "tak+*", "uacsqk+", "uark+", "n+",
"uarimax", "qsort",
"qpick", "transformapply", "uarmax", "n+", "-*", "castdtm",
"lowertri", "1-*",
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*",
"==", "rmempty",
- "conv2d_bias_add", "relu_maxpooling", "maxpooling", "softmax"
+ "conv2d_bias_add", "relu_maxpooling", "maxpooling",
"batch_norm2d", "avgpooling",
+ "softmax"
//TODO: Reuse everything.
};
@@ -76,7 +77,7 @@ public class LineageCacheConfig
};
private static final String[] GPU_OPCODE_HEAVY = new String[] {
- "conv2d_bias_add", "relu_maxpooling", "maxpooling" //DNN
OPs
+ "conv2d_bias_add", "relu_maxpooling", "maxpooling",
"batch_norm2d", "avgpooling" //DNN OPs
};
private static String[] REUSE_OPCODES = new String[] {};
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 2fa4766d84..7eac5e4a54 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -31,6 +31,7 @@ import java.util.concurrent.ExecutorService;
import java.util.stream.Collectors;
import jcuda.Pointer;
+import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
@@ -130,6 +131,21 @@ public class LineageGPUCacheEviction
}
}
+ public static void removeAllEntries() {
+ List<Long> sizes = new ArrayList<>(freeQueues.keySet());
+ for (Long size : sizes) {
+ TreeSet<LineageCacheEntry> freeList =
freeQueues.get(size);
+ LineageCacheEntry le = pollFirstFreeEntry(size);
+ while (le != null) {
+ // Free the pointer
+
_gpuContext.getMemoryManager().guardedCudaFree(le.getGPUPointer());
+ if (DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementGpuDel();
+ le = pollFirstFreeEntry(size);
+ }
+ }
+ }
+
public static void setGPUContext(GPUContext gpuCtx) {
_gpuContext = gpuCtx;
}
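For completeness, a hedged usage sketch of the new full-cache clear
(assuming a GPUContext has already been registered; the driver code
around the two calls is illustrative, not part of the patch):

    // Illustrative only: release every cached GPU pointer held by the lineage cache
    LineageGPUCacheEviction.setGPUContext(gpuContext);  // gpuContext obtained elsewhere
    LineageGPUCacheEviction.removeAllEntries();         // drains the free queues via guardedCudaFree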