This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 8a38477a08 [SYSTEMDS-3567] Update allocation logic in GPU
8a38477a08 is described below
commit 8a38477a0860c1631d21947dae62857f776f0cdb
Author: Arnab Phani <[email protected]>
AuthorDate: Wed Nov 8 23:39:37 2023 +0100
[SYSTEMDS-3567] Update allocation logic in GPU
This patch updates the order of the steps for GPU allocation. We now
recycle cached pointers before freeing any pointers (inside or outside
of the cache). The patch also adds a method to clear the full GPU
cache, fixes bugs, and extends lineage tracing to missing DNN operators.
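For orientation, the reordered allocation path can be read as the sketch
below. This is a simplified illustration only: the helper names on the
right-hand side are placeholders, and only the step order mirrors the
comments in the GPUMemoryManager hunk further down.

    // Hedged sketch of the new GPUMemoryManager.malloc step order (not the actual code)
    Pointer A = null;                                 // Step 1: unchanged, not shown in this patch
    if (A == null && allocator.canAllocate(size))
        A = cudaMallocNoWarn(tmpA, size, "new");      // Step 2: allocate a fresh pointer
    if (A == null)
        A = recycleExactLineageCachePointer(size);    // Step 3: recycle a cached pointer (moved up)
    if (A == null)
        A = reuseNonExactRmvarPointer(size);          // Step 4: free + re-malloc a larger rmvar pointer
    if (A == null)
        A = clearAllRmvarPointersAndMalloc(size);     // Step 5: eagerly free all rmvar pointers
    if (A == null)
        A = freeLineageCacheIntermediates(size);      // Step 6: free cached GPU intermediates
    if (A == null)
        A = evictMatrixObjects(size);                 // Steps 7-8: evict with, then without, a size restriction
    if (A == null)
        A = defragmentAndRetry(size);                 // Step 9: clear all unlocked pointers and retry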
---
.../instructions/cp/UnaryScalarCPInstruction.java | 3 +-
.../instructions/gpu/DnnGPUInstruction.java | 15 ++--
.../instructions/gpu/context/GPUMemoryManager.java | 80 ++++++++++++----------
.../apache/sysds/runtime/lineage/LineageCache.java | 2 +
.../sysds/runtime/lineage/LineageCacheConfig.java | 5 +-
.../runtime/lineage/LineageGPUCacheEviction.java | 16 +++++
6 files changed, 77 insertions(+), 44 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java
index ecced1f13d..ba87869e37 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/UnaryScalarCPInstruction.java
@@ -23,6 +23,7 @@ import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.runtime.DMLScriptException;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
@@ -44,7 +45,7 @@ public class UnaryScalarCPInstruction extends UnaryCPInstruction {
//core execution
if ( opcode.equalsIgnoreCase("print") ) {
String outString = so.getLanguageSpecificStringValue();
-
+
// print to stdout only when suppress flag in DMLScript is not set.
// The flag will be set, for example, when SystemDS is invoked in fenced mode from Jaql.
if (!DMLScript.suppressPrint2Stdout())
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
index 599d361490..96b0310e03 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -23,6 +23,7 @@ import java.util.ArrayList;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -323,7 +324,7 @@ public class DnnGPUInstruction extends GPUInstruction {
CPOperand in6 = new CPOperand(parts[6]); // mode
CPOperand in7 = new CPOperand(parts[7]); // epsilon
CPOperand in8 = new CPOperand(parts[8]); // exponentialAverageFactor
- CPOperand out = new CPOperand(parts[9]); // ret
+ CPOperand out = new CPOperand(parts[9], Types.ValueType.FP64, Types.DataType.MATRIX); // ret
CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
@@ -902,10 +903,14 @@ public class DnnGPUInstruction extends GPUInstruction {
inputs.add(_input6);
inputs.add(_input7);
inputs.add(_input8);
- inputs.addAll(_input_shape);
- inputs.addAll(_filter_shape);
- inputs.addAll(_stride);
- inputs.addAll(_padding);
+ if (_input_shape != null && !_input_shape.isEmpty())
+ inputs.addAll(_input_shape);
+ if (_filter_shape != null && !_filter_shape.isEmpty())
+ inputs.addAll(_filter_shape);
+ if (_stride != null && !_stride.isEmpty())
+ inputs.addAll(_stride);
+ if (_padding != null && !_padding.isEmpty())
+ inputs.addAll(_padding);
return Pair.of(_output.getName(),
new LineageItem(getOpcode(),
LineageItemUtils.getLineage(ec, inputs.toArray(new CPOperand[0]))));
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
index 39cff7623c..3d83d7f119 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -261,35 +261,13 @@ public class GPUMemoryManager {
Pointer tmpA = (A == null) ? new Pointer() : null;
// Step 2: Allocate a new pointer in the GPU memory (since memory is available)
- // Step 3 has potential to create holes as well as limit future reuse, hence perform this step before step 3.
+ // Step 4 has potential to create holes as well as limit future reuse, hence perform this step before step 4.
if(A == null && allocator.canAllocate(size)) {
// This can fail in case of fragmented memory, so don't issue any warning
A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer");
}
-
- // Step 3: Try reusing non-exact match entry of rmvarGPUPointers
- if(A == null) {
- A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
- if(A != null) {
- guardedCudaFree(A);
- A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
- if(A == null)
- LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
- }
- }
-
- // Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
- // Evictions of matrix blocks are expensive (as they might lead them to be written to disk in case of smaller CPU budget)
- // than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred over eviction (step 6, 7, 8).
- if(A == null) {
- lazyCudaFreeMemoryManager.clearAll();
- if(allocator.canAllocate(size)) {
- // This can fail in case of fragmented memory, so don't issue any warning
- A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
- }
- }
-
- // Step 5.1: Recycle, delete or evict gpu intermediates from lineage cache
+
+ // Step 3: Recycle gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
// Recycle a cached pointer if exactly matches the required size
@@ -316,8 +294,30 @@ public class GPUMemoryManager {
if (DMLScript.STATISTICS)
LineageCacheStatistics.incrementEvictTimeGpu(System.nanoTime() - t0);
}
-
- // Step 5.2: Use a non-exact sized pointer
+
+ // Step 4: Try reusing non-exact match entry of rmvarGPUPointers
+ if(A == null) {
+ A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size);
+ if(A != null) {
+ guardedCudaFree(A);
+ A = cudaMallocNoWarn(tmpA, size, "reuse non-exact match of rmvarGPUPointers");
+ if(A == null)
+ LOG.warn("cudaMalloc failed after clearing one of rmvarGPUPointers.");
+ }
+ }
+
+ // Step 5: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU
+ // Evictions of matrix blocks are more expensive (as they might lead them to be written to disk in case of a smaller CPU budget)
+ // than doing cuda free/malloc/memset. So, rmvar-ing all blocks (step 5) is preferred over eviction (steps 6, 7, 8).
+ if(A == null) {
+ lazyCudaFreeMemoryManager.clearAll();
+ if(allocator.canAllocate(size)) {
+ // This can fail in case of fragmented memory, so don't issue any warning
+ A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free");
+ }
+ }
+
+ // Step 6: Free gpu intermediates from lineage cache
if (A == null && !LineageCacheConfig.ReuseCacheType.isNone()) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long freedSize = 0;
@@ -340,7 +340,7 @@ public class GPUMemoryManager {
if(DMLScript.STATISTICS)
LineageCacheStatistics.incrementGpuSyncEvicts();
}
- if (freedSize > size)
+ if (freedSize >= size)
A = cudaMallocNoWarn(tmpA, size, "recycle non-exact match of lineage cache");
// Else, deallocate another free pointer. We are calling pollFistFreeNotExact with
// the same size (not with freedSize-size) to reduce potentials for creating holes
@@ -353,7 +353,7 @@ public class GPUMemoryManager {
LOG.warn("cudaMalloc failed after Lineage GPU
cache eviction.");
}
- // Step 6: Try eviction/clearing exactly one with size restriction
+ // Step 7: Try eviction/clearing exactly one with size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
synchronized (matrixMemoryManager.gpuObjects) {
@@ -377,7 +377,7 @@ public class GPUMemoryManager {
}
}
- // Step 7: Try eviction/clearing one-by-one based on the given policy without size restriction
+ // Step 8: Try eviction/clearing one-by-one based on the given policy without size restriction
if(A == null) {
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
long currentAvailableMemory = allocator.getAvailableMemory();
@@ -411,7 +411,7 @@ public class GPUMemoryManager {
}
}
- // Step 8: Handle defragmentation
+ // Step 9: Handle defragmentation
if(A == null) {
LOG.warn("Potential fragmentation of the GPU memory.
Forcibly evicting all ...");
LOG.info("Before clearAllUnlocked, GPU Memory info:" +
toString());
@@ -477,7 +477,8 @@ public class GPUMemoryManager {
*
* @param toFree pointer to call cudaFree method on
*/
- public void guardedCudaFree(Pointer toFree) {
+ public void guardedCudaFree(Pointer toFree, boolean noStats) {
+ long t0 = (!noStats && DMLScript.STATISTICS) ? System.nanoTime() : 0;
synchronized(allPointers) {
if(allPointers.containsKey(toFree)) {
long size = allPointers.get(toFree).getSizeInBytes();
@@ -495,9 +496,16 @@ public class GPUMemoryManager {
throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree);
}
}
+ if(DMLScript.STATISTICS && !noStats) {
+ GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
+ GPUStatistics.cudaDeAllocCount.add(1);
+ }
+ }
+ public void guardedCudaFree(Pointer toFree) {
+ guardedCudaFree(toFree, false);
}
-
+
/**
* Deallocate the pointer
*
@@ -517,9 +525,9 @@ public class GPUMemoryManager {
if(LOG.isTraceEnabled())
LOG.trace("Free-ing the pointer with eager=" + eager);
if (eager) {
- long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+ //long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
guardedCudaFree(toFree);
- addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
+ //addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0);
}
else {
long size = 0;
@@ -602,7 +610,7 @@ public class GPUMemoryManager {
Set<Pointer> unlockedDirtyOrCachedPointers =
matrixMemoryManager.getPointers(false, true);
Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(),
unlockedDirtyOrCachedPointers);
for(Pointer tmpPtr : temporaryPointers) {
- guardedCudaFree(tmpPtr);
+ guardedCudaFree(tmpPtr, true);
}
}
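As an aside on the hunk above: the new boolean parameter lets bulk clears
skip the per-pointer deallocation statistics, while the one-argument
overload keeps the old accounting. A hedged usage sketch (somePointer is
illustrative, not part of the patch):

    // Bulk clearing: avoid timing every individual cudaFree
    for (Pointer tmpPtr : temporaryPointers)
        guardedCudaFree(tmpPtr, true);   // noStats = true
    // Regular path: records cudaDeAllocTime/cudaDeAllocCount
    guardedCudaFree(somePointer);        // delegates to guardedCudaFree(somePointer, false)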
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index 2d1e209100..e2dc6f6b0e 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -189,6 +189,7 @@ public class LineageCache
return false; //the executing thread removed this entry from cache
if (e.getCacheStatus() == LineageCacheStatus.TOCACHEGPU) { //second hit
//Cannot reuse as already garbage collected
+ if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
ec.replaceLineageItem(outName, e._key); //still reuse the lineage trace
return false;
}
@@ -322,6 +323,7 @@ public class LineageCache
case TOCACHEGPU:
//Cannot reuse as already garbage collected putValue method
// will save the pointer while caching the original instruction
+ if (DMLScript.STATISTICS) LineageCacheStatistics.incrementDelHitsGpu(); //increase miss count
return false;
case GPUCACHED:
//Increment the live count for this pointer
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 9ad6a33bb7..04f41e5ce5 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -56,7 +56,8 @@ public class LineageCacheConfig
"^2", "*2", "uack+", "tak+*", "uacsqk+", "uark+", "n+",
"uarimax", "qsort",
"qpick", "transformapply", "uarmax", "n+", "-*", "castdtm",
"lowertri", "1-*",
"prefetch", "mapmm", "contains", "mmchain", "mapmmchain", "+*",
"==", "rmempty",
- "conv2d_bias_add", "relu_maxpooling", "maxpooling", "softmax"
+ "conv2d_bias_add", "relu_maxpooling", "maxpooling",
"batch_norm2d", "avgpooling",
+ "softmax"
//TODO: Reuse everything.
};
@@ -76,7 +77,7 @@ public class LineageCacheConfig
};
private static final String[] GPU_OPCODE_HEAVY = new String[] {
- "conv2d_bias_add", "relu_maxpooling", "maxpooling" //DNN
OPs
+ "conv2d_bias_add", "relu_maxpooling", "maxpooling",
"batch_norm2d", "avgpooling" //DNN OPs
};
private static String[] REUSE_OPCODES = new String[] {};
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
index 2fa4766d84..7eac5e4a54 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageGPUCacheEviction.java
@@ -31,6 +31,7 @@ import java.util.concurrent.ExecutorService;
import java.util.stream.Collectors;
import jcuda.Pointer;
+import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
@@ -130,6 +131,21 @@ public class LineageGPUCacheEviction
}
}
+ public static void removeAllEntries() {
+ List<Long> sizes = new ArrayList<>(freeQueues.keySet());
+ for (Long size : sizes) {
+ TreeSet<LineageCacheEntry> freeList =
freeQueues.get(size);
+ LineageCacheEntry le = pollFirstFreeEntry(size);
+ while (le != null) {
+ // Free the pointer
+
_gpuContext.getMemoryManager().guardedCudaFree(le.getGPUPointer());
+ if (DMLScript.STATISTICS)
+
LineageCacheStatistics.incrementGpuDel();
+ le = pollFirstFreeEntry(size);
+ }
+ }
+ }
+
public static void setGPUContext(GPUContext gpuCtx) {
_gpuContext = gpuCtx;
}
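For completeness, a hedged usage sketch of the new full-cache clear
(assuming a GPUContext has already been registered; the driver code
around the two calls is illustrative, not part of the patch):

    // Illustrative only: release every cached GPU pointer held by the lineage cache
    LineageGPUCacheEviction.setGPUContext(gpuContext);  // gpuContext obtained elsewhere
    LineageGPUCacheEviction.removeAllEntries();         // drains the free queues via guardedCudaFree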