This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit f10eb03821dbc30bcec731b2cc93125ec39ea3ff
Author: Mark Dokter <[email protected]>
AuthorDate: Wed Apr 14 23:59:51 2021 +0200

    [MINOR] Reduce memory footprint of reduceAll GPU operation
    
    The temporary buffer needs to hold at most num_blocks (of the first
    reduction wave) items of size <data-type>, not N (size of input).
---
 src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
index dc5a1f0..f6e950a 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
@@ -933,10 +933,9 @@ public class LibMatrixCUDA {
                int[] tmp = getKernelParamsForReduceAll(gCtx, n);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-               Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType); 
+               Pointer tempOut = gCtx.allocate(instName, (long) blocks * sizeOfDataType);
 
                getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
-               //cudaDeviceSynchronize;
                
                int s = blocks;
                while (s > 1) {
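
For context, here is a minimal standalone CUDA sketch (not SystemDS code; the kernel and buffer names are illustrative) of the multi-wave reduction pattern this allocation serves: the first launch collapses the n inputs to exactly one partial value per block, and follow-up launches shrink those partials until one remains, so the scratch buffer never needs more than `blocks` elements. Unlike the patched Java code, which reduces tempOut in place, the sketch ping-pongs between two scratch buffers for clarity.

#include <cstdio>
#include <cuda_runtime.h>

// Block-level sum reduction: each block folds its chunk of `in`
// into a single partial value written to out[blockIdx.x].
__global__ void reduceSum(const double *in, double *out, int n) {
    extern __shared__ double sdata[];
    unsigned tid = threadIdx.x;
    unsigned i = blockIdx.x * blockDim.x * 2 + tid;
    double v = 0.0;
    if (i < n)              v += in[i];               // each thread loads
    if (i + blockDim.x < n) v += in[i + blockDim.x];  // up to two elements
    sdata[tid] = v;
    __syncthreads();
    // tree reduction within the block, in shared memory
    for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    if (tid == 0)
        out[blockIdx.x] = sdata[0];  // exactly one output per block
}

int main() {
    const int n = 1 << 20, threads = 256;
    int blocks = (n + threads * 2 - 1) / (threads * 2);
    size_t shmem = threads * sizeof(double);

    double *in, *bufA, *bufB;
    cudaMalloc(&in, (size_t) n * sizeof(double));
    // The point of the patch: scratch space of `blocks` elements suffices,
    // because the first wave already shrinks n inputs to `blocks` partials.
    cudaMalloc(&bufA, (size_t) blocks * sizeof(double));
    cudaMalloc(&bufB, (size_t) blocks * sizeof(double));
    // ... fill `in` with data ...

    reduceSum<<<blocks, threads, shmem>>>(in, bufA, n);  // first wave
    double *src = bufA, *dst = bufB;
    int s = blocks;
    while (s > 1) {  // follow-up waves, mirroring the `while (s > 1)` loop above
        int b = (s + threads * 2 - 1) / (threads * 2);
        reduceSum<<<b, threads, shmem>>>(src, dst, s);
        double *t = src; src = dst; dst = t;  // ping-pong scratch buffers
        s = b;
    }
    double result;
    cudaMemcpy(&result, src, sizeof(double), cudaMemcpyDeviceToHost);
    printf("sum = %f\n", result);
    cudaFree(in); cudaFree(bufA); cudaFree(bufB);
    return 0;
}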
