This is an automated email from the ASF dual-hosted git repository. niketanpansare pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push: new 70bf610 [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0 70bf610 is described below commit 70bf61093dc3814ccbec867de4e4753cb9f3e086 Author: Niketan Pansare <npan...@us.ibm.com> AuthorDate: Thu Mar 28 22:44:24 2019 -0700 [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0 - Improved the performance of sparse-to-dense conversion of empty matrices. - Added a flag sysml.gpu.force.memSetZero that allows the user to disable forced memset0. - This flag is turned on for now and after exhaustive testing, it will be turned off later by default. --- conf/SystemML-config.xml.template | 3 +++ src/main/java/org/apache/sysml/conf/DMLConfig.java | 4 +++- .../instructions/gpu/context/CSRPointer.java | 3 +++ .../instructions/gpu/context/GPUMemoryManager.java | 20 ++++++++++++++++---- .../instructions/gpu/context/GPUObject.java | 22 ++++++++++++++++++---- 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index 17cc2cc..cd0d311 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -121,4 +121,7 @@ <!-- Should SystemML runtime force the lstm builtin functions to use the CuDNN kernels (default: true) --> <sysml.gpu.lstm.force.cudnn>true</sysml.gpu.lstm.force.cudnn> + + <!-- Should SystemML GPU memory manager force memSet(0) for the allocated arrays (default: true) --> + <sysml.gpu.force.memSetZero>true</sysml.gpu.force.memSetZero> </root> \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index 0b5ed78..e435c77 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -96,6 +96,7 @@ public class DMLConfig public static final String GPU_MEMORY_ALLOCATOR = "sysml.gpu.memory.allocator"; // String to specify the memory allocator to use. Supported values are: cuda, unified_memory public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo"; + public static final String GPU_FORCE_MEMSET_ZERO = "sysml.gpu.force.memSetZero"; public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize"; public static final String GPU_RECOMPUTE_ACTIVATIONS = "sysml.gpu.recompute.activations"; @@ -140,6 +141,7 @@ public class DMLConfig _defaultVals.put(NATIVE_BLAS_DIR, "none" ); _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" ); _defaultVals.put(PRINT_GPU_MEMORY_INFO, "false" ); + _defaultVals.put(GPU_FORCE_MEMSET_ZERO, "true" ); _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE, "0.5" ); _defaultVals.put(STATS_MAX_WRAP_LEN, "30" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); @@ -431,7 +433,7 @@ public class DMLConfig YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM, CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR, COMPRESSED_LINALG, - CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, + CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, GPU_FORCE_MEMSET_ZERO, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE, AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE, GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR, GPU_RECOMPUTE_ACTIVATIONS, FORCE_LSTM_CUDNN diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index b3ec497..d7bd295 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -303,6 +303,9 @@ public class CSRPointer { r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2)); r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1)); r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2)); + GPUMemoryManager.postAllocateMemset0(r.val, getDataTypeSizeOf(nnz2), null); + GPUMemoryManager.postAllocateMemset0(r.rowPtr, getIntSizeOf(rows + 1), null); + GPUMemoryManager.postAllocateMemset0(r.colInd, getIntSizeOf(nnz2), null); return r; } diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index cf579ec..d15b953 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -57,6 +57,7 @@ public class GPUMemoryManager { private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO); + public static boolean GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO); protected final GPUMemoryAllocator allocator; /*****************************************************************************************/ @@ -141,6 +142,7 @@ public class GPUMemoryManager { private static final double WARN_UTILIZATION_FACTOR = 0.7; public GPUMemoryManager(GPUContext gpuCtx) { + GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO); matrixMemoryManager = new GPUMatrixMemoryManager(this); lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this); String allocatorType = ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.GPU_MEMORY_ALLOCATOR); @@ -361,12 +363,22 @@ public class GPUMemoryManager { + toString()); } - long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; - cudaMemset(A, 0, size); - addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0); + if(GPU_FORCE_MEMSET_ZERO) { + long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; + cudaMemset(A, 0, size); + addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0); + } return A; } + public static void postAllocateMemset0(Pointer A, long size, String opcode) { + if(!GPU_FORCE_MEMSET_ZERO) { + long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; + cudaMemset(A, 0, size); + addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0); + } + } + private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) { long ret = o1.getWorstCaseContiguousMemorySize() - o2.getWorstCaseContiguousMemorySize(); return ret < 0 ? -1 : (ret == 0 ? 0 : 1); @@ -553,7 +565,7 @@ public class GPUMemoryManager { * @param instructionLevelTimer member of GPUInstruction * @param startTime start time */ - private void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) { + private static void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) { if(ConfigurationManager.isStatistics()) { long totalTime = System.nanoTime() - startTime; globalGPUTimer.add(totalTime); diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 9d263aa..254c9d7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -434,9 +434,15 @@ public class GPUObject { start = System.nanoTime(); if (getJcudaSparseMatrixPtr() == null || !isAllocated()) throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); - - sparseToColumnMajorDense(); - denseColumnMajorToRowMajor(); + if(getJcudaSparseMatrixPtr().nnz == 0) { + long size = ((long) mat.getNumRows()) * getDataTypeSizeOf(mat.getNumColumns()); + setDensePointer(allocate(size)); + GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, instructionName); + } + else { + sparseToColumnMajorDense(); + denseColumnMajorToRowMajor(); + } if (ConfigurationManager.isStatistics()) end = System.nanoTime(); if (instructionName != null && ConfigurationManager.isFinegrainedStatistics()) @@ -446,6 +452,10 @@ public class GPUObject { if (ConfigurationManager.isStatistics()) GPUStatistics.cudaSparseToDenseCount.add(1); } + + private static long getDataTypeSizeOf(long numElems) { + return numElems * ((long) LibMatrixCUDA.sizeOfDataType); + } /** * More efficient method to convert sparse to dense but returns dense in column major format @@ -521,10 +531,14 @@ public class GPUObject { setDensePointer(allocate(size)); // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v" // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0 - if (v != 0) + if (v != 0) { getGPUContext().getKernels() .launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), getDensePointer(), v, numElems); + } + else { + GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, null); + } } /**