Repository: systemml Updated Branches: refs/heads/master e624d149f -> 4cf95c92e
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index 3912403..767fead 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -170,8 +170,10 @@ public class DMLScript * case (TRUE/FALSE for DML and True/False for PYDML). */ public static ScriptType SCRIPT_TYPE = DMLOptions.defaultOptions.scriptType; + public static boolean USE_ACCELERATOR = DMLOptions.defaultOptions.gpu; public static boolean FORCE_ACCELERATOR = DMLOptions.defaultOptions.forceGPU; + public static boolean SYNCHRONIZE_GPU = true; // whether to synchronize GPU after every instruction public static boolean _suppressPrint2Stdout = false; // flag that indicates whether or not to suppress any prints to stdout http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index 09897a5..60c84aa 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -78,6 +78,7 @@ public class ScriptExecutorUtils { GPUStatistics.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS); LibMatrixDNN.DISPLAY_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS); DMLScript.FINEGRAINED_STATISTICS = dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); + DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU); DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN); boolean exceptionThrown 
= false; http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index bfb3850..e5999fe 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -83,7 +83,8 @@ public class DMLConfig public static final String EXTRA_GPU_STATS = "systemml.stats.extraGPU"; //boolean public static final String EXTRA_DNN_STATS = "systemml.stats.extraDNN"; //boolean public static final String AVAILABLE_GPUS = "systemml.gpu.availableGPUs"; // String to specify which GPUs to use (a range, all GPUs, comma separated list or a specific GPU) - + public static final String SYNCHRONIZE_GPU = "systemml.gpu.sync.postProcess"; // boolean: whether to synchronize GPUs after every instruction + // Fraction of available memory to use. The available memory is computed when the GPUContext is created + // to handle the tradeoff on calling cudaMemGetInfo too often. 
public static final String GPU_MEMORY_UTILIZATION_FACTOR = "gpu.memory.util.factor"; @@ -131,6 +132,7 @@ public class DMLConfig _defaultVals.put(EXTRA_DNN_STATS, "false" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(AVAILABLE_GPUS, "-1"); + _defaultVals.put(SYNCHRONIZE_GPU, "false" ); } public DMLConfig() @@ -413,7 +415,7 @@ public class DMLConfig COMPRESSED_LINALG, CODEGEN, CODEGEN_COMPILER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, EXTRA_GPU_STATS, EXTRA_DNN_STATS, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, - AVAILABLE_GPUS + AVAILABLE_GPUS, SYNCHRONIZE_GPU }; StringBuilder sb = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index 59d2589..b74c0dd 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -378,30 +378,22 @@ public class ExecutionContext { return mo; } - public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName) + public MatrixObject getMatrixInputForGPUInstruction(String varName, String opcode) throws DMLRuntimeException { GPUContext gCtx = getGPUContext(0); - boolean copied = false; MatrixObject mo = getMatrixObject(varName); if(mo == null) { throw new DMLRuntimeException("No matrix object available for variable:" + varName); } - boolean acquired = false; if( mo.getGPUObject(gCtx) == null ) { GPUObject newGObj = gCtx.createGPUObject(mo); mo.setGPUObject(gCtx, newGObj); - } else if( !mo.getGPUObject(gCtx).isInputAllocated() ) { - mo.acquireRead(); - acquired = true; } - - copied = 
mo.getGPUObject(gCtx).acquireDeviceRead(); - if(acquired) { - mo.release(); - } - return new Pair<MatrixObject, Boolean>(mo, copied); + // No need to perform acquireRead here because it is performed in copyFromHostToDevice + mo.getGPUObject(gCtx).acquireDeviceRead(opcode); + return mo; } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java index b01c2c5..3294384 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java @@ -52,6 +52,7 @@ public abstract class CPInstruction extends Instruction public final static String MISC_TIMER_RELEASE_BUFF_WRITE = "rlswr";// time spent in buffer write in release operation public final static String MISC_TIMER_SPARSE_TO_DENSE = "s2d"; // time spent in sparse to dense conversion public final static String MISC_TIMER_DENSE_TO_SPARSE = "d2s"; // time spent in dense to sparse conversion + public final static String MISC_TIMER_RECOMPUTE_NNZ = "rnnz"; // time spent in recompute non-zeroes // Instruction specific miscellaneous timers that were found as potential bottlenecks in one of performance analysis. // SystemML committers have to be judicious about adding them by weighing the tradeoffs between reuse in future analysis and unnecessary overheads. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index f27084f..9a6a3bb 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -19,6 +19,7 @@ package org.apache.sysml.runtime.instructions.gpu; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.lops.runtime.RunMRJobs; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; @@ -123,7 +124,7 @@ public abstract class GPUInstruction extends Instruction { public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB = "nncbd"; // time spent in cudnnConvolutionBackwardData public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB = "nnmf"; // time spent in cudnnPoolingForward public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB = "nnmb"; // time spent in cudnnPoolingBackward - public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add cuda kernel + public final static String MISC_TIMER_BIAS_ADD_LIB = "nnba"; // time spent in bias_add, bias_multiply cuda kernel public final static String MISC_TIMER_RELU_BACKWARD_KERNEL= "nnrbk"; // time spent in relu_backward cuda kernel public final static String MISC_TIMER_RELU_KERNEL = "nnrk"; // time spent in the relu kernel public final static String MISC_TIMER_CUDNN_INIT = "nni"; // time spent in initializations for cudnn call @@ -187,7 +188,9 @@ public abstract class GPUInstruction extends Instruction { public void postprocessInstruction(ExecutionContext ec) throws DMLRuntimeException { - //JCuda.cudaDeviceSynchronize(); + 
if(DMLScript.SYNCHRONIZE_GPU) { + jcuda.runtime.JCuda.cudaDeviceSynchronize(); + } } /** @@ -199,10 +202,7 @@ public abstract class GPUInstruction extends Instruction { * @throws DMLRuntimeException if an error occurs */ protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext ec, String name) throws DMLRuntimeException { - long t0 = System.nanoTime(); - Pair<MatrixObject, Boolean> mb = ec.getMatrixInputForGPUInstruction(name); - if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0); - return mb.getKey(); + return ec.getMatrixInputForGPUInstruction(name, getExtendedOpcode()); } /** @@ -216,9 +216,9 @@ public abstract class GPUInstruction extends Instruction { * @throws DMLRuntimeException if an error occurs */ protected MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long numRows, long numCols) throws DMLRuntimeException { - long t0 = System.nanoTime(); + long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols); - if (mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); + if (GPUStatistics.DISPLAY_STATISTICS && mb.getValue()) GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index 9379534..7bb8b07 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -294,7 +294,8 @@ public class CSRPointer { */ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException { LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx); - assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"; + if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"); + if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of internal API, number of rows is less than or equal to 0 when trying to allocate sparse data on GPU"); CSRPointer r = new CSRPointer(gCtx); r.nnz = nnz2; if (nnz2 == 0) { 
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 197daaf..a31deab 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -265,8 +265,10 @@ public class GPUContext { long t0 = 0, t1 = 0, end = 0; Pointer A; if (freeCUDASpaceMap.containsKey(size)) { - LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : in allocate from instruction " + instructionName + ", found free block of size " + (size / 1024.0) + " Kbytes from previously allocated block on " + this); + } if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); @@ -277,9 +279,11 @@ public class GPUContext { GPUStatistics .maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); } else { - LOG.trace( + if(LOG.isTraceEnabled()) { + LOG.trace( "GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size / 1024.0) + " Kbytes on " + this); + } if (DMLScript.STATISTICS) t0 = System.nanoTime(); ensureFreeSpace(instructionName, size); @@ -296,8 +300,10 @@ public class GPUContext { // Set all elements to 0 since newly allocated space will contain garbage if (DMLScript.STATISTICS) t1 = System.nanoTime(); - LOG.trace("GPU : in allocate from instruction " + instructionName + ", setting block of size " + (size / 1024.0) + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : in allocate from instruction " 
+ instructionName + ", setting block of size " + (size / 1024.0) + " Kbytes to zero on " + this); + } cudaMemset(A, 0, size); if (DMLScript.STATISTICS) end = System.nanoTime(); @@ -353,34 +359,38 @@ public class GPUContext { if (toFree == dummy) // trying to free a null pointer return; long t0 = 0; - assert cudaBlockSizeMap.containsKey( - toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; - long size = cudaBlockSizeMap.get(toFree); - if (eager) { - LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " - + this); - if (DMLScript.STATISTICS) - t0 = System.nanoTime(); - cudaFree(toFree); - cudaBlockSizeMap.remove(toFree); - if (DMLScript.STATISTICS) - GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0); - if (DMLScript.STATISTICS) - GPUStatistics.cudaDeAllocCount.add(1); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, - System.nanoTime() - t0); - } else { - LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); - LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); - if (freeList == null) { - freeList = new LinkedList<Pointer>(); - freeCUDASpaceMap.put(size, freeList); - } - if (freeList.contains(toFree)) - throw new RuntimeException("GPU : Internal state corrupted, double free"); - freeList.add(toFree); - } + if(!cudaBlockSizeMap.containsKey(toFree)) + throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"); + long size = cudaBlockSizeMap.get(toFree); + if (eager) { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + + this); + } + if (DMLScript.STATISTICS) + t0 = System.nanoTime(); + cudaFree(toFree); + 
cudaBlockSizeMap.remove(toFree); + if (DMLScript.STATISTICS) + GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0); + if (DMLScript.STATISTICS) + GPUStatistics.cudaDeAllocCount.add(1); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, + System.nanoTime() - t0); + } else { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); + } + LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); + if (freeList == null) { + freeList = new LinkedList<Pointer>(); + freeCUDASpaceMap.put(size, freeList); + } + if (freeList.contains(toFree)) + throw new RuntimeException("GPU : Internal state corrupted, double free"); + freeList.add(toFree); + } } /** @@ -401,9 +411,10 @@ public class GPUContext { * @throws DMLRuntimeException if DMLRuntimeException occurs */ void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException { - if (size >= getAvailableMemory()) { + if(size < 0 ) + throw new DMLRuntimeException("The size cannot be negative:" + size); + else if (size >= getAvailableMemory()) evict(instructionName, size); - } } /** @@ -431,7 +442,9 @@ public class GPUContext { * @throws DMLRuntimeException If no reusable memory blocks to free up or if not enough matrix blocks with zero locks on them. 
*/ protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException { - LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this); + } GPUStatistics.cudaEvictionCount.add(1); // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap // to free up space @@ -499,7 +512,7 @@ public class GPUContext { GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); if (toBeRemoved.locks.get() > 0) { throw new DMLRuntimeException( - "There is not enough memory on device for this matrix, request (" + neededSize + ")"); + "There is not enough memory on device for this matrix, request (" + neededSize + "). Allocated GPU objects:" + allocatedGPUObjects.toString()); } if (toBeRemoved.dirty) { toBeRemoved.copyFromDeviceToHost(); @@ -702,7 +715,9 @@ public class GPUContext { * @throws DMLRuntimeException if error */ public void destroy() throws DMLRuntimeException { - LOG.trace("GPU : this context was destroyed, this = " + this.toString()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : this context was destroyed, this = " + this.toString()); + } clearMemory(); cudnnDestroy(cudnnHandle.get()); cublasDestroy(cublasHandle.get()); http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 1bed42a..2642011 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -27,18 +27,21 @@ import static 
jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW; import static jcuda.jcusparse.JCusparse.cusparseDdense2csr; import static jcuda.jcusparse.JCusparse.cusparseDnnz; import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.JCuda.cudaMemset; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.instructions.cp.CPInstruction; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.data.MatrixBlock; @@ -189,7 +192,9 @@ public class GPUObject { */ public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException { - LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx); + } Pointer alpha = Pointer.to(new double[] { 1.0 }); Pointer beta = Pointer.to(new double[] { 0.0 }); Pointer A = densePtr; @@ -244,9 +249,10 @@ public class GPUObject { throw new DMLRuntimeException( "cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU"); } - - LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : col-major dense size[" + 
rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] + ", GPUContext=" + gCtx); + } CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows); cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, @@ -338,7 +344,9 @@ public class GPUObject { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public void denseToSparse() throws DMLRuntimeException { - LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext()); + } long t0 = 0; if (DMLScript.STATISTICS) t0 = System.nanoTime(); @@ -368,7 +376,9 @@ public class GPUObject { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public void denseRowMajorToColumnMajor() throws DMLRuntimeException { - LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); + } int m = toIntExact(mat.getNumRows()); int n = toIntExact(mat.getNumColumns()); int lda = n; @@ -389,7 +399,9 @@ public class GPUObject { * @throws DMLRuntimeException if error */ public void denseColumnMajorToRowMajor() throws DMLRuntimeException { - LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext()); + } int n = toIntExact(mat.getNumRows()); int m = toIntExact(mat.getNumColumns()); @@ -422,7 +434,9 @@ public class GPUObject { * @throws DMLRuntimeException ? 
*/ public void sparseToDense(String instructionName) throws DMLRuntimeException { - LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext()); + } long start = 0, end = 0; if (DMLScript.STATISTICS) start = System.nanoTime(); @@ -447,7 +461,9 @@ public class GPUObject { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public void sparseToColumnMajorDense() throws DMLRuntimeException { - LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext()); + } if (getJcudaSparseMatrixPtr() == null || !isAllocated()) throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); @@ -501,7 +517,9 @@ public class GPUObject { * @return cudnn tensor descriptor */ public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) { - LOG.trace("GPU : allocateTensorDescriptor with [N=" + N + ",C=" + C + ",H=" + H + ",W=" + W + "] on " + this); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : allocateTensorDescriptor with [N=" + N + ",C=" + C + ",H=" + H + ",W=" + W + "] on " + this); + } if (tensorDescriptor == null) { tensorDescriptor = new cudnnTensorDescriptor(); cudnnCreateTensorDescriptor(tensorDescriptor); @@ -544,7 +562,9 @@ public class GPUObject { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public void allocateSparseAndEmpty() throws DMLRuntimeException { - LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext()); + } setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows())); } @@ -556,7 +576,9 @@ public 
class GPUObject { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public void allocateAndFillDense(double v) throws DMLRuntimeException { - LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext()); + } long rows = mat.getNumRows(); long cols = mat.getNumColumns(); int numElems = toIntExact(rows * cols); @@ -583,14 +605,17 @@ public class GPUObject { return isEmptyAndSparseAndAllocated; } - public boolean acquireDeviceRead() throws DMLRuntimeException { - LOG.trace("GPU : acquireDeviceRead on " + this); + public boolean acquireDeviceRead(String opcode) throws DMLRuntimeException { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : acquireDeviceRead on " + this); + } boolean transferred = false; if (!isAllocated()) { - LOG.trace( - "GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext()); - copyFromHostToDevice(); + } + copyFromHostToDevice(opcode); transferred = true; } addLock(); @@ -600,11 +625,15 @@ public class GPUObject { } public boolean acquireDeviceModifyDense() throws DMLRuntimeException { - LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext()); + } boolean allocated = false; if (!isAllocated()) { mat.setDirty(true); - LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this); + } // Dense block, size = numRows * numCols allocateDenseMatrixOnDevice(); allocated = 
true; @@ -616,11 +645,15 @@ public class GPUObject { } public boolean acquireDeviceModifySparse() throws DMLRuntimeException { - LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); + } boolean allocated = false; isSparse = true; if (!isAllocated()) { - LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this); + } mat.setDirty(true); allocateSparseMatrixOnDevice(); allocated = true; @@ -644,10 +677,14 @@ public class GPUObject { public boolean acquireHostRead() throws CacheException { boolean copied = false; try { - LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext()); + } if (isAllocated() && dirty) { - LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + getGPUContext()); + } copyFromDeviceToHost(); copied = true; } @@ -668,8 +705,10 @@ public class GPUObject { throw new CacheException("Internal state error : Invalid number of locks on a GPUObject"); } - LOG.trace("GPU : updateReleaseLocks, new number of locks is " + newLocks + ", on " + this + ", GPUContext=" + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : updateReleaseLocks, new number of locks is " + newLocks + ", on " + this + ", GPUContext=" + getGPUContext()); + } GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy; switch (evictionPolicy) { case LRU: @@ -711,30 +750,42 @@ public class GPUObject { } void allocateDenseMatrixOnDevice() throws DMLRuntimeException { - 
LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); - assert !isAllocated() : "Internal error - trying to allocated dense matrix to a GPUObject that is already allocated"; + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + } + if(isAllocated()) + throw new DMLRuntimeException("Internal error - trying to allocated dense matrix to a GPUObject that is already allocated"); long rows = mat.getNumRows(); long cols = mat.getNumColumns(); - assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix"; - assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix;"; + if(rows <= 0) + throw new DMLRuntimeException("Internal error - invalid number of rows when allocating dense matrix"); + if(cols <= 0) + throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix;"); long size = getDoubleSizeOf(rows * cols); Pointer tmp = allocate(size); setDenseMatrixCudaPointer(tmp); } void allocateSparseMatrixOnDevice() throws DMLRuntimeException { - LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); - assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a GPUObject that is already allocated"; + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + } + if(isAllocated()) + throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated"); long rows = mat.getNumRows(); long nnz = mat.getNnz(); - assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix"; - assert nnz >= 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix"; + if(rows <= 0) + throw new DMLRuntimeException("Internal error - invalid number of 
rows when allocating sparse matrix"); + if(nnz < 0) + throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix"); CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows); setSparseMatrixCudaPointer(tmp); } void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException { - LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext()); + } if (getJcudaDenseMatrixPtr() != null) { cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager); } @@ -765,20 +816,32 @@ public class GPUObject { return GPUSize; } - void copyFromHostToDevice() throws DMLRuntimeException { - LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext()); + void copyFromHostToDevice(String opcode) throws DMLRuntimeException { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + getGPUContext()); + } long start = 0; if (DMLScript.STATISTICS) start = System.nanoTime(); + long acqrTime = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; MatrixBlock tmp = mat.acquireRead(); + if(GPUStatistics.DISPLAY_STATISTICS) { + if(tmp.isInSparseFormat()) + GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_SPARSE_MB, System.nanoTime()-acqrTime); + else + GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_GET_DENSE_MB, System.nanoTime()-acqrTime); + } + if (tmp.isInSparseFormat()) { - int rowPtr[] = null; int colInd[] = null; double[] values = null; - - tmp.recomputeNonZeros(); + + // Only recompute non-zero if unknown, else this will incur huge penalty !! 
+ if(tmp.getNonZeros() < 0) { + tmp.recomputeNonZeros(opcode); + } long nnz = tmp.getNonZeros(); mat.getMatrixCharacteristics().setNonZeros(nnz); @@ -831,8 +894,11 @@ public class GPUObject { allocateSparseMatrixOnDevice(); if (copyToDevice) { + long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values); + if(GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1); } } else { double[] data = tmp.getDenseBlock(); @@ -841,14 +907,26 @@ public class GPUObject { throw new DMLRuntimeException("Incorrect sparsity calculation"); else if (data == null && tmp.getNonZeros() != 0) throw new DMLRuntimeException("MatrixBlock is not allocated"); - else if (tmp.getNonZeros() == 0) - data = new double[tmp.getNumRows() * tmp.getNumColumns()]; - - // Copy dense block + allocateDenseMatrixOnDevice(); - - cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), - getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice); + + if (tmp.getNonZeros() == 0) { + // Minor optimization: No need to allocate empty error for CPU + // data = new double[tmp.getNumRows() * tmp.getNumColumns()]; + long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + cudaMemset(getJcudaDenseMatrixPtr(), 0, getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns())); + if(GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1); + } + else { + // Copy dense block + // H2D now only measures the time taken to do + long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), + getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice); + if(GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1); + } } mat.release(); @@ -867,7 +945,9 @@ public class GPUObject { } protected void copyFromDeviceToHost() throws DMLRuntimeException { - LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); + } if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null) { throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated"); } http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index a362364..5e2136f 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -264,7 +264,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException { - LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx); + } long rows = input.getNumRows(); long cols = input.getNumColumns(); Pointer imagePointer = getDensePointer(gCtx, input, instName); @@ -276,7 +278,7 @@ public class LibMatrixCUDA { 
getCudaKernels(gCtx).launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols)); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); } @@ -293,7 +295,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { - LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx); + } if(isInSparseFormat(gCtx, input)) { input.getGPUObject(gCtx).sparseToDense(instName); } @@ -315,7 +319,7 @@ public class LibMatrixCUDA { getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ)); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -360,14 +364,16 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException */ private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException { - LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx); + } int PQ = cols / k; long 
t1 = 0; if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_add", ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), image, bias, output, rows, cols, PQ); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -397,7 +403,9 @@ public class LibMatrixCUDA { */ public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, boolean isLeftTransposed) throws DMLRuntimeException { - LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx); + } if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); if(isInSparseFormat(gCtx, left)) { @@ -452,7 +460,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException { - LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx); + } if(isInSparseFormat(gCtx, ret)) { throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented"); } @@ -501,7 +511,9 @@ public class LibMatrixCUDA { boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA 
function"); - LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx); + } if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated()) throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated()); @@ -591,7 +603,9 @@ public class LibMatrixCUDA { CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); Pointer ADense = getDensePointer(gCtx, left, instName); if (B.isUltraSparse(k, n)){ - LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx); + } // Convert left to CSR and do cuSparse matmul int rowsA = (int)left.getNumRows(); @@ -616,7 +630,9 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); } else { - LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx); + } // Convert right to dense and do a cuBlas matmul // BDenseTransposed is a column major matrix // Note the arguments to denseDenseMatmult to accommodate for this. 
@@ -673,7 +689,9 @@ public class LibMatrixCUDA { long t0=0, t1=0, t2=0; // Sparse Matrix Dense Matrix multiply if (A.isUltraSparse(m, k)){ - LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx); + } // Convert right to CSR and do cuSparse matmul int rowsB = (int)right.getNumRows(); int colsB = (int)right.getNumColumns(); @@ -697,7 +715,9 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); } else { - LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx); + } // Convert left to dense and do a cuBlas matmul // ADenseTransposed is a column major matrix // Note the arguments to denseDenseMatmult to accommodate for this. 
@@ -739,7 +759,9 @@ public class LibMatrixCUDA { */ private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, int m, int k) throws DMLRuntimeException { - LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx); + } int transA = CUSPARSE_OPERATION_NON_TRANSPOSE; long size = m * Sizeof.DOUBLE; if (isATranposed){ @@ -836,7 +858,9 @@ public class LibMatrixCUDA { */ private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output, boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException { - LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx); + } int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; int transB = isRightTransposed ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; @@ -910,7 +934,9 @@ public class LibMatrixCUDA { public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException { - LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx); + } Pointer A = rightPtr; Pointer B = leftPtr; @@ -1004,7 +1030,9 @@ public class LibMatrixCUDA { throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx); + } final int REDUCTION_ALL = 1; final int REDUCTION_ROW = 2; final int REDUCTION_COL = 3; @@ -1044,7 +1072,8 @@ public class LibMatrixCUDA { } else { throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations"); } - assert reductionDirection !=-1 : "Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction"; + if(reductionDirection == -1) + throw new DMLRuntimeException("Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction"); // Convert function type to a number int opIndex = -1; @@ -1055,7 +1084,8 @@ public class LibMatrixCUDA { } else if (aggOp.increOp.fn instanceof Mean) { opIndex = OP_MEAN; } else if (aggOp.increOp.fn instanceof CM) { - assert ((CM)aggOp.increOp.fn).getAggOpType() == CMOperator.AggregateOperationTypes.VARIANCE : "Internal Error - Invalid Type of CM operator 
for Aggregate Unary operation on GPU"; + if(((CM)aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE) + throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU"); opIndex = OP_VARIANCE; } else if (aggOp.increOp.fn instanceof Plus) { opIndex = OP_PLUS; @@ -1074,8 +1104,8 @@ public class LibMatrixCUDA { } else { throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function"); } - assert opIndex != -1 : "Internal Error - Incorrect type of operation set for aggregate unary GPU instruction"; - + if(opIndex == -1) + throw new DMLRuntimeException("Internal Error - Incorrect type of operation set for aggregate unary GPU instruction"); int rlen = (int)in1.getNumRows(); int clen = (int)in1.getNumColumns(); @@ -1345,7 +1375,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException { - LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx); + } int[] tmp = getKernelParamsForReduceAll(gCtx, n); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; @@ -1391,7 +1423,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { - LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx); + } int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; @@ -1417,7 +1451,9 @@ public class 
LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { - LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx); + } int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; @@ -1521,7 +1557,9 @@ public class LibMatrixCUDA { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); double constant = op.getConstant(); - LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + ", GPUContext=" + gCtx); + } Pointer A, C; if (isSparseAndEmpty(gCtx, in)) { @@ -1556,7 +1594,9 @@ public class LibMatrixCUDA { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); double constant = op.getConstant(); - LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx); + } int outRLen = isInputTransposed ? (int) in.getNumColumns() : (int) in.getNumRows(); int outCLen = isInputTransposed ? 
(int) in.getNumRows() : (int) in.getNumColumns(); @@ -1743,7 +1783,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException throws runtime exception */ private static void matrixScalarOp(GPUContext gCtx, String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException { - LOG.trace("GPU : matrix_scalar_op" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : matrix_scalar_op" + ", GPUContext=" + gCtx); + } int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0; int size = rlenA * clenA; long t0=0; @@ -1847,7 +1889,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException */ private static void matrixMatrixOp(GPUContext gCtx, String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) throws DMLRuntimeException { - LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + gCtx); + } long t0=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op", @@ -2025,7 +2069,9 @@ public class LibMatrixCUDA { boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx); + } Pointer alphaPtr = pointerTo(alpha); Pointer betaPtr = pointerTo(beta); @@ -2189,7 +2235,9 @@ public class LibMatrixCUDA { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException( "GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this 
LibMatrixCUDA function"); - LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx); + } int rl = (int) ixrange.rowStart; int ru = (int) ixrange.rowEnd; @@ -2233,21 +2281,20 @@ public class LibMatrixCUDA { * @param ru row upper * @param cl column lower * @param cu column upper - * @param len1 input number of columns - * @param len2 output number of columns - * @throws DMLRuntimeException + * @param inClen input number of columns + * @param retClen output number of columns + * @throws DMLRuntimeException if error occurs */ protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer, - int rl, int ru, int cl, int cu, int len1, int len2) throws DMLRuntimeException { + int rl, int ru, int cl, int cu, int inClen, int retClen) throws DMLRuntimeException { long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; - if (len1 == len2) { - cudaMemcpy(outPointer, inPointer.withByteOffset(rl * len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1 + if (inClen == retClen) { + cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * Sizeof.DOUBLE), (ru - rl + 1) * inClen * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); } else { - for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= ru; i++, ix1 += len1, ix2 += len2) { - cudaMemcpy(outPointer.withByteOffset(ix2 * Sizeof.DOUBLE), - inPointer.withByteOffset(ix1 * Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); - } + int size = ru - rl + 1; + getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size), + inPointer, outPointer, rl, ru, cl, cu, inClen, retClen); } if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0); } @@ -2263,23 +2310,26 @@ public class LibMatrixCUDA { * @param ru row upper * @param cl column lower * @param cu 
column upper - * @throws DMLRuntimeException + * @throws DMLRuntimeException if error */ protected static void sliceSparseDense(GPUContext gCtx, String instName, CSRPointer inPointer, Pointer outPointer, int rl, int ru, int cl, int cu) throws DMLRuntimeException { int size = ru - rl + 1; long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + int retClen = cu - cl + 1; // Performs a slice operation where the input matrix is sparse and the output matrix is dense. // This function avoids unnecessary sparse to dense conversion of the input matrix. // We can generalize this later to output sparse matrix. getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size), - inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu); + inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu, retClen); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0); } public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx); + } long t1 = 0; @@ -2312,7 +2362,9 @@ public class LibMatrixCUDA { public static void rbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - 
LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx); + } long t1 = 0; @@ -2362,7 +2414,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void exp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : exp" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : exp" + ", GPUContext=" + gCtx); + } // e^0 = 1, create a dense block full of 1s unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, GPUInstruction.MISC_TIMER_EXP_KERNEL); } @@ -2377,7 +2431,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void sqrt(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx); + } // sqrt(0) = 0, create a dense block full of 0s unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, GPUInstruction.MISC_TIMER_SQRT_KERNEL); } @@ -2392,7 +2448,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void round(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : round" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : round" + ", GPUContext=" + gCtx); + } // round(0) = 0, create a dense block full of 0s unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, GPUInstruction.MISC_TIMER_ROUND_KERNEL); } @@ -2407,7 +2465,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void abs(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject 
in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : abs" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : abs" + ", GPUContext=" + gCtx); + } // abs(0) = 0, create a dense block full of 0s unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, GPUInstruction.MISC_TIMER_ABS_KERNEL); } @@ -2422,7 +2482,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void log(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : log" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : log" + ", GPUContext=" + gCtx); + } // log(0) = -Inf unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL); } @@ -2437,7 +2499,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void floor(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : floor" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : floor" + ", GPUContext=" + gCtx); + } // floor(0) = 0 unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, GPUInstruction.MISC_TIMER_FLOOR_KERNEL); } @@ -2452,7 +2516,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void ceil(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx); + } // ceil(0) = 0 unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, GPUInstruction.MISC_TIMER_CEIL_KERNEL); } @@ -2467,7 +2533,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if 
DMLRuntimeException occurs */ public static void sin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : sin" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sin" + ", GPUContext=" + gCtx); + } // sin(0) = 0 unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIN_KERNEL); } @@ -2482,7 +2550,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void cos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : cos" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : cos" + ", GPUContext=" + gCtx); + } // cos(0) = 1 unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, GPUInstruction.MISC_TIMER_COS_KERNEL); } @@ -2497,7 +2567,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void tan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : tan" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : tan" + ", GPUContext=" + gCtx); + } // tan(0) = 0 unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, GPUInstruction.MISC_TIMER_TAN_KERNEL); } @@ -2512,7 +2584,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void asin(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : asin" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : asin" + ", GPUContext=" + gCtx); + } // asin(0) = 0 unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, GPUInstruction.MISC_TIMER_ASIN_KERNEL); } @@ -2527,7 +2601,9 @@ public class 
LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void acos(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : acos" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : acos" + ", GPUContext=" + gCtx); + } // acos(0) = PI/2 unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL); } @@ -2542,7 +2618,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void atan(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : atan" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : atan" + ", GPUContext=" + gCtx); + } // atan(0) = 0 unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, GPUInstruction.MISC_TIMER_ATAN_KERNEL); } @@ -2557,7 +2635,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void sign(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException { - LOG.trace("GPU : sign" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : sign" + ", GPUContext=" + gCtx); + } // sign(0) = 0 unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, GPUInstruction.MISC_TIMER_SIGN_KERNEL); } @@ -2622,7 +2702,9 @@ public class LibMatrixCUDA { long t1=0, t2=0; if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) { - LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx); + } // Matrix-Matrix daxpy long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix @@ -2640,7 +2722,9 @@ public class LibMatrixCUDA { if 
(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2); } else { - LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx); + } // Matrix-Vector daxpy // Note: Vector-Matrix operation is not supported @@ -2671,7 +2755,9 @@ public class LibMatrixCUDA { throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); // x = solve(A, b) - LOG.trace("GPU : solve" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : solve" + ", GPUContext=" + gCtx); + } long t0 = -1; http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java index bf5f25b..9a39b1c 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java @@ -244,7 +244,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { private static void cudnnConv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx); + } cudnnFilterDescriptor filterDesc = null; cudnnConvolutionDescriptor convDesc = null; Pointer workSpace = null; @@ -411,7 +413,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { Pointer dwPointer, int N, 
int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx); + } cudnnFilterDescriptor dwDesc = null; cudnnConvolutionDescriptor convDesc = null; @@ -551,7 +555,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { Pointer dx, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx); + } cudnnFilterDescriptor wDesc = null; cudnnConvolutionDescriptor convDesc = null; @@ -708,7 +714,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { Pointer y, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx); + } cudnnPoolingDescriptor poolingDesc = null; @@ -803,7 +811,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx); + } Pointer y = null; cudnnPoolingDescriptor poolingDesc = null; @@ -938,7 +948,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
MatrixObject ret, double epsilon) throws DMLRuntimeException { - LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx); + } int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; int N = toInt(image.getNumRows()); @@ -984,7 +996,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException { - LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx); + } int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; int N = toInt(image.getNumRows()); @@ -1104,7 +1118,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias, double epsilon) throws DMLRuntimeException { - LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx); + } int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; int N = toInt(image.getNumRows()); @@ -1134,7 +1150,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { private static void cudnnReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException { long t0=0; try { - LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + 
LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx); + } cudnnTensorDescriptor dstTensorDesc = srcTensorDesc; Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); @@ -1176,7 +1194,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { long t0=0; cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor(); if(N*CHW >= maxNumDoublesOfCuDNNTensor || srcTensorDesc == null) { - LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx); + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx); + } // Invokes relu(double* A, double* ret, int rlen, int clen) if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName); @@ -1193,8 +1213,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { /** * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously. + * * @param gCtx a valid {@link GPUContext} * @param image input matrix object + * @param instName name of the instruction * @return jcuda pointer * @throws DMLRuntimeException if error occurs while sparse to dense conversion */ http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index d041fc4..ef32481 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -1176,6 +1176,11 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab * @return number of non-zeros */ public long recomputeNonZeros() { + return recomputeNonZeros(null); + } + + public long recomputeNonZeros(String opcode) { + 
long t1 = opcode != null && DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; if( sparse && sparseBlock!=null ) { //SPARSE (max long) //note: rlen might be <= sparseBlock.numRows() nonZeros = sparseBlock.size(0, sparseBlock.numRows()); @@ -1188,6 +1193,10 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab nnz += (a[i]!=0) ? 1 : 0; nonZeros = nnz; } + if(opcode != null && DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS) { + long t2 = System.nanoTime(); + GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_RECOMPUTE_NNZ, t2-t1); + } return nonZeros; }