Repository: systemml
Updated Branches:
  refs/heads/master e624d149f -> 4cf95c92e


http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index 3912403..767fead 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -170,8 +170,10 @@ public class DMLScript
         * case (TRUE/FALSE for DML and True/False for PYDML).
         */
        public static ScriptType        SCRIPT_TYPE         = 
DMLOptions.defaultOptions.scriptType;
+       
        public static boolean           USE_ACCELERATOR     = 
DMLOptions.defaultOptions.gpu;
        public static boolean           FORCE_ACCELERATOR   = 
DMLOptions.defaultOptions.forceGPU;
+       public static boolean           SYNCHRONIZE_GPU         = true;         
                                                                // whether to 
synchronize GPU after every instruction
 
 
        public static boolean _suppressPrint2Stdout = false;  // flag that 
indicates whether or not to suppress any prints to stdout

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 09897a5..60c84aa 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -78,6 +78,7 @@ public class ScriptExecutorUtils {
                GPUStatistics.DISPLAY_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS);
                LibMatrixDNN.DISPLAY_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS);
                DMLScript.FINEGRAINED_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
+               DMLScript.SYNCHRONIZE_GPU = 
dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
                DMLScript.STATISTICS_MAX_WRAP_LEN = 
dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);
 
                boolean exceptionThrown = false;

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index bfb3850..e5999fe 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -83,7 +83,8 @@ public class DMLConfig
        public static final String EXTRA_GPU_STATS      = 
"systemml.stats.extraGPU"; //boolean
        public static final String EXTRA_DNN_STATS      = 
"systemml.stats.extraDNN"; //boolean
        public static final String AVAILABLE_GPUS       = 
"systemml.gpu.availableGPUs"; // String to specify which GPUs to use (a range, 
all GPUs, comma separated list or a specific GPU)
-
+       public static final String SYNCHRONIZE_GPU       = 
"systemml.gpu.sync.postProcess"; // boolean: whether to synchronize GPUs after 
every instruction 
+       
        // Fraction of available memory to use. The available memory is 
computed when the GPUContext is created
        // to handle the tradeoff on calling cudaMemGetInfo too often.
        public static final String GPU_MEMORY_UTILIZATION_FACTOR    = 
"gpu.memory.util.factor";
@@ -131,6 +132,7 @@ public class DMLConfig
                _defaultVals.put(EXTRA_DNN_STATS,        "false" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(AVAILABLE_GPUS,         "-1");
+               _defaultVals.put(SYNCHRONIZE_GPU,        "false" );
        }
        
        public DMLConfig()
@@ -413,7 +415,7 @@ public class DMLConfig
                                COMPRESSED_LINALG, 
                                CODEGEN, CODEGEN_COMPILER, CODEGEN_PLANCACHE, 
CODEGEN_LITERALS,
                                EXTRA_GPU_STATS, EXTRA_DNN_STATS, 
EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
-                               AVAILABLE_GPUS
+                               AVAILABLE_GPUS, SYNCHRONIZE_GPU
                }; 
                
                StringBuilder sb = new StringBuilder();

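The new flag is read from the regular SystemML configuration file. A minimal
sketch of enabling it, assuming the usual SystemML-config.xml layout (the file
name and surrounding root element are illustrative; the property key and its
boolean type come from the hunk above):

    <root>
       <!-- synchronize the GPU after every instruction (default: false) -->
       <systemml.gpu.sync.postProcess>true</systemml.gpu.sync.postProcess>
    </root>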
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 59d2589..b74c0dd 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -378,30 +378,22 @@ public class ExecutionContext {
                return mo;
        }
 
-       public Pair<MatrixObject, Boolean> 
getMatrixInputForGPUInstruction(String varName)
+       public MatrixObject getMatrixInputForGPUInstruction(String varName, 
String opcode)
                        throws DMLRuntimeException 
        {
                GPUContext gCtx = getGPUContext(0);
-               boolean copied = false;
                MatrixObject mo = getMatrixObject(varName);
                if(mo == null) {
                        throw new DMLRuntimeException("No matrix object 
available for variable:" + varName);
                }
 
-               boolean acquired = false;
                if( mo.getGPUObject(gCtx) == null ) {
                        GPUObject newGObj = gCtx.createGPUObject(mo);
                        mo.setGPUObject(gCtx, newGObj);
-               } else if( !mo.getGPUObject(gCtx).isInputAllocated() ) {
-                       mo.acquireRead();
-                       acquired = true;
                }
-
-               copied = mo.getGPUObject(gCtx).acquireDeviceRead();
-               if(acquired) {
-                       mo.release();
-               }
-               return new Pair<MatrixObject, Boolean>(mo, copied);
+               // No need to perform acquireRead here because it is performed 
in copyFromHostToDevice
+               mo.getGPUObject(gCtx).acquireDeviceRead(opcode);
+               return mo;
        }
        
        /**

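For callers, this replaces the Pair-returning method with one that takes the
instruction opcode, so host-to-device transfer time can be attributed inside
copyFromHostToDevice instead of at every call site. A minimal before/after
sketch (variable names illustrative):

    // before: the caller received a copied? flag and timed the transfer itself
    Pair<MatrixObject, Boolean> mb = ec.getMatrixInputForGPUInstruction(name);
    MatrixObject mo = mb.getKey();

    // after: pass the opcode; timing is maintained inside copyFromHostToDevice
    MatrixObject mo = ec.getMatrixInputForGPUInstruction(name, getExtendedOpcode());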
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
index b01c2c5..3294384 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
@@ -52,6 +52,7 @@ public abstract class CPInstruction extends Instruction
        public final static String MISC_TIMER_RELEASE_BUFF_WRITE =              
        "rlswr";// time spent in buffer write in release operation
        public final static String MISC_TIMER_SPARSE_TO_DENSE =                 
        "s2d";  // time spent in sparse to dense conversion
        public final static String MISC_TIMER_DENSE_TO_SPARSE =                 
        "d2s";  // time spent in sparse to dense conversion
+       public final static String MISC_TIMER_RECOMPUTE_NNZ =                   
        "rnnz"; // time spent in recompute non-zeroes
        
        // Instruction-specific miscellaneous timers that were found as 
potential bottlenecks in one of the performance analyses.
        // SystemML committers have to be judicious about adding them by 
weighing the tradeoffs between reuse in future analysis and unnecessary 
overheads.

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index f27084f..9a6a3bb 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.runtime.instructions.gpu;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.lops.runtime.RunMRJobs;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -123,7 +124,7 @@ public abstract class GPUInstruction extends Instruction {
        public final static String MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB =  
"nncbd"; // time spent in cudnnConvolutionBackwardData
        public final static String MISC_TIMER_MAXPOOLING_FORWARD_LIB =         
"nnmf";  // time spent in cudnnPoolingForward
        public final static String MISC_TIMER_MAXPOOLING_BACKWARD_LIB =        
"nnmb";  // time spent in cudnnPoolingBackward
-       public final static String MISC_TIMER_BIAS_ADD_LIB =                   
"nnba";  // time spent in bias_add cuda kernel
+       public final static String MISC_TIMER_BIAS_ADD_LIB =                   
"nnba";  // time spent in bias_add, bias_multiply cuda kernel
        public final static String MISC_TIMER_RELU_BACKWARD_KERNEL=            
"nnrbk"; // time spent in relu_backward cuda kernel
        public final static String MISC_TIMER_RELU_KERNEL =                    
"nnrk";  // time spent in the relu kernel
        public final static String MISC_TIMER_CUDNN_INIT =                     
"nni";   // time spent in initializations for cudnn call
@@ -187,7 +188,9 @@ public abstract class GPUInstruction extends Instruction {
        public void postprocessInstruction(ExecutionContext ec)
                                        throws DMLRuntimeException
        {
-               //JCuda.cudaDeviceSynchronize();
+               if(DMLScript.SYNCHRONIZE_GPU) {
+                       jcuda.runtime.JCuda.cudaDeviceSynchronize();
+               }
        }
 
        /**
@@ -199,10 +202,7 @@ public abstract class GPUInstruction extends Instruction {
         * @throws DMLRuntimeException if an error occurs
         */
        protected MatrixObject getMatrixInputForGPUInstruction(ExecutionContext 
ec, String name) throws DMLRuntimeException {
-               long t0 = System.nanoTime();
-               Pair<MatrixObject, Boolean> mb = 
ec.getMatrixInputForGPUInstruction(name);
-               if (mb.getValue()) 
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t0);
-               return mb.getKey();
+               return ec.getMatrixInputForGPUInstruction(name, 
getExtendedOpcode());
        }
 
        /**
@@ -216,9 +216,9 @@ public abstract class GPUInstruction extends Instruction {
         * @throws DMLRuntimeException  if an error occurs
         */
        protected MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long 
numRows, long numCols) throws DMLRuntimeException {
-               long t0 = System.nanoTime();
+               long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
                Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
-               if (mb.getValue()) 
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
+               if (GPUStatistics.DISPLAY_STATISTICS && mb.getValue()) 
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
                return mb.getKey();
        }
 }

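The timer changes in this file follow a single idiom: System.nanoTime() is only
called when statistics are enabled, so the common path pays no timing overhead.
A minimal sketch of the pattern (doWork and SOME_MISC_TIMER are hypothetical
placeholders for the timed operation and its timer label):

    long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
    doWork();
    if (GPUStatistics.DISPLAY_STATISTICS)
        GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), SOME_MISC_TIMER,
                System.nanoTime() - t0);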
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 9379534..7bb8b07 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -294,7 +294,8 @@ public class CSRPointer {
         */
        public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long 
rows) throws DMLRuntimeException {
                LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + 
nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
-               assert nnz2 > -1 : "Incorrect usage of internal API, number of 
non zeroes is less than 0 when trying to allocate sparse data on GPU";
+               if(nnz2 < 0) throw new DMLRuntimeException("Incorrect usage of 
internal API, number of non zeroes is less than 0 when trying to allocate 
sparse data on GPU");
+               if(rows <= 0) throw new DMLRuntimeException("Incorrect usage of 
internal API, number of rows is less than or equal to 0 when trying to allocate 
sparse data on GPU");
                CSRPointer r = new CSRPointer(gCtx);
                r.nnz = nnz2;
                if (nnz2 == 0) {

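Replacing assert with an explicit DMLRuntimeException is more than a style
change: Java assertions are disabled by default and only run when the JVM is
started with -ea, so the old checks were usually no-ops in production. A
minimal sketch of the pattern applied here (message shortened):

    // assert nnz2 > -1 : "...";   // silently skipped without -ea
    if (nnz2 < 0)                  // always executed
        throw new DMLRuntimeException("Incorrect usage of internal API, nnz < 0");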
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 197daaf..a31deab 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -265,8 +265,10 @@ public class GPUContext {
                long t0 = 0, t1 = 0, end = 0;
                Pointer A;
                if (freeCUDASpaceMap.containsKey(size)) {
-                       LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", found free block of size " + (size
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : in allocate from instruction " 
+ instructionName + ", found free block of size " + (size
                                        / 1024.0) + " Kbytes from previously 
allocated block on " + this);
+                       }
                        if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
                                t0 = System.nanoTime();
                        LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
@@ -277,9 +279,11 @@ public class GPUContext {
                                GPUStatistics
                                .maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
                } else {
-                       LOG.trace(
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace(
                                        "GPU : in allocate from instruction " + 
instructionName + ", allocating new block of size " + (size
                                                        / 1024.0) + " Kbytes on 
" + this);
+                       }
                        if (DMLScript.STATISTICS)
                                t0 = System.nanoTime();
                        ensureFreeSpace(instructionName, size);
@@ -296,8 +300,10 @@ public class GPUContext {
                // Set all elements to 0 since newly allocated space will 
contain garbage
                if (DMLScript.STATISTICS)
                        t1 = System.nanoTime();
-               LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", setting block of size " + (size / 1024.0)
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : in allocate from instruction " + 
instructionName + ", setting block of size " + (size / 1024.0)
                                + " Kbytes to zero on " + this);
+               }
                cudaMemset(A, 0, size);
                if (DMLScript.STATISTICS)
                        end = System.nanoTime();
@@ -353,34 +359,38 @@ public class GPUContext {
                if (toFree == dummy) // trying to free a null pointer
                        return;
                long t0 = 0;
-               assert cudaBlockSizeMap.containsKey(
-                               toFree) : "ERROR : Internal state corrupted, 
cache block size map is not aware of a block it trying to free up";
-                               long size = cudaBlockSizeMap.get(toFree);
-                               if (eager) {
-                                       LOG.trace("GPU : eagerly freeing cuda 
memory [ " + toFree + " ] for instruction " + instructionName + " on "
-                                                       + this);
-                                       if (DMLScript.STATISTICS)
-                                               t0 = System.nanoTime();
-                                       cudaFree(toFree);
-                                       cudaBlockSizeMap.remove(toFree);
-                                       if (DMLScript.STATISTICS)
-                                               
GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
-                                       if (DMLScript.STATISTICS)
-                                               
GPUStatistics.cudaDeAllocCount.add(1);
-                                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
-                                               
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
-                                                               
System.nanoTime() - t0);
-                               } else {
-                                       LOG.trace("GPU : lazily freeing cuda 
memory for instruction " + instructionName + " on " + this);
-                                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
-                                       if (freeList == null) {
-                                               freeList = new 
LinkedList<Pointer>();
-                                               freeCUDASpaceMap.put(size, 
freeList);
-                                       }
-                                       if (freeList.contains(toFree))
-                                               throw new RuntimeException("GPU 
: Internal state corrupted, double free");
-                                       freeList.add(toFree);
-                               }
+               if(!cudaBlockSizeMap.containsKey(toFree))
+                       throw new RuntimeException("ERROR : Internal state 
corrupted, cache block size map is not aware of a block it is trying to free up");
+               long size = cudaBlockSizeMap.get(toFree);
+               if (eager) {
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : eagerly freeing cuda memory [ 
" + toFree + " ] for instruction " + instructionName + " on "
+                                       + this);
+                       }
+                       if (DMLScript.STATISTICS)
+                               t0 = System.nanoTime();
+                       cudaFree(toFree);
+                       cudaBlockSizeMap.remove(toFree);
+                       if (DMLScript.STATISTICS)
+                               
GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
+                       if (DMLScript.STATISTICS)
+                               GPUStatistics.cudaDeAllocCount.add(1);
+                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                               
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
+                                               System.nanoTime() - t0);
+               } else {
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : lazily freeing cuda memory for 
instruction " + instructionName + " on " + this);
+                       }
+                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
+                       if (freeList == null) {
+                               freeList = new LinkedList<Pointer>();
+                               freeCUDASpaceMap.put(size, freeList);
+                       }
+                       if (freeList.contains(toFree))
+                               throw new RuntimeException("GPU : Internal 
state corrupted, double free");
+                       freeList.add(toFree);
+               }
        }
 
        /**
@@ -401,9 +411,10 @@ public class GPUContext {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        void ensureFreeSpace(String instructionName, long size) throws 
DMLRuntimeException {
-               if (size >= getAvailableMemory()) {
+               if(size < 0 )
+                       throw new DMLRuntimeException("The size cannot be 
negative:" + size);
+               else if (size >= getAvailableMemory())
                        evict(instructionName, size);
-               }
        }
 
        /**
@@ -431,7 +442,9 @@ public class GPUContext {
         * @throws DMLRuntimeException If no reusable memory blocks to free up 
or if not enough matrix blocks with zero locks on them.
         */
        protected void evict(String instructionName, final long neededSize) 
throws DMLRuntimeException {
-               LOG.trace("GPU : evict called from " + instructionName + " for 
size " + neededSize + " on " + this);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : evict called from " + instructionName 
+ " for size " + neededSize + " on " + this);
+               }
                GPUStatistics.cudaEvictionCount.add(1);
                // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
                // to free up space
@@ -499,7 +512,7 @@ public class GPUContext {
                        GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
                        if (toBeRemoved.locks.get() > 0) {
                                throw new DMLRuntimeException(
-                                               "There is not enough memory on 
device for this matrix, request (" + neededSize + ")");
+                                               "There is not enough memory on 
device for this matrix, request (" + neededSize + "). Allocated GPU objects:" + 
allocatedGPUObjects.toString());
                        }
                        if (toBeRemoved.dirty) {
                                toBeRemoved.copyFromDeviceToHost();
@@ -702,7 +715,9 @@ public class GPUContext {
         * @throws DMLRuntimeException if error
         */
        public void destroy() throws DMLRuntimeException {
-               LOG.trace("GPU : this context was destroyed, this = " + 
this.toString());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : this context was destroyed, this = " + 
this.toString());
+               }
                clearMemory();
                cudnnDestroy(cudnnHandle.get());
                cublasDestroy(cublasHandle.get());

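Most hunks in this file wrap LOG.trace in a LOG.isTraceEnabled() guard. The
trace call itself is cheap when disabled, but its argument is a string
concatenation that would otherwise be built on every invocation. A minimal
before/after sketch:

    // before: the message string is built even when TRACE is off
    LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize);

    // after: concatenation happens only when TRACE is actually enabled
    if (LOG.isTraceEnabled()) {
        LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize);
    }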
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 1bed42a..2642011 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -27,18 +27,21 @@ import static 
jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
 import static jcuda.jcusparse.JCusparse.cusparseDnnz;
 import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicLong;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.CacheException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.instructions.cp.CPInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -189,7 +192,9 @@ public class GPUObject {
         */
        public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int 
m, int n, int lda, int ldc)
                        throws DMLRuntimeException {
-               LOG.trace("GPU : transpose of block of size [" + m + "," + n + 
"]" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : transpose of block of size [" + m + 
"," + n + "]" + ", GPUContext=" + gCtx);
+               }
                Pointer alpha = Pointer.to(new double[] { 1.0 });
                Pointer beta = Pointer.to(new double[] { 0.0 });
                Pointer A = densePtr;
@@ -244,9 +249,10 @@ public class GPUObject {
                        throw new DMLRuntimeException(
                                        "cusparseDnnz did not calculate the 
correct number of nnz from the sparse-matrix vector multiply on the GPU");
                }
-
-               LOG.trace("GPU : col-major dense size[" + rows + "," + cols + 
"] to row-major sparse of with nnz = " + nnzC[0]
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : col-major dense size[" + rows + "," + 
cols + "] to row-major sparse of with nnz = " + nnzC[0]
                                + ", GPUContext=" + gCtx);
+               }
 
                CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
                cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, 
densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
@@ -338,7 +344,9 @@ public class GPUObject {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void denseToSparse() throws DMLRuntimeException {
-               LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" 
+ getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : dense -> sparse on " + this + ", 
GPUContext=" + getGPUContext());
+               }
                long t0 = 0;
                if (DMLScript.STATISTICS)
                        t0 = System.nanoTime();
@@ -368,7 +376,9 @@ public class GPUObject {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void denseRowMajorToColumnMajor() throws DMLRuntimeException {
-               LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + 
", GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : dense Ptr row-major -> col-major on " 
+ this + ", GPUContext=" + getGPUContext());
+               }
                int m = toIntExact(mat.getNumRows());
                int n = toIntExact(mat.getNumColumns());
                int lda = n;
@@ -389,7 +399,9 @@ public class GPUObject {
         * @throws DMLRuntimeException if error
         */
        public void denseColumnMajorToRowMajor() throws DMLRuntimeException {
-               LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + 
", GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : dense Ptr row-major -> col-major on " 
+ this + ", GPUContext=" + getGPUContext());
+               }
 
                int n = toIntExact(mat.getNumRows());
                int m = toIntExact(mat.getNumColumns());
@@ -422,7 +434,9 @@ public class GPUObject {
         * @throws DMLRuntimeException ?
         */
        public void sparseToDense(String instructionName) throws 
DMLRuntimeException {
-               LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" 
+ getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sparse -> dense on " + this + ", 
GPUContext=" + getGPUContext());
+               }
                long start = 0, end = 0;
                if (DMLScript.STATISTICS)
                        start = System.nanoTime();
@@ -447,7 +461,9 @@ public class GPUObject {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void sparseToColumnMajorDense() throws DMLRuntimeException {
-               LOG.trace("GPU : sparse -> col-major dense on " + this + ", 
GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sparse -> col-major dense on " + this 
+ ", GPUContext=" + getGPUContext());
+               }
                if (getJcudaSparseMatrixPtr() == null || !isAllocated())
                        throw new DMLRuntimeException("Expected allocated 
sparse matrix before sparseToDense() call");
 
@@ -501,7 +517,9 @@ public class GPUObject {
         * @return cudnn tensor descriptor
         */
        public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int 
H, int W) {
-               LOG.trace("GPU : allocateTensorDescriptor with [N=" + N + ",C=" 
+ C + ",H=" + H + ",W=" + W + "] on " + this);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : allocateTensorDescriptor with [N=" + N 
+ ",C=" + C + ",H=" + H + ",W=" + W + "] on " + this);
+               }
                if (tensorDescriptor == null) {
                        tensorDescriptor = new cudnnTensorDescriptor();
                        cudnnCreateTensorDescriptor(tensorDescriptor);
@@ -544,7 +562,9 @@ public class GPUObject {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void allocateSparseAndEmpty() throws DMLRuntimeException {
-               LOG.trace("GPU : allocate sparse and empty block on " + this + 
", GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : allocate sparse and empty block on " + 
this + ", GPUContext=" + getGPUContext());
+               }
                
setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, 
mat.getNumRows()));
        }
 
@@ -556,7 +576,9 @@ public class GPUObject {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public void allocateAndFillDense(double v) throws DMLRuntimeException {
-               LOG.trace("GPU : allocate and fill dense with value " + v + " 
on " + this + ", GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : allocate and fill dense with value " + 
v + " on " + this + ", GPUContext=" + getGPUContext());
+               }
                long rows = mat.getNumRows();
                long cols = mat.getNumColumns();
                int numElems = toIntExact(rows * cols);
@@ -583,14 +605,17 @@ public class GPUObject {
                return isEmptyAndSparseAndAllocated;
        }
 
-       public boolean acquireDeviceRead() throws DMLRuntimeException {
-               LOG.trace("GPU : acquireDeviceRead on " + this);
+       public boolean acquireDeviceRead(String opcode) throws 
DMLRuntimeException {
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : acquireDeviceRead on " + this);
+               }
                boolean transferred = false;
                if (!isAllocated()) {
-                       LOG.trace(
-                                       "GPU : in acquireDeviceRead, data is 
not allocated, copying from host, on " + this + ", GPUContext="
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : in acquireDeviceRead, data is 
not allocated, copying from host, on " + this + ", GPUContext="
                                                        + getGPUContext());
-                       copyFromHostToDevice();
+                       }
+                       copyFromHostToDevice(opcode);
                        transferred = true;
                }
                addLock();
@@ -600,11 +625,15 @@ public class GPUObject {
        }
 
        public boolean acquireDeviceModifyDense() throws DMLRuntimeException {
-               LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", 
GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : acquireDeviceModifyDense on " + this + 
", GPUContext=" + getGPUContext());
+               }
                boolean allocated = false;
                if (!isAllocated()) {
                        mat.setDirty(true);
-                       LOG.trace("GPU : data is not allocated, allocating a 
dense block, on " + this);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : data is not allocated, 
allocating a dense block, on " + this);
+                       }
                        // Dense block, size = numRows * numCols
                        allocateDenseMatrixOnDevice();
                        allocated = true;
@@ -616,11 +645,15 @@ public class GPUObject {
        }
 
        public boolean acquireDeviceModifySparse() throws DMLRuntimeException {
-               LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", 
GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : acquireDeviceModifySparse on " + this 
+ ", GPUContext=" + getGPUContext());
+               }
                boolean allocated = false;
                isSparse = true;
                if (!isAllocated()) {
-                       LOG.trace("GPU : data is not allocated, allocating a 
sparse block, on " + this);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : data is not allocated, 
allocating a sparse block, on " + this);
+                       }
                        mat.setDirty(true);
                        allocateSparseMatrixOnDevice();
                        allocated = true;
@@ -644,10 +677,14 @@ public class GPUObject {
        public boolean acquireHostRead() throws CacheException {
                boolean copied = false;
                try {
-                       LOG.trace("GPU : acquireDeviceModifySparse on " + this 
+ ", GPUContext=" + getGPUContext());
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : acquireDeviceModifySparse on " 
+ this + ", GPUContext=" + getGPUContext());
+                       }
                        if (isAllocated() && dirty) {
-                               LOG.trace("GPU : data is dirty on device, 
copying to host, on " + this + ", GPUContext="
+                               if(LOG.isTraceEnabled()) {
+                                       LOG.trace("GPU : data is dirty on 
device, copying to host, on " + this + ", GPUContext="
                                                + getGPUContext());
+                               }
                                copyFromDeviceToHost();
                                copied = true;
                        }
@@ -668,8 +705,10 @@ public class GPUObject {
                        throw new CacheException("Internal state error : 
Invalid number of locks on a GPUObject");
                }
 
-               LOG.trace("GPU : updateReleaseLocks, new number of locks is " + 
newLocks + ", on " + this + ", GPUContext="
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : updateReleaseLocks, new number of 
locks is " + newLocks + ", on " + this + ", GPUContext="
                                + getGPUContext());
+               }
                GPUContext.EvictionPolicy evictionPolicy = 
getGPUContext().evictionPolicy;
                switch (evictionPolicy) {
                case LRU:
@@ -711,30 +750,42 @@ public class GPUObject {
        }
 
        void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
-               LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", 
GPUContext=" + getGPUContext());
-               assert !isAllocated() : "Internal error - trying to allocated 
dense matrix to a GPUObject that is already allocated";
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + 
this + ", GPUContext=" + getGPUContext());
+               }
+               if(isAllocated()) 
+                       throw new DMLRuntimeException("Internal error - trying 
to allocate a dense matrix to a GPUObject that is already allocated");
                long rows = mat.getNumRows();
                long cols = mat.getNumColumns();
-               assert rows > 0 : "Internal error - invalid number of rows when 
allocating dense matrix";
-               assert cols > 0 : "Internal error - invalid number of columns 
when allocating dense matrix;";
+               if(rows <= 0)
+                       throw new DMLRuntimeException("Internal error - invalid 
number of rows when allocating dense matrix");
+               if(cols <= 0)
+                       throw new DMLRuntimeException("Internal error - invalid 
number of columns when allocating dense matrix");
                long size = getDoubleSizeOf(rows * cols);
                Pointer tmp = allocate(size);
                setDenseMatrixCudaPointer(tmp);
        }
 
        void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
-               LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", 
GPUContext=" + getGPUContext());
-               assert !isAllocated() : "Internal error = trying to allocated 
sparse matrix to a GPUObject that is already allocated";
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + 
this + ", GPUContext=" + getGPUContext());
+               }
+               if(isAllocated()) 
+                       throw new DMLRuntimeException("Internal error - trying 
to allocate a sparse matrix to a GPUObject that is already allocated");
                long rows = mat.getNumRows();
                long nnz = mat.getNnz();
-               assert rows > 0 : "Internal error - invalid number of rows when 
allocating a sparse matrix";
-               assert nnz >= 0 : "Internal error - invalid number of non 
zeroes when allocating a sparse matrix";
+               if(rows <= 0)
+                       throw new DMLRuntimeException("Internal error - invalid 
number of rows when allocating sparse matrix");
+               if(nnz < 0)
+                       throw new DMLRuntimeException("Internal error - invalid 
number of non zeroes when allocating a sparse matrix");
                CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, 
rows);
                setSparseMatrixCudaPointer(tmp);
        }
 
        void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException 
{
-               LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", 
GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : deallocateMemoryOnDevice, on " + this 
+ ", GPUContext=" + getGPUContext());
+               }
                if (getJcudaDenseMatrixPtr() != null) {
                        cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager);
                }
@@ -765,20 +816,32 @@ public class GPUObject {
                return GPUSize;
        }
 
-       void copyFromHostToDevice() throws DMLRuntimeException {
-               LOG.trace("GPU : copyFromHostToDevice, on " + this + ", 
GPUContext=" + getGPUContext());
+       void copyFromHostToDevice(String opcode) throws DMLRuntimeException {
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : copyFromHostToDevice, on " + this + ", 
GPUContext=" + getGPUContext());
+               }
                long start = 0;
                if (DMLScript.STATISTICS)
                        start = System.nanoTime();
 
+               long acqrTime = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
                MatrixBlock tmp = mat.acquireRead();
+               if(GPUStatistics.DISPLAY_STATISTICS) {
+                       if(tmp.isInSparseFormat())
+                               GPUStatistics.maintainCPMiscTimes(opcode, 
CPInstruction.MISC_TIMER_GET_SPARSE_MB, System.nanoTime()-acqrTime);
+                       else
+                               GPUStatistics.maintainCPMiscTimes(opcode, 
CPInstruction.MISC_TIMER_GET_DENSE_MB, System.nanoTime()-acqrTime);
+               }
+               
                if (tmp.isInSparseFormat()) {
-
                        int rowPtr[] = null;
                        int colInd[] = null;
                        double[] values = null;
-
-                       tmp.recomputeNonZeros();
+                       
+                       // Only recompute non-zeros if unknown, else this will 
incur a huge penalty!!
+                       if(tmp.getNonZeros() < 0) {
+                               tmp.recomputeNonZeros(opcode);
+                       }
                        long nnz = tmp.getNonZeros();
                        mat.getMatrixCharacteristics().setNonZeros(nnz);
 
@@ -831,8 +894,11 @@ public class GPUObject {
                        allocateSparseMatrixOnDevice();
 
                        if (copyToDevice) {
+                               long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
                                
CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), 
tmp.getNonZeros(), rowPtr, colInd,
                                                values);
+                               if(GPUStatistics.DISPLAY_STATISTICS) 
+                                       
GPUStatistics.maintainCPMiscTimes(opcode, 
GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
                        }
                } else {
                        double[] data = tmp.getDenseBlock();
@@ -841,14 +907,26 @@ public class GPUObject {
                                throw new DMLRuntimeException("Incorrect 
sparsity calculation");
                        else if (data == null && tmp.getNonZeros() != 0)
                                throw new DMLRuntimeException("MatrixBlock is 
not allocated");
-                       else if (tmp.getNonZeros() == 0)
-                               data = new double[tmp.getNumRows() * 
tmp.getNumColumns()];
-
-                       // Copy dense block
+                       
                        allocateDenseMatrixOnDevice();
-
-                       cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data),
-                                       getDoubleSizeOf(mat.getNumRows() * 
mat.getNumColumns()), cudaMemcpyHostToDevice);
+                       
+                       if (tmp.getNonZeros() == 0) {
+                               // Minor optimization: no need to allocate an 
empty array on the CPU 
+                               // data = new double[tmp.getNumRows() * 
tmp.getNumColumns()];
+                               long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                               cudaMemset(getJcudaDenseMatrixPtr(), 0, 
getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()));
+                               if(GPUStatistics.DISPLAY_STATISTICS) 
+                                       
GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, 
System.nanoTime() - t1);
+                       }
+                       else {
+                               // Copy dense block
+                               // H2D now only measures the time taken to do the actual copy
+                               long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+                               cudaMemcpy(getJcudaDenseMatrixPtr(), 
Pointer.to(data),
+                                               
getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), 
cudaMemcpyHostToDevice);
+                               if(GPUStatistics.DISPLAY_STATISTICS) 
+                                       
GPUStatistics.maintainCPMiscTimes(opcode, 
GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+                       }
                }
 
                mat.release();
@@ -867,7 +945,9 @@ public class GPUObject {
        }
 
        protected void copyFromDeviceToHost() throws DMLRuntimeException {
-               LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", 
GPUContext=" + getGPUContext());
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", 
GPUContext=" + getGPUContext());
+               }
                if (getJcudaDenseMatrixPtr() != null && 
getJcudaSparseMatrixPtr() != null) {
                        throw new DMLRuntimeException("Invalid state : JCuda 
dense/sparse pointer are both allocated");
                }

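The dense path in copyFromHostToDevice now special-cases all-zero blocks:
rather than materializing a zero-filled host array and shipping it across the
PCIe bus, it zeroes the already-allocated device buffer in place. A minimal
sketch of the branch (devPtr, hostData and sizeInBytes are illustrative names):

    if (tmp.getNonZeros() == 0) {
        // empty block: no host allocation, no transfer
        cudaMemset(devPtr, 0, sizeInBytes);
    } else {
        // non-empty block: copy the dense host array to the device
        cudaMemcpy(devPtr, Pointer.to(hostData), sizeInBytes, cudaMemcpyHostToDevice);
    }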
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index a362364..5e2136f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -264,7 +264,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static void reluBackward(GPUContext gCtx, String instName, 
MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws 
DMLRuntimeException {
-               LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : reluBackward" + ", GPUContext=" + 
gCtx);
+               }
                long rows = input.getNumRows();
                long cols = input.getNumColumns();
                Pointer imagePointer = getDensePointer(gCtx, input, instName);
@@ -276,7 +278,7 @@ public class LibMatrixCUDA {
                getCudaKernels(gCtx).launchKernel("relu_backward",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
                                imagePointer, doutPointer, outputPointer, 
toInt(rows), toInt(cols));
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 
        }
 
@@ -293,7 +295,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static void biasMultiply(GPUContext gCtx, String instName, 
MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws 
DMLRuntimeException {
-               LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : biasMultiply" + ", GPUContext=" + 
gCtx);
+               }
                if(isInSparseFormat(gCtx, input)) {
                        input.getGPUObject(gCtx).sparseToDense(instName);
                }
@@ -315,7 +319,7 @@ public class LibMatrixCUDA {
                getCudaKernels(gCtx).launchKernel("bias_multiply",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
                                imagePointer, biasPointer, outputPointer, 
toInt(rows), toInt(cols), toInt(PQ));
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
        }
 
@@ -360,14 +364,16 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException
         */
        private static void biasAdd(GPUContext gCtx, String instName, Pointer 
image, Pointer bias, Pointer output, int rows, int cols, int k) throws 
DMLRuntimeException {
-               LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
+               }
                int PQ = cols / k;
                long t1 = 0;
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("bias_add",
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
                                image, bias, output, rows, cols, PQ);
-               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
+               if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
        }
        
 
@@ -397,7 +403,9 @@ public class LibMatrixCUDA {
         */
        public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, 
String instName, MatrixObject left, String outputName,
                        boolean isLeftTransposed) throws DMLRuntimeException {
-               LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
+               }
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                if(isInSparseFormat(gCtx, left)) {
@@ -452,7 +460,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void copyUpperToLowerTriangle(GPUContext gCtx, String 
instName, MatrixObject ret) throws DMLRuntimeException {
-               LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + 
gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : copyUpperToLowerTriangle" + ", 
GPUContext=" + gCtx);
+               }
                if(isInSparseFormat(gCtx, ret)) {
                        throw new DMLRuntimeException("Sparse GPU 
copyUpperToLowerTriangle is not implemented");
                }
@@ -501,7 +511,9 @@ public class LibMatrixCUDA {
                        boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
+               }
                if(!left.getGPUObject(gCtx).isAllocated() || 
!right.getGPUObject(gCtx).isAllocated())
                        throw new DMLRuntimeException("One of input is not 
allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + 
right.getGPUObject(gCtx).isAllocated());
 
@@ -591,7 +603,9 @@ public class LibMatrixCUDA {
                CSRPointer B = 
right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
                Pointer ADense = getDensePointer(gCtx, left, instName);
                if (B.isUltraSparse(k, n)){
-                       LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp 
M)" + ", GPUContext=" + gCtx);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace(" GPU : Convert d M %*% sp M --> sp M 
%*% sp M)" + ", GPUContext=" + gCtx);
+                       }
 
                        // Convert left to CSR and do cuSparse matmul
                        int rowsA = (int)left.getNumRows();
@@ -616,7 +630,9 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
 
                } else {
-                       LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" 
+ ", GPUContext=" + gCtx);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace(" GPU : Convert d M %*% sp M --> d M 
%*% d M" + ", GPUContext=" + gCtx);
+                       }
                        // Convert right to dense and do a cuBlas matmul
                        // BDenseTransposed is a column major matrix
                        // Note the arguments to denseDenseMatmult to 
accommodate this.
@@ -673,7 +689,9 @@ public class LibMatrixCUDA {
                        long t0=0, t1=0, t2=0;
                        // Sparse Matrix Dense Matrix multiply
                        if (A.isUltraSparse(m, k)){
-                               LOG.trace(" GPU : Convert sp M %*% d M --> sp M 
%*% sp M" + ", GPUContext=" + gCtx);
+                               if(LOG.isTraceEnabled()) {
+                                       LOG.trace(" GPU : Convert sp M %*% d M 
--> sp M %*% sp M" + ", GPUContext=" + gCtx);
+                               }
                                // Convert right to CSR and do cuSparse matmul
                                int rowsB = (int)right.getNumRows();
                                int colsB = (int)right.getNumColumns();
@@ -697,7 +715,9 @@ public class LibMatrixCUDA {
                                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
 
                        } else {
-                               LOG.trace(" GPU : Convert sp M %*% d M --> d M 
%*% d M" + ", GPUContext=" + gCtx);
+                               if(LOG.isTraceEnabled()) {
+                                       LOG.trace(" GPU : Convert sp M %*% d M 
--> d M %*% d M" + ", GPUContext=" + gCtx);
+                               }
                                // Convert left to dense and do a cuBlas matmul
                                // ADenseTransposed is a column major matrix
                                // Note the arguments to denseDenseMatmult to 
accommodate this.
@@ -739,7 +759,9 @@ public class LibMatrixCUDA {
         */
        private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String 
instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean 
isATranposed,
                        int m, int k) throws DMLRuntimeException {
-               LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + 
gCtx);
+               }
                int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
                long size = m * Sizeof.DOUBLE;
                if (isATranposed){
@@ -836,7 +858,9 @@ public class LibMatrixCUDA {
         */
        private static void sparseSparseMatmult(GPUContext gCtx, String 
instName, CSRPointer A, CSRPointer B, MatrixObject output,
                        boolean isLeftTransposed, boolean isRightTransposed, 
int m, int n, int k) throws DMLRuntimeException {
-               LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + 
gCtx);
+               }
 
                int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
                int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -910,7 +934,9 @@ public class LibMatrixCUDA {
        public static void denseDenseMatmult(GPUContext gCtx, String instName, 
Pointer output, int leftRows1, int leftCols1, int rightRows1,
                        int rightCols1, boolean isLeftTransposed1, boolean 
isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
                                        throws DMLRuntimeException {
-               LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
+               }
 
                Pointer A = rightPtr;
                Pointer B = leftPtr;
@@ -1004,7 +1030,9 @@ public class LibMatrixCUDA {
                        throws DMLRuntimeException {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + 
gCtx);
+               }
                final int REDUCTION_ALL = 1;
                final int REDUCTION_ROW = 2;
                final int REDUCTION_COL = 3;
@@ -1044,7 +1072,8 @@ public class LibMatrixCUDA {
                } else {
                        throw new DMLRuntimeException("Internal Error - Invalid 
index function type, only reducing along rows, columns, diagonals or all 
elements is supported in Aggregate Unary operations");
                }
-               assert reductionDirection !=-1 : "Internal Error - Incorrect 
type of reduction direction set for aggregate unary GPU instruction";
+               if(reductionDirection == -1)
+                       throw new DMLRuntimeException("Internal Error - 
Incorrect type of reduction direction set for aggregate unary GPU instruction");
 
                // Convert function type to a number
                int opIndex = -1;
@@ -1055,7 +1084,8 @@ public class LibMatrixCUDA {
                } else if (aggOp.increOp.fn instanceof Mean) {
                        opIndex = OP_MEAN;
                } else if (aggOp.increOp.fn instanceof CM) {
-                       assert ((CM)aggOp.increOp.fn).getAggOpType() == 
CMOperator.AggregateOperationTypes.VARIANCE : "Internal Error - Invalid Type of 
CM operator for Aggregate Unary operation on GPU";
+                       if(((CM)aggOp.increOp.fn).getAggOpType() != 
CMOperator.AggregateOperationTypes.VARIANCE)
+                               throw new DMLRuntimeException("Internal Error - 
Invalid Type of CM operator for Aggregate Unary operation on GPU");
                        opIndex = OP_VARIANCE;
                } else if (aggOp.increOp.fn instanceof Plus) {
                        opIndex = OP_PLUS;
@@ -1074,8 +1104,8 @@ public class LibMatrixCUDA {
                } else {
                        throw new DMLRuntimeException("Internal Error - 
Aggregate operator has invalid Value function");
                }
-               assert opIndex != -1 : "Internal Error - Incorrect type of 
operation set for aggregate unary GPU instruction";
-
+               if(opIndex == -1)
+                       throw new DMLRuntimeException("Internal Error - 
Incorrect type of operation set for aggregate unary GPU instruction");
 
                int rlen = (int)in1.getNumRows();
                int clen = (int)in1.getNumColumns();
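
The assert-to-throw rewrites above are not cosmetic: Java assertions are 
skipped at runtime unless the JVM is started with -ea, so these internal-error 
checks were no-ops in ordinary deployments. Throwing DMLRuntimeException makes 
the check unconditional. A minimal sketch of the difference, using a 
hypothetical validateReduction helper and a plain RuntimeException in place of 
SystemML's DMLRuntimeException:

    public class AssertVsThrowSketch {
        static void validateReduction(int reductionDirection) {
            // Silently skipped unless the JVM runs with -ea:
            assert reductionDirection != -1 : "unreachable without -ea";

            // Enforced on every run, regardless of JVM flags:
            if (reductionDirection == -1)
                throw new RuntimeException("Internal Error - Incorrect type of "
                    + "reduction direction set for aggregate unary GPU instruction");
        }
    }
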
@@ -1345,7 +1375,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static double reduceAll(GPUContext gCtx, String instName, 
String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
-               LOG.trace("GPU : reduceAll for " + kernelFunction + ", 
GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : reduceAll for " + kernelFunction + ", 
GPUContext=" + gCtx);
+               }
 
                int[] tmp = getKernelParamsForReduceAll(gCtx, n);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
@@ -1391,7 +1423,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void reduceRow(GPUContext gCtx, String instName, String 
kernelFunction, Pointer in, Pointer out, int rows, int cols) throws 
DMLRuntimeException {
-               LOG.trace("GPU : reduceRow for " + kernelFunction + ", 
GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : reduceRow for " + kernelFunction + ", 
GPUContext=" + gCtx);
+               }
 
                int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
@@ -1417,7 +1451,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void reduceCol(GPUContext gCtx, String instName, String 
kernelFunction, Pointer in, Pointer out, int rows, int cols) throws 
DMLRuntimeException {
-               LOG.trace("GPU : reduceCol for " + kernelFunction + ", 
GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : reduceCol for " + kernelFunction + ", 
GPUContext=" + gCtx);
+               }
 
                int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
@@ -1521,7 +1557,9 @@ public class LibMatrixCUDA {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                double constant = op.getConstant();
-               LOG.trace("GPU : matrixScalarRelational, scalar: " + constant + 
", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : matrixScalarRelational, scalar: " + 
constant + ", GPUContext=" + gCtx);
+               }
 
                Pointer A, C;
                if (isSparseAndEmpty(gCtx, in)) {
@@ -1556,7 +1594,9 @@ public class LibMatrixCUDA {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                double constant = op.getConstant();
-               LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + 
", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : matrixScalarArithmetic, scalar: " + 
constant + ", GPUContext=" + gCtx);
+               }
 
                int outRLen = isInputTransposed ? (int) in.getNumColumns() : 
(int) in.getNumRows();
                int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) 
in.getNumColumns();
@@ -1743,7 +1783,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException throws runtime exception
         */
        private static void matrixScalarOp(GPUContext gCtx, String instName, 
Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) 
throws DMLRuntimeException {
-               LOG.trace("GPU : matrix_scalar_op" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : matrix_scalar_op" + ", GPUContext=" + 
gCtx);
+               }
                int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
                int size = rlenA * clenA;
                long t0=0;
@@ -1847,7 +1889,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException
         */
        private static void matrixMatrixOp(GPUContext gCtx, String instName, 
Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, 
Pointer c, BinaryOperator op) throws DMLRuntimeException {
-               LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + 
gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : matrix_matrix_cellwise_op" + ", 
GPUContext=" + gCtx);
+               }
                long t0=0;
                if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op",
@@ -2025,7 +2069,9 @@ public class LibMatrixCUDA {
                        boolean isLeftTransposed, boolean isRightTransposed, 
double alpha, double beta) throws DMLRuntimeException {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
+               }
 
                Pointer alphaPtr = pointerTo(alpha);
                Pointer betaPtr = pointerTo(beta);
@@ -2189,7 +2235,9 @@ public class LibMatrixCUDA {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException(
                                        "GPU : Invalid internal state, the 
GPUContext set with the ExecutionContext is not the same used to run this 
LibMatrixCUDA function");
-               LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sliceOperations" + ", GPUContext=" + 
gCtx);
+               }
 
                int rl = (int) ixrange.rowStart;
                int ru = (int) ixrange.rowEnd;
@@ -2233,21 +2281,20 @@ public class LibMatrixCUDA {
         * @param ru row upper
         * @param cl column lower
         * @param cu column upper
-        * @param len1 input number of columns
-        * @param len2 output number of columns
-        * @throws DMLRuntimeException
+        * @param inClen input number of columns
+        * @param retClen output number of columns
+        * @throws DMLRuntimeException if an error occurs
         */
        protected static void sliceDenseDense(GPUContext gCtx, String instName, 
Pointer inPointer, Pointer outPointer, 
-                       int rl, int ru, int cl, int cu, int len1, int len2) 
throws DMLRuntimeException {
+                       int rl, int ru, int cl, int cu, int inClen, int 
retClen) throws DMLRuntimeException {
                long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
-               if (len1 == len2) {
-                       cudaMemcpy(outPointer, inPointer.withByteOffset(rl * 
len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1
+               if (inClen == retClen) {
+                       cudaMemcpy(outPointer, inPointer.withByteOffset(rl * 
inClen * Sizeof.DOUBLE), (ru - rl + 1) * inClen
                                        * Sizeof.DOUBLE, 
cudaMemcpyDeviceToDevice);
                } else {
-                       for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= 
ru; i++, ix1 += len1, ix2 += len2) {
-                               cudaMemcpy(outPointer.withByteOffset(ix2 * 
Sizeof.DOUBLE),
-                                               inPointer.withByteOffset(ix1 * 
Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-                       }
+                       int size = ru - rl + 1;
+                       getCudaKernels(gCtx).launchKernel("slice_dense_dense", 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
+                                       inPointer, outPointer, rl, ru, cl, cu, 
inClen, retClen);
                }
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
        }
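
The sliceDenseDense rewrite above is the one behavioral change in this hunk: 
when the selected columns are a strict subset (inClen != retClen), the old 
code issued one cudaMemcpyDeviceToDevice per output row, paying per-call 
driver overhead proportional to the number of rows, while the new code 
launches the custom slice_dense_dense kernel once over the whole output. A 
CPU reference of the indexing the kernel has to perform, assuming row-major 
layout (the array-based signature is illustrative, not the actual kernel 
interface):

    // Copy rows rl..ru and columns cl..cu of a row-major, inClen-wide
    // input into a dense, retClen-wide output (retClen == cu - cl + 1).
    static void sliceDenseDenseReference(double[] in, double[] out,
            int rl, int ru, int cl, int cu, int inClen, int retClen) {
        for (int i = rl, outRow = 0; i <= ru; i++, outRow++) {
            System.arraycopy(in, i * inClen + cl, out, outRow * retClen, retClen);
        }
    }

The companion change to slice_sparse_dense below passes the new retClen 
argument so that kernel can compute output offsets the same way when cl > 0.
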
@@ -2263,23 +2310,26 @@ public class LibMatrixCUDA {
         * @param ru row upper
         * @param cl column lower
         * @param cu column upper
-        * @throws DMLRuntimeException
+        * @throws DMLRuntimeException if an error occurs
         */
        protected static void sliceSparseDense(GPUContext gCtx, String 
instName, CSRPointer inPointer, Pointer outPointer, int rl, int ru, int cl, int 
cu) throws DMLRuntimeException {
                int size = ru - rl + 1;
                long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() 
: 0;
+               int retClen = cu - cl + 1;
                // Performs a slice operation where the input matrix is sparse 
and the output matrix is dense.
                // This function avoids unnecessary sparse to dense conversion 
of the input matrix.
                // We can generalize this later to output sparse matrix.
                getCudaKernels(gCtx).launchKernel("slice_sparse_dense", 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
-                               inPointer.val, inPointer.rowPtr, 
inPointer.colInd, outPointer, rl, ru, cl, cu);
+                               inPointer.val, inPointer.rowPtr, 
inPointer.colInd, outPointer, rl, ru, cl, cu, retClen);
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0);
        }
 
        public static void cbind(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, MatrixObject in2, String outputName) throws 
DMLRuntimeException {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx);
+               }
 
                long t1 = 0;
 
@@ -2312,7 +2362,9 @@ public class LibMatrixCUDA {
        public static void rbind(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, MatrixObject in2, String outputName) throws 
DMLRuntimeException {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
-               LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx);
+               }
 
                long t1 = 0;
 
@@ -2362,7 +2414,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void exp(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : exp" + ", GPUContext=" + gCtx);
+               }
                // e^0 = 1, create a dense block full of 1s
                unaryOp(ec, gCtx, in1, "matrix_exp", 1, outputName, instName, 
GPUInstruction.MISC_TIMER_EXP_KERNEL);
        }
@@ -2377,7 +2431,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void sqrt(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sqrt" + ", GPUContext=" + gCtx);
+               }
                // sqrt(0) = 0, create a dense block full of 0s
                unaryOp(ec, gCtx, in1, "matrix_sqrt", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_SQRT_KERNEL);
        }
@@ -2392,7 +2448,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void round(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : round" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : round" + ", GPUContext=" + gCtx);
+               }
                // round(0) = 0, create a dense block full of 0s
                unaryOp(ec, gCtx, in1, "matrix_round", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ROUND_KERNEL);
        }
@@ -2407,7 +2465,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void abs(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : abs" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : abs" + ", GPUContext=" + gCtx);
+               }
                // abs(0) = 0, create a dense block full of 0s
                unaryOp(ec, gCtx, in1, "matrix_abs", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ABS_KERNEL);
        }
@@ -2422,7 +2482,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void log(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : log" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : log" + ", GPUContext=" + gCtx);
+               }
                // log(0) = -Inf
                unaryOp(ec, gCtx, in1, "matrix_log", Double.NEGATIVE_INFINITY, 
outputName, instName, GPUInstruction.MISC_TIMER_LOG_KERNEL);
        }
@@ -2437,7 +2499,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void floor(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : floor" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : floor" + ", GPUContext=" + gCtx);
+               }
                // floor(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_floor", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_FLOOR_KERNEL);
        }
@@ -2452,7 +2516,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void ceil(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : ceil" + ", GPUContext=" + gCtx);
+               }
                // ceil(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_ceil", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_CEIL_KERNEL);
        }
@@ -2467,7 +2533,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void sin(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : sin" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sin" + ", GPUContext=" + gCtx);
+               }
                // sin(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_sin", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_SIN_KERNEL);
        }
@@ -2482,7 +2550,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void cos(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : cos" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : cos" + ", GPUContext=" + gCtx);
+               }
                // cos(0) = 1
                unaryOp(ec, gCtx, in1, "matrix_cos", 1, outputName, instName, 
GPUInstruction.MISC_TIMER_COS_KERNEL);
        }
@@ -2497,7 +2567,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void tan(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : tan" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : tan" + ", GPUContext=" + gCtx);
+               }
                // tan(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_tan", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_TAN_KERNEL);
        }
@@ -2512,7 +2584,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void asin(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : asin" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : asin" + ", GPUContext=" + gCtx);
+               }
                // asin(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_asin", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ASIN_KERNEL);
        }
@@ -2527,7 +2601,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void acos(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : acos" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : acos" + ", GPUContext=" + gCtx);
+               }
                // acos(0) = PI/2
                unaryOp(ec, gCtx, in1, "matrix_acos", Math.PI/2.0, outputName, 
instName, GPUInstruction.MISC_TIMER_ACOS_KERNEL);
        }
@@ -2542,7 +2618,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void atan(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : atan" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : atan" + ", GPUContext=" + gCtx);
+               }
                // atan(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_atan", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_ATAN_KERNEL);
        }
@@ -2557,7 +2635,9 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException  if DMLRuntimeException occurs
         */
        public static void sign(ExecutionContext ec, GPUContext gCtx, String 
instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
-               LOG.trace("GPU : sign" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : sign" + ", GPUContext=" + gCtx);
+               }
                // sign(0) = 0
                unaryOp(ec, gCtx, in1, "matrix_sign", 0, outputName, instName, 
GPUInstruction.MISC_TIMER_SIGN_KERNEL);
        }
@@ -2622,7 +2702,9 @@ public class LibMatrixCUDA {
 
                long t1=0, t2=0;
                if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() 
== in2.getNumColumns()) {
-                       LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" 
+ gCtx);
+                       }
 
                        // Matrix-Matrix daxpy
                        long n = in1.getNumRows()*in2.getNumColumns(); // Since 
A is always a matrix
@@ -2640,7 +2722,9 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
                }
                else {
-                       LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" 
+ gCtx);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : daxpy_matrix_vector" + ", 
GPUContext=" + gCtx);
+                       }
 
                        // Matrix-Vector daxpy
                        // Note: Vector-Matrix operation is not supported
@@ -2671,7 +2755,9 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
 
                // x = solve(A, b)
-               LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
+               }
 
                long t0 = -1;
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index bf5f25b..9a39b1c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -244,7 +244,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        private static void cudnnConv2d(GPUContext gCtx, String instName, 
Pointer image, Pointer filter, Pointer output, int N,
                        int C, int H, int W, int K, int R, int S, int pad_h, 
int pad_w, int stride_h, int stride_w, int P, int Q)
                                        throws DMLRuntimeException {
-               LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
+               }
                cudnnFilterDescriptor filterDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
                Pointer workSpace = null;
@@ -411,7 +413,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        Pointer dwPointer, int N, int C, int H, int W, int K, 
int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + 
gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : conv2dBackwardFilter" + ", 
GPUContext=" + gCtx);
+               }
                cudnnFilterDescriptor dwDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
 
@@ -551,7 +555,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        Pointer dx, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" 
+ gCtx);
+               }
                cudnnFilterDescriptor wDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
 
@@ -708,7 +714,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        Pointer y, int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + 
gCtx);
+               }
 
                cudnnPoolingDescriptor poolingDesc = null;
 
@@ -803,7 +811,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        int N, int C, int H, int W, int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
                        int Q) throws DMLRuntimeException {
-               LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" 
+ gCtx);
+               }
                Pointer y = null;
                cudnnPoolingDescriptor poolingDesc = null;
 
@@ -938,7 +948,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        public static void batchNormalizationForwardInference(GPUContext gCtx, 
String instName, MatrixObject image,
                        MatrixObject scale, MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar,
                        MatrixObject ret, double epsilon) throws 
DMLRuntimeException {
-               LOG.trace("GPU : batchNormalizationForwardInference" + ", 
GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : batchNormalizationForwardInference" + 
", GPUContext=" + gCtx);
+               }
                int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
                int N = toInt(image.getNumRows());
@@ -984,7 +996,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        public static void batchNormalizationForwardTraining(GPUContext gCtx, 
String instName, MatrixObject image,
                        MatrixObject scale,  MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar,
                        MatrixObject ret, MatrixObject retRunningMean, 
MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) 
throws DMLRuntimeException {
-               LOG.trace("GPU : batchNormalizationForwardTraining" + ", 
GPUContext=" + gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : batchNormalizationForwardTraining" + 
", GPUContext=" + gCtx);
+               }
                int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
                int N = toInt(image.getNumRows());
@@ -1104,7 +1118,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        public static void batchNormalizationBackward(GPUContext gCtx, String 
instName, MatrixObject image, MatrixObject dout,
                        MatrixObject scale, MatrixObject ret, MatrixObject 
retScale, MatrixObject retBias,
                        double epsilon) throws DMLRuntimeException {
-               LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" 
+ gCtx);
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : batchNormalizationBackward" + ", 
GPUContext=" + gCtx);
+               }
                int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
                int N = toInt(image.getNumRows());
@@ -1134,7 +1150,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
        private static void cudnnReLU(GPUContext gCtx, String instName, 
MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws 
DMLRuntimeException {
                long t0=0;
                try {
-                       LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + 
gCtx);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : performCuDNNReLU" + ", 
GPUContext=" + gCtx);
+                       }
                        cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
 
                        Pointer srcData = getDensePointerForCuDNN(gCtx, in, 
instName);
@@ -1176,7 +1194,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                long t0=0;
                cudnnTensorDescriptor srcTensorDesc = 
in.getGPUObject(gCtx).getTensorDescriptor();
                if(N*CHW >= maxNumDoublesOfCuDNNTensor ||  srcTensorDesc == 
null) {
-                       LOG.trace("GPU : relu custom kernel" + ", GPUContext=" 
+ gCtx);
+                       if(LOG.isTraceEnabled()) {
+                               LOG.trace("GPU : relu custom kernel" + ", 
GPUContext=" + gCtx);
+                       }
                        // Invokes relu(double* A,  double* ret, int rlen, int 
clen)
                        if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
                        Pointer dstData = getDensePointerForCuDNN(gCtx, output, 
instName);
@@ -1193,8 +1213,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
        /**
         * Convenience method to get jcudaDenseMatrixPtr. This method 
explicitly converts sparse to dense format, so use it judiciously.
+        * 
         * @param gCtx a valid {@link GPUContext}
         * @param image input matrix object
+        * @param instName name of the instruction
         * @return jcuda pointer
         * @throws DMLRuntimeException if error occurs while sparse to dense 
conversion
         */

http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index d041fc4..ef32481 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -1176,6 +1176,11 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
         * @return number of non-zeros
         */
        public long recomputeNonZeros() {
+               return recomputeNonZeros(null);
+       }
+       
+       public long recomputeNonZeros(String opcode) {
+               long t1 = opcode != null && DMLScript.STATISTICS && 
DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
                if( sparse && sparseBlock!=null ) { //SPARSE (max long)
                        //note: rlen might be <= sparseBlock.numRows()
                        nonZeros = sparseBlock.size(0, sparseBlock.numRows());
@@ -1188,6 +1193,10 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                                nnz += (a[i]!=0) ? 1 : 0;
                        nonZeros = nnz;
                }
+               if(opcode != null && DMLScript.STATISTICS && 
DMLScript.FINEGRAINED_STATISTICS) {
+                       long t2 = System.nanoTime();
+                       GPUStatistics.maintainCPMiscTimes(opcode, 
CPInstruction.MISC_TIMER_RECOMPUTE_NNZ, t2-t1);
+               }
                return nonZeros;
        }
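
The new recomputeNonZeros(String opcode) overload applies the same 
fine-grained timing pattern seen in the GPU code paths: sample 
System.nanoTime() only when statistics are enabled, do the actual work, then 
attribute the elapsed time to the instruction's opcode. A minimal sketch of 
the pattern, with a hypothetical recordTime sink standing in for 
GPUStatistics.maintainCPMiscTimes:

    public class TimedNnzSketch {
        static boolean statsEnabled = true; // stands in for the DMLScript flags

        static long recomputeNonZeros(double[] denseBlock, String opcode) {
            long t1 = (statsEnabled && opcode != null) ? System.nanoTime() : 0;
            long nnz = 0;
            for (double v : denseBlock)      // the scan being timed
                nnz += (v != 0) ? 1 : 0;
            if (statsEnabled && opcode != null)
                recordTime(opcode, System.nanoTime() - t1); // hypothetical sink
            return nnz;
        }

        static void recordTime(String opcode, long nanos) {
            System.out.println(opcode + ": " + nanos + " ns");
        }
    }
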
        
