Repository: systemml
Updated Branches:
  refs/heads/master af9cc8a90 -> 4d3216678


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 82a76b6..cdb69f9 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -211,7 +211,7 @@ public class LibMatrixCUDA {
                return gCtx.getCublasHandle();
        }
 
-       protected static JCudaKernels getCudaKernels(GPUContext gCtx) {
+       public static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
                return gCtx.getKernels();
        }
        
@@ -244,7 +244,7 @@ public class LibMatrixCUDA {
         */
        public static Pointer one() {
                if(_one == null || oldDataTypeSize != sizeOfDataType) {
-                       _one = dataTypePointerTo(1.0);
+                       _one = _dataTypePointerTo(1.0);
                        oldDataTypeSize = sizeOfDataType;
                }
                return _one;
@@ -255,7 +255,7 @@ public class LibMatrixCUDA {
         */
        public static Pointer zero() {
                if(_zero == null  || oldDataTypeSize != sizeOfDataType) {
-                       _zero = dataTypePointerTo(0.0);
+                       _zero = _dataTypePointerTo(0.0);
                        oldDataTypeSize = sizeOfDataType;
                }
                return _zero;
@@ -268,11 +268,11 @@ public class LibMatrixCUDA {
         * @param instName  the invoking instruction's name for record {@link Statistics}.
         * @return jcuda pointer
         */
-       protected static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) {
+       public static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
                if(isInSparseFormat(gCtx, input)) {
                        input.getGPUObject(gCtx).sparseToDense(instName);
                }
-               return input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+               return input.getGPUObject(gCtx).getDensePointer();
        }
 
        /**
@@ -289,7 +289,7 @@ public class LibMatrixCUDA {
                return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        }
        
-       protected static Pointer dataTypePointerTo(double value) {
+       private static Pointer _dataTypePointerTo(double value) {
                if(sizeOfDataType == Sizeof.DOUBLE) {
                        return Pointer.to(new double[] { value });
                }
@@ -301,6 +301,18 @@ public class LibMatrixCUDA {
                }
        }
        
+       protected static Pointer dataTypePointerTo(double value) {
+               if(value == 1) {
+                       return one();
+               }
+               else if(value == 0) {
+                       return zero();
+               }
+               else {
+                       return _dataTypePointerTo(value);
+               }
+       }
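
The change above routes dataTypePointerTo(1.0) and dataTypePointerTo(0.0) through the cached one()/zero() host pointers, rebuilding them only when sizeOfDataType changes. A minimal standalone sketch of that lazy-caching pattern, with hypothetical names (ScalarPointerCache, box) that are not part of SystemML:

    // Sketch: cache host pointers for the common scalar constants 0.0 and 1.0,
    // invalidating the cache whenever the configured data-type size changes.
    final class ScalarPointerCache {
        private static jcuda.Pointer one, zero;
        private static int cachedSize = -1;

        static synchronized jcuda.Pointer of(double value, int sizeOfDataType) {
            if (cachedSize != sizeOfDataType) { one = null; zero = null; cachedSize = sizeOfDataType; }
            if (value == 1.0) return one != null ? one : (one = box(1.0, sizeOfDataType));
            if (value == 0.0) return zero != null ? zero : (zero = box(0.0, sizeOfDataType));
            return box(value, sizeOfDataType); // uncached path for other constants
        }

        private static jcuda.Pointer box(double value, int sizeOfDataType) {
            return sizeOfDataType == jcuda.Sizeof.DOUBLE
                ? jcuda.Pointer.to(new double[] { value })
                : jcuda.Pointer.to(new float[] { (float) value });
        }
    }
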
+       
 
        /**
         * This method computes the backpropagation errors for previous layer of relu operation
@@ -355,8 +367,7 @@ public class LibMatrixCUDA {
                Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType);
                reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
                reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW));
-               gCtx.cudaFreeHelper(tmp);
-
+               gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
        }
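
The cudaFreeHelper calls in this commit now take the instruction name plus DMLScript.EAGER_CUDA_FREE, so the caller chooses between releasing a temporary buffer immediately and deferring the free. A simplified sketch of that flag-driven behavior (this is not the actual GPUMemoryManager logic, just the shape of it):

    import java.util.ArrayDeque;
    import java.util.Deque;
    import jcuda.Pointer;
    import static jcuda.runtime.JCuda.cudaFree;

    // Sketch: eager frees release device memory right away; lazy frees are queued
    // so the allocator can reuse or batch-release them later.
    class FreeHelperSketch {
        private final Deque<Pointer> lazyFreeQueue = new ArrayDeque<>();

        void cudaFreeHelper(String instName, Pointer ptr, boolean eagerFree) {
            if (eagerFree)
                cudaFree(ptr);
            else
                lazyFreeQueue.add(ptr);
        }
    }
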
 
        /**
@@ -387,9 +398,9 @@ public class LibMatrixCUDA {
                if(bias.getNumColumns() != 1 || cols % K != 0) {
                        throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
                }
-               Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
-               Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
-               Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+               Pointer imagePointer = input.getGPUObject(gCtx).getDensePointer();
+               Pointer biasPointer = bias.getGPUObject(gCtx).getDensePointer();
+               Pointer outputPointer = outputBlock.getGPUObject(gCtx).getDensePointer();
                long t1 = 0;
                if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("bias_multiply",
@@ -729,7 +740,7 @@ public class LibMatrixCUDA {
                        default:
                                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
                        }
-                       gCtx.cudaFreeHelper(instName, tmp);
+                       gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
                        break;
                }
                case OP_MEAN:{
@@ -842,7 +853,7 @@ public class LibMatrixCUDA {
                                ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
                                matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);

-                               gCtx.cudaFreeHelper(instName, tmpRow);
+                               gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE);
 
                                break;
                        }
@@ -860,15 +871,15 @@ public class LibMatrixCUDA {
                                ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
                                matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);

-                               gCtx.cudaFreeHelper(instName, tmpCol);
+                               gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE);

                                break;
                        }
                        default:
                                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
                        }
-                       gCtx.cudaFreeHelper(instName, tmp);
-                       gCtx.cudaFreeHelper(instName, tmp2);
+                       gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
+                       gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE);
                        break;
                }
                case OP_MAXINDEX : {
@@ -923,7 +934,7 @@ public class LibMatrixCUDA {
                int[] tmp = getKernelParamsForReduceAll(gCtx, n);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-               Pointer tempOut = gCtx.allocate(instName, n * sizeOfDataType);
+               Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType); 
 
                long t1=0,t2=0;
 
@@ -944,7 +955,7 @@ public class LibMatrixCUDA {
                }
                double[] result = {-1f};
                cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false);
-               gCtx.cudaFreeHelper(instName, tempOut);
+               gCtx.cudaFreeHelper(instName, tempOut, DMLScript.EAGER_CUDA_FREE);
                return result[0];
        }
 
@@ -1699,6 +1710,36 @@ public class LibMatrixCUDA {
                        if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
                }
        }
+       
+       /**
+        * Computes C = t(A)
+        * @param ec execution context
+        * @param gCtx gpu context
+        * @param instName name of the instruction
+        * @param A pointer to the input matrix
+        * @param C pointer to the output matrix
+        * @param numRowsA number of rows of the input matrix
+        * @param numColsA number of columns of the input matrix
+        * @throws DMLRuntimeException if error
+        */
+       public static void denseTranspose(ExecutionContext ec, GPUContext gCtx, String instName,
+                       Pointer A, Pointer C, long numRowsA, long numColsA) throws DMLRuntimeException {
+               if (ec.getGPUContext(0) != gCtx)
+                       throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+               if(LOG.isTraceEnabled()) {
+                       LOG.trace("GPU : dense transpose" + ", GPUContext=" + gCtx);
+               }
+               long t0=0;
+               // Dense-Dense dgeam
+               int lda = toInt(numColsA);
+               int ldb = lda;
+               int m = toInt(numRowsA);
+               int n = lda;
+               int ldc = m;
+               if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
+               cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_T, CUBLAS_OP_T, m, n, one(), A, lda, zero(), A, ldb, C, ldc);
+               if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
+       }
 
 
        //********************************************************************/
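
The new denseTranspose computes C = t(A) with a single geam call by exploiting that a row-major numRowsA x numColsA buffer is, to cuBLAS, a column-major numColsA x numRowsA matrix. A self-contained double-precision sketch of the same trick using JCublas2 directly (SystemML instead dispatches through cudaSupportFunctions so the call also works in single precision):

    import jcuda.Pointer;
    import jcuda.jcublas.JCublas2;
    import jcuda.jcublas.cublasHandle;
    import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;

    class DenseTransposeSketch {
        // dC (rowsA x colsA values, read row-major as colsA x rowsA) receives t(A).
        static void transpose(cublasHandle handle, Pointer dA, Pointer dC, int rowsA, int colsA) {
            Pointer one  = Pointer.to(new double[] { 1.0 });
            Pointer zero = Pointer.to(new double[] { 0.0 });
            int lda = colsA, ldc = rowsA;
            // geam sees the column-major view of A; transposing that view yields
            // the row-major transpose in C.
            JCublas2.cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, rowsA, colsA,
                    one, dA, lda, zero, dA, lda, dC, ldc);
        }
    }
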
@@ -2389,7 +2430,7 @@ public class LibMatrixCUDA {
                if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                ATobj.denseRowMajorToColumnMajor();
                if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
-               Pointer A = ATobj.getJcudaDenseMatrixPtr();
+               Pointer A = ATobj.getDensePointer();
 
                if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                GPUObject bTobj = (GPUObject) bobj.clone();
@@ -2399,7 +2440,7 @@ public class LibMatrixCUDA {
                if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
 
 
-               Pointer b = bTobj.getJcudaDenseMatrixPtr();
+               Pointer b = bTobj.getDensePointer();
 
                // The following set of operations is done following the example in the cusolver documentation
                // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
@@ -2447,12 +2488,12 @@ public class LibMatrixCUDA {
                // TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
                // There is an avoidable copy happening here
                MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
-               cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
+               cudaMemcpy(out.getGPUObject(gCtx).getDensePointer(), bTobj.getDensePointer(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
 
-               gCtx.cudaFreeHelper(instName, work);
-               gCtx.cudaFreeHelper(instName, tau);
-               ATobj.clearData();
-               bTobj.clearData();
+               gCtx.cudaFreeHelper(instName, work, DMLScript.EAGER_CUDA_FREE);
+               gCtx.cudaFreeHelper(instName, tau, DMLScript.EAGER_CUDA_FREE);
+               ATobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
+               bTobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
 
                //debugPrintMatrix(b, n, 1);
     }
@@ -2501,6 +2542,39 @@ public class LibMatrixCUDA {
                                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0);
                return mb.getKey();
        }
-
-
-}
+       
+//     // Small 1-int pointers to avoid unnecessary allocation/deallocation
+//     private static Pointer _TMP_NNZ_ROW_PTR = null;
+//     private static Pointer _TMP_NNZ_PTR = null;
+       /**
+        * Utility to compute number of non-zeroes on the GPU
+        * 
+        * @param gCtx the associated GPUContext
+        * @param densePtr device pointer to the dense matrix
+        * @param length length of the dense pointer
+        * @return the number of non-zeroes
+        */
+       public static synchronized int computeNNZ(GPUContext gCtx, Pointer densePtr, int length) {
+               return (int) reduceAll(gCtx, null, "compute_nnz", densePtr, length);
+               // This is extremely slow
+//             cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
+//             cusparseHandle cusparseHandle = gCtx.getCusparseHandle();
+//             if(_TMP_NNZ_ROW_PTR == null) {
+//                     // As these are 4-byte pointers, using cudaMalloc directly so as not to include them in memory information.
+//                     _TMP_NNZ_ROW_PTR = new Pointer();
+//                     cudaMalloc(_TMP_NNZ_ROW_PTR, jcuda.Sizeof.INT);
+//                     _TMP_NNZ_PTR = new Pointer();
+//                     cudaMalloc(_TMP_NNZ_PTR, jcuda.Sizeof.INT);
+//                     // _TMP_NNZ_ROW_PTR = gCtx.allocate(jcuda.Sizeof.INT);
+//                     // _TMP_NNZ_PTR = gCtx.allocate(jcuda.Sizeof.INT);
+//             }
+//             // Output is in dense vector format, convert it to CSR
+//             LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, 1, length, matDescr, densePtr, 1,
+//                             _TMP_NNZ_ROW_PTR, _TMP_NNZ_PTR);
+//             int[] nnzC = { -1 };
+//             cudaMemcpy(Pointer.to(nnzC), _TMP_NNZ_PTR, jcuda.Sizeof.INT, cudaMemcpyDeviceToHost);
+//             return nnzC[0];
+       }
+
+
+}
\ No newline at end of file
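
computeNNZ now reuses the generic reduceAll path with a compute_nnz kernel instead of the cusparse<t>nnz route kept above as commented-out code. Its contract is simply the number of non-zero values in a dense device buffer; a host-side reference of that semantics (a hypothetical test helper, not part of the commit) is:

    import java.util.stream.IntStream;

    class NnzReference {
        // Host-side analogue of the compute_nnz reduction: count entries != 0.
        static int computeNNZ(double[] dense) {
            return (int) IntStream.range(0, dense.length)
                    .parallel()
                    .filter(i -> dense[i] != 0.0)
                    .count();
        }
    }
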

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 26a4d2e..2bfb8f2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -215,7 +215,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
                                Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType);
                                LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);
-                               gCtx.cudaFreeHelper(instName, im2colPointer);
+                               gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE);

                                // Perform reorg_knpq a reorg operation of matmultOutputPointer matrix with dimensions [K, NPQ]
                                // and return a matrix dstPointer with dimensions [N, KPQ]
@@ -224,7 +224,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                                                matmultOutputPointer, 
dstPointer, NKPQ, NPQ, KPQ, P*Q);
                                if (DMLScript.FINEGRAINED_STATISTICS)
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DENSE_REORG_KNPQ_KERNEL, System.nanoTime() - t1);
-                               gCtx.cudaFreeHelper(instName, 
matmultOutputPointer);
+                               gCtx.cudaFreeHelper(instName, 
matmultOutputPointer, DMLScript.EAGER_CUDA_FREE);
                        }
                        else {
                                // Filter and output are accounted as dense in 
the memory estimation for conv2d
@@ -444,7 +444,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 
                                                }
                                                // Deallocate temporary array to hold one element of input
-                                               gCtx.cudaFreeHelper(tempdwPointer, true);
+                                               gCtx.cudaFreeHelper(instName, tempdwPointer, true);
                                        }
                                }
                        }
@@ -772,7 +772,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        long t4=0;
                        if (DMLScript.FINEGRAINED_STATISTICS) t4 = System.nanoTime();
                        if(!isMaxPoolOutputProvided)
-                               gCtx.cudaFreeHelper(instName, y);
+                               gCtx.cudaFreeHelper(instName, y, DMLScript.EAGER_CUDA_FREE);
                        if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
                }
        }
@@ -818,17 +818,15 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
                long N = in.getNumRows();
                long CHW = in.getNumColumns();
-               MatrixObject output = ec.getMatrixObject(outputName);
-               getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
+               Pointer dstData = getDenseOutputPointer(ec, gCtx, instName, outputName, in.getNumRows(), in.getNumColumns());
                long t0=0;
                if(N*CHW >= maxNumElementsOfCuDNNTensor) {
                        if(LOG.isTraceEnabled()) {
                                LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
                        }
                        // Invokes relu(double* A,  double* ret, int rlen, int clen)
-                       if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
-                       Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName);
                        Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
+                       if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
                        getCudaKernels(gCtx).launchKernel("relu",
                                        ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
                                        srcData, dstData, toInt(N), toInt(CHW));
@@ -838,11 +836,18 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                        cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
                        cudnnCreateTensorDescriptor(tensorDescriptor);
                        cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
-                       cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
+                       cudnnReLU(gCtx, instName, in, dstData, tensorDescriptor);
                        cudnnDestroyTensorDescriptor(tensorDescriptor);
                }
        }
-
+       
+       private static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
+                       long numRows, long numCols) throws DMLRuntimeException {
+               MatrixObject output = ec.getMatrixObject(outputName);
+               getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix
+               return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+       }
+       
        /**
         * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
         * 
@@ -858,6 +863,33 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                }
                return getDensePointer(gCtx, image, instName);
        }
+       
+       /**
+        * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+        * 
+        * @param gCtx a valid {@link GPUContext}
+        * @param image input matrix object
+        * @param instName name of the instruction
+        * @param numRows expected number of rows
+        * @param numCols expected number of columns
+        * @return jcuda pointer
+        * @throws DMLRuntimeException if error occurs while sparse to dense conversion
+        */
+       public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+               long numElems = image.getNumRows()*image.getNumColumns();
+               if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
+                       throw new DMLRuntimeException("Expected input of size:[" +  numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "].");
+               }
+               else if(numElems > maxNumElementsOfCuDNNTensor) {
+                       throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
+               }
+               Pointer ptr = getDensePointer(gCtx, image, instName);
+               long sizeOfPtr = gCtx.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
+               if(sizeOfPtr != numElems*sizeOfDataType) {
+                       throw new DMLRuntimeException("Incorrect pointer: expected size:" +  (numElems*sizeOfDataType) + ", but found " + sizeOfPtr);
+               }
+               return ptr;
+       }
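
The new getDensePointerForCuDNN overload checks the logical shape and then compares the allocation recorded by the memory manager against numRows*numCols*sizeOfDataType before handing the pointer to CuDNN. The guard reduces to a single size comparison; a condensed sketch with hypothetical names:

    // Sketch: fail fast when a device buffer does not match the expected dense shape.
    class DensePointerGuard {
        static void check(long rows, long cols, long bytesPerValue, long allocatedBytes) {
            long expectedBytes = rows * cols * bytesPerValue;
            if (allocatedBytes != expectedBytes)
                throw new IllegalStateException("Incorrect pointer: expected size " + expectedBytes
                        + " for [" + rows + " x " + cols + "], but found " + allocatedBytes);
        }
    }
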
 
        /**
         * Convenience method for checking the status of CuDNN kernel.
@@ -868,4 +900,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
                if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
                        throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index a50dbc3..432e79e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -97,7 +97,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
                        cudnnDestroyConvolutionDescriptor(convDesc);
                if(sizeInBytes != 0) {
                        try {
-                               gCtx.cudaFreeHelper(instName, workSpace);
+                               gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
                        } catch (DMLRuntimeException e) {
                                throw new RuntimeException(e);
                        }
@@ -276,4 +276,4 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
                cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
                return convDesc;
        }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index f52da30..f3ce70d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -76,7 +76,7 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java
        @Override
        public void close() {
                try {
-                       gCtx.cudaFreeHelper(outPointer, true);
+                       gCtx.cudaFreeHelper(null, outPointer, true);
                } catch (DMLRuntimeException e) {
                        throw new RuntimeException(e);
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index f476dfe..60b2352 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -285,7 +285,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
                                        toInt(outRLen), C, toInt(outCLen));
                        if (!DMLScript.EAGER_CUDA_FREE)
                                JCuda.cudaDeviceSynchronize();
-                       gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+                       gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE);
                        if (DMLScript.FINEGRAINED_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
                                                - t0);
@@ -466,4 +466,4 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
        private static int reverseCusparseOp(int trans) {
                return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
        }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index 9fec026..5fd642e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -315,7 +315,7 @@ public class LibMatrixNative
                return ret2;
        }
        
-       private static void fromFloatBuffer(FloatBuffer buff, double[] output) {
+       public static void fromFloatBuffer(FloatBuffer buff, double[] output) {
                Arrays.parallelSetAll(output, i -> (double)buff.get(i) );
        }
 }
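
fromFloatBuffer is made public so the single-precision GPU path can reuse it to widen a float staging buffer into the double[] that a CPU matrix block expects; Arrays.parallelSetAll performs the widening in one parallel pass over the indices. A small usage example under that reading:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.nio.FloatBuffer;
    import java.util.Arrays;

    class FromFloatBufferExample {
        public static void main(String[] args) {
            FloatBuffer buff = ByteBuffer.allocateDirect(4 * Float.BYTES)
                    .order(ByteOrder.nativeOrder()).asFloatBuffer();
            buff.put(new float[] { 1.5f, -2.0f, 0.0f, 3.25f });
            double[] output = new double[4];
            // Same idea as LibMatrixNative.fromFloatBuffer: widen each float in parallel.
            Arrays.parallelSetAll(output, i -> (double) buff.get(i));
            System.out.println(Arrays.toString(output)); // [1.5, -2.0, 0.0, 3.25]
        }
    }
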

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index d5edf48..3bd101c 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -22,6 +22,11 @@ import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.util.stream.IntStream;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -163,7 +168,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
        
        @Override
        public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) {
-               long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != null? System.nanoTime() : 0;
+               long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                // We invoke transfer matrix from device to host in two cases:
                // 1. During eviction of unlocked matrices
                // 2. During acquireHostRead
@@ -177,40 +182,46 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
                        Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
                        LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
                        cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-                       gCtx.cudaFreeHelper(deviceDoubleData);
+                       gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
                }
                else {
                        LOG.debug("Potential OOM: Allocated additional space on host in deviceToHost");
-                       float [] floatData = new float[dest.length];
+                       FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*dest.length).order(ByteOrder.nativeOrder()).asFloatBuffer();
                        cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
-                       for(int i = 0; i < dest.length; i++) {
-                               dest[i] = floatData[i];
-                       }
+                       LibMatrixNative.fromFloatBuffer(floatData, dest);
+               }
+               if(DMLScript.STATISTICS) {
+                       long totalTime = System.nanoTime() - t0;
+                       GPUStatistics.cudaFloat2DoubleTime.add(totalTime);
+                       GPUStatistics.cudaFloat2DoubleCount.add(1);
+                       if(DMLScript.FINEGRAINED_STATISTICS && instName != null)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, totalTime);
                }
-               if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
-                       GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
        }
 
        @Override
        public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) {
                LOG.debug("Potential OOM: Allocated additional space in hostToDevice");
                // TODO: Perform conversion on GPU using double2float and float2double kernels
-               long t1 = DMLScript.FINEGRAINED_STATISTICS  && instName != null? System.nanoTime() : 0;
+               long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                if(PERFORM_CONVERSION_ON_DEVICE) {
                        Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
                        cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
                        LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
-                       gCtx.cudaFreeHelper(deviceDoubleData);
+                       gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
                }
                else {
-                       float [] floatData = new float[src.length];
-                       for(int i = 0; i < src.length; i++) {
-                               floatData[i] = (float) src[i];
-                       }
+                       FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*src.length).order(ByteOrder.nativeOrder()).asFloatBuffer();
+                       IntStream.range(0, src.length).parallel().forEach(i -> floatData.put(i, (float)src[i]));
                        cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
                }
                
-               if(DMLScript.FINEGRAINED_STATISTICS && instName != null) 
-                       GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+               if(DMLScript.STATISTICS) {
+                       long totalTime = System.nanoTime() - t0;
+                       GPUStatistics.cudaDouble2FloatTime.add(totalTime);
+                       GPUStatistics.cudaDouble2FloatCount.add(1);
+                       if(DMLScript.FINEGRAINED_STATISTICS && instName != null)
+                               GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, totalTime);
+               }
        }
-}
+}
\ No newline at end of file
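
The rewritten deviceToHost/hostToDevice avoid the intermediate float[] copy by staging data in a direct, native-order FloatBuffer that JCuda can address via Pointer.to(Buffer), with the double/float conversion done as a parallel pass. A condensed sketch of the host-to-device direction under those assumptions (error handling and statistics omitted):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.nio.FloatBuffer;
    import java.util.stream.IntStream;
    import jcuda.Pointer;
    import jcuda.Sizeof;
    import static jcuda.runtime.JCuda.cudaMemcpy;
    import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;

    class FloatStagingSketch {
        // Copy a double[] into a float-typed device buffer via a direct staging buffer.
        static void hostToDevice(double[] src, Pointer devFloatDst) {
            FloatBuffer staging = ByteBuffer.allocateDirect(Sizeof.FLOAT * src.length)
                    .order(ByteOrder.nativeOrder()).asFloatBuffer();
            // Narrow double -> float in parallel, using absolute puts.
            IntStream.range(0, src.length).parallel().forEach(i -> staging.put(i, (float) src[i]));
            cudaMemcpy(devFloatDst, Pointer.to(staging), ((long) src.length) * Sizeof.FLOAT,
                    cudaMemcpyHostToDevice);
        }
    }
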

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index d12f4dd..f7bee4f 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -52,6 +52,10 @@ public class GPUStatistics {
        public static LongAdder cudaToDevTime = new LongAdder();             // time spent in copying data from host (CPU) to device (GPU) memory
        public static LongAdder cudaFromDevTime = new LongAdder();           // time spent in copying data from device to host
        public static LongAdder cudaEvictTime = new LongAdder();             // time spent in eviction
+       public static LongAdder cudaEvictMallocTime = new LongAdder();      // time spent in malloc during eviction
+       public static LongAdder cudaFloat2DoubleTime = new LongAdder();         // time spent in converting float to double during eviction
+       public static LongAdder cudaDouble2FloatTime = new LongAdder();         // time spent in converting double to float during eviction
+       public static LongAdder cudaEvictMemcpyTime = new LongAdder();          // time spent in cudaMemcpy kernel during eviction
        public static LongAdder cudaForcedClearLazyFreedEvictTime = new LongAdder(); // time spent in forced lazy eviction
        public static LongAdder cudaForcedClearUnpinnedEvictTime = new LongAdder(); // time spent in forced unpinned eviction
        public static LongAdder cudaAllocCount = new LongAdder();
@@ -60,6 +64,9 @@ public class GPUStatistics {
        public static LongAdder cudaToDevCount = new LongAdder();
        public static LongAdder cudaFromDevCount = new LongAdder();
        public static LongAdder cudaEvictionCount = new LongAdder();
+       public static LongAdder cudaFloat2DoubleCount = new LongAdder();
+       public static LongAdder cudaDouble2FloatCount = new LongAdder();
+       public static LongAdder cudaEvictionMallocCount = new LongAdder();
 
        // Per instruction miscellaneous timers.
        // Used to record events in a CP Heavy Hitter instruction and
@@ -88,6 +95,11 @@ public class GPUStatistics {
                cudaToDevTime.reset();
                cudaFromDevTime.reset();
                cudaEvictTime.reset();
+               cudaEvictMallocTime.reset();
+               cudaFloat2DoubleTime.reset();
+               cudaDouble2FloatTime.reset();
+               cudaFloat2DoubleCount.reset();
+               cudaDouble2FloatCount.reset();
                cudaForcedClearLazyFreedEvictTime.reset();
                cudaForcedClearUnpinnedEvictTime.reset();
                cudaAllocCount.reset();
@@ -95,6 +107,7 @@ public class GPUStatistics {
                cudaToDevCount.reset();
                cudaFromDevCount.reset();
                cudaEvictionCount.reset();
+               cudaEvictionMallocCount.reset();
                resetMiscTimers();
        }
 
@@ -193,21 +206,27 @@ public class GPUStatistics {
                sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
                                + String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
                sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
-               sb.append("GPU mem tx time  (alloc/dealloc/set0/toDev/fromDev/evict):\t"
+               sb.append("GPU mem tx time  (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
                                + String.format("%.3f", cudaAllocTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", cudaDeAllocTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", cudaMemSet0Time.longValue()*1e-9) + "/"
-                               + String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "/"
-                               + String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "/"
-                               + String.format("%.3f", cudaEvictTime.longValue()*1e-9) + " sec.\n");
-               sb.append("GPU mem tx count (alloc/dealloc/set0/toDev/fromDev/evict):\t"
+                               + String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "("
+                               + String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9)+ ")/"
+                               + String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "("
+                               + String.format("%.3f", cudaFloat2DoubleTime.longValue()*1e-9) + ")/"
+                               + String.format("%.3f", cudaEvictTime.longValue()*1e-9) + "("
+                               + String.format("%.3f", cudaEvictMallocTime.longValue()*1e-9) + ") sec.\n");
+               sb.append("GPU mem tx count (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
                                + cudaAllocCount.longValue() + "/"
                                + cudaDeAllocCount.longValue() + "/"
                                + cudaMemSet0Count.longValue() + "/"
                                + cudaSparseConversionCount.longValue() + "/"
-                               + cudaToDevCount.longValue() + "/"
-                               + cudaFromDevCount.longValue() + "/"
-                               + cudaEvictionCount.longValue() + ".\n");
+                               + cudaToDevCount.longValue() + "("
+                               + cudaDouble2FloatCount.longValue() + ")/"
+                               + cudaFromDevCount.longValue() + "("
+                               + cudaFloat2DoubleCount.longValue() + ")/"
+                               + cudaEvictionCount.longValue() + "("
+                               + cudaEvictionMallocCount.longValue() + ").\n");
                sb.append("GPU conversion time  (sparseConv/sp2dense/dense2sp):\t"
                                + String.format("%.3f", cudaSparseConversionTime.longValue()*1e-9) + "/"
                                + String.format("%.3f", cudaSparseToDenseTime.longValue()*1e-9) + "/"
@@ -221,4 +240,4 @@ public class GPUStatistics {
        }
 
 
-}
+}
\ No newline at end of file
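
The new LongAdder pairs (cudaFloat2DoubleTime/Count, cudaDouble2FloatTime/Count, cudaEvictMallocTime with cudaEvictionMallocCount) follow the existing convention: the call site measures a nanoTime delta, bumps both adders, and the summary prints the sub-time in parentheses next to the enclosing transfer time. A hedged sketch of how one of the new pairs is recorded, mirroring the SinglePrecisionCudaSupportFunctions change above:

    import java.util.concurrent.atomic.LongAdder;

    class ConversionStatsSketch {
        static final LongAdder cudaDouble2FloatTime  = new LongAdder();
        static final LongAdder cudaDouble2FloatCount = new LongAdder();

        // Wrap a double->float conversion and record its duration and count.
        static void timedDouble2Float(Runnable conversion, boolean statisticsEnabled) {
            long t0 = statisticsEnabled ? System.nanoTime() : 0;
            conversion.run();
            if (statisticsEnabled) {
                cudaDouble2FloatTime.add(System.nanoTime() - t0);
                cudaDouble2FloatCount.add(1);
            }
        }
    }
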
