http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index d858b0b..b4cb87d 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -18,34 +18,25 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import static jcuda.driver.JCudaDriver.cuCtxCreate;
-import static jcuda.driver.JCudaDriver.cuCtxGetCurrent;
-import static jcuda.driver.JCudaDriver.cuDeviceGet;
-import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 import static jcuda.driver.JCudaDriver.cuModuleLoadDataEx;
-import static jcuda.driver.JCudaDriver.cuModuleUnload;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 
-import jcuda.runtime.JCuda;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.io.IOUtilFunctions;
 
-import jcuda.CudaException;
 import jcuda.Pointer;
-import jcuda.driver.CUcontext;
-import jcuda.driver.CUdevice;
 import jcuda.driver.CUfunction;
 import jcuda.driver.CUmodule;
 import jcuda.driver.CUresult;
 
 /**
- * Utility class that allows LibMatrixCUDA as well as JCudaObject to invoke 
custom CUDA kernels.
+ * Utility class that allows LibMatrixCUDA as well as GPUObject to invoke 
custom CUDA kernels.
  * 
  * The utility org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels 
simplifies the launching of the kernels. 
  * For example: to launch a kernel 
@@ -54,70 +45,23 @@ import jcuda.driver.CUresult;
  */
 public class JCudaKernels {
 
-       private static String ptxFileName = "/kernels/SystemML.ptx";
+       private final static String ptxFileName = "/kernels/SystemML.ptx";
        private HashMap<String, CUfunction> kernels = new HashMap<String, 
CUfunction>();
        private CUmodule module;
+       private final int deviceNum;
        
        /**
         * Loads the kernels in the file ptxFileName. Though cubin files are 
also supported, we will stick with
         * ptx file as they are target-independent similar to Java's .class 
files.
-        * 
+        * @param deviceNum  the device number for which to initiate the driver 
API
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public JCudaKernels() throws DMLRuntimeException {
-               shutdown();
-               initCUDA();
+       JCudaKernels(int deviceNum) throws DMLRuntimeException {
+               this.deviceNum = deviceNum;
                module = new CUmodule();
                // Load the kernels specified in the ptxFileName file
                checkResult(cuModuleLoadDataEx(module, 
initKernels(ptxFileName), 0, new int[0], Pointer.to(new int[0])));
        }
-       
-       /**
-     * Initializes the JCuda driver API. Then it will try to attach to the 
-     * current CUDA context. If no active CUDA context exists, then it will 
-     * try to create one, for the device which is specified by the current 
-     * deviceNumber.
-     * 
-        * @throws DMLRuntimeException If it is neither possible to attach to 
an 
-     * existing context, nor to create a new context.
-     */
-    private static void initCUDA() throws DMLRuntimeException {
-        checkResult(cuInit(0));
-
-        // Try to obtain the current context
-        CUcontext context = new CUcontext();
-        checkResult(cuCtxGetCurrent(context));
-        
-        // If the context is 'null', then a new context
-        // has to be created.
-        CUcontext nullContext = new CUcontext(); 
-        if (context.equals(nullContext)) {
-            createContext();
-        }
-    }
-    
-    /**
-     * Tries to create a context for device 'deviceNumber'.
-     * @throws DMLRuntimeException 
-     * 
-     * @throws CudaException If the device can not be 
-     * accessed or the context can not be created
-     */
-    private static void createContext() throws DMLRuntimeException {
-       int deviceNumber = 0;
-        CUdevice device = new CUdevice();
-        checkResult(cuDeviceGet(device, deviceNumber));
-        CUcontext context = new CUcontext();
-        checkResult(cuCtxCreate(context, 0, device));
-    }
-    
-       /**
-        * Performs cleanup actions such as unloading the module
-        */
-       public void shutdown() {
-               if(module != null)
-                       cuModuleUnload(module);
-       }
 
        /**
         * Setups the kernel parameters and launches the kernel using 
cuLaunchKernel API. 
@@ -167,7 +111,7 @@ public class JCudaKernels {
                                config.gridDimX, config.gridDimY, 
config.gridDimZ, 
                                config.blockDimX, config.blockDimY, 
config.blockDimZ, 
                                config.sharedMemBytes, config.stream, 
Pointer.to(kernelParams), null));
-               JCuda.cudaDeviceSynchronize();
+               //JCuda.cudaDeviceSynchronize();
        }
        
     public static void checkResult(int cuResult) throws DMLRuntimeException {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
deleted file mode 100644
index bc0031b..0000000
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ /dev/null
@@ -1,1330 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sysml.runtime.instructions.gpu.context;
-
-import jcuda.Pointer;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnTensorDescriptor;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseDirection;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.jcusparse.cusparseMatDescr;
-import jcuda.jcusparse.cusparsePointerMode;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.caching.CacheException;
-import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
-import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
-import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
-import org.apache.sysml.runtime.matrix.data.SparseBlock;
-import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
-import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
-import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
-import org.apache.sysml.utils.GPUStatistics;
-import org.apache.sysml.utils.LRUCacheMap;
-
-import java.util.HashMap;
-import java.util.LinkedList;
-
-import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
-import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
-import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
-import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
-import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
-import static jcuda.jcusparse.JCusparse.cusparseDnnz;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
-import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
-import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
-import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
-import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
-import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
-import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemcpy;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
-/**
- * Handle to a matrix block on the GPU
- */
-public class JCudaObject extends GPUObject {
-
-       private static final Log LOG = 
LogFactory.getLog(JCudaObject.class.getName());
-       
-       // An optional tensor descriptor (and shape) that can be set by a 
tensor instruction such as convolution, maxpooling
-       // and exploited by a subsequent non-tensor instruction such as relu
-       private cudnnTensorDescriptor tensorDescriptor = null;
-       private int [] tensorShape = null;
-       
-       /**
-        * Returns a previously allocated tensor shape or null
-        * @return int array of four elements or null
-        */
-       public int [] getTensorShape() {
-               return tensorShape;
-       }
-       
-       /**
-        * Returns a previously allocated tensor descriptor or null
-        * @return cudnn tensor descriptor
-        */
-       public cudnnTensorDescriptor getTensorDescriptor() {
-               return tensorDescriptor;
-       }
-       
-       /**
-        * Returns a previously allocated or allocates and returns a tensor 
descriptor
-        * @param N number of images
-        * @param C number of channels
-        * @param H height
-        * @param W width
-        * @return cudnn tensor descriptor
-        */
-       public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int 
H, int W) {
-               if(tensorDescriptor == null) {
-                       tensorDescriptor = new cudnnTensorDescriptor();
-                       cudnnCreateTensorDescriptor(tensorDescriptor);
-                       cudnnSetTensor4dDescriptor(tensorDescriptor, 
CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
-                       tensorShape = new int[4];
-                       tensorShape[0] = N;
-                       tensorShape[1] = C;
-                       tensorShape[2] = H;
-                       tensorShape[3] = W;
-               }
-               return tensorDescriptor;
-       }
-       
-       /**
-        * Compressed Sparse Row (CSR) format for CUDA
-        * Generalized matrix multiply is implemented for CSR format in the 
cuSparse library among other operations
-        */
-       public static class CSRPointer {
-               
-               public static cusparseMatDescr matrixDescriptor;
-               
-               /**
-                * @return Singleton default matrix descriptor object 
-                *                      (set with CUSPARSE_MATRIX_TYPE_GENERAL, 
CUSPARSE_INDEX_BASE_ZERO)
-                */
-               public static cusparseMatDescr 
getDefaultCuSparseMatrixDescriptor(){
-                       if (matrixDescriptor == null){
-                               // Code from JCuda Samples - 
http://www.jcuda.org/samples/JCusparseSample.java
-                               matrixDescriptor = new cusparseMatDescr();
-                               cusparseCreateMatDescr(matrixDescriptor);
-                               cusparseSetMatType(matrixDescriptor, 
CUSPARSE_MATRIX_TYPE_GENERAL);
-                               cusparseSetMatIndexBase(matrixDescriptor, 
CUSPARSE_INDEX_BASE_ZERO);
-                       }
-                       return matrixDescriptor;
-               }
-               
-               private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
-
-               /**
-                * Default constructor to help with Factory method {@link 
#allocateEmpty(long, long)}
-                */
-               private CSRPointer() {
-                       val = new Pointer();
-                       rowPtr = new Pointer();
-                       colInd = new Pointer();
-                       allocateMatDescrPointer();
-               }
-               
-               /** Number of non zeroes                                        
                                        */
-               public long nnz;
-               /** double array of non zero values                             
                        */
-               public Pointer val;
-               /** integer array of start of all rows and end of last row + 1  
*/
-               public Pointer rowPtr;
-               /** integer array of nnz values' column indices                 
                */
-               public Pointer colInd;
-               /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL 
supported   */
-               public cusparseMatDescr descr;
-               
-               /** 
-                * Check for ultra sparsity
-                * 
-                * @param rows number of rows
-                * @param cols number of columns
-                * @return true if ultra sparse
-                */
-               public boolean isUltraSparse(int rows, int cols) {
-                       double sp = ((double)nnz/rows/cols);
-                       return sp<ULTRA_SPARSITY_TURN_POINT;
-               }
-               
-               /**
-                * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
-                * the default that works for DGEMM.
-                */
-               private void allocateMatDescrPointer() {                        
-                       this.descr = getDefaultCuSparseMatrixDescriptor();
-               }
-               
-               /**
-                * Estimate the size of a CSR matrix in GPU memory
-                * Size of pointers is not needed and is not added in
-                * @param nnz2  number of non zeroes
-                * @param rows  number of rows 
-                * @return size estimate
-                */
-               public static long estimateSize(long nnz2, long rows) {
-                       long sizeofValArray = getDoubleSizeOf(nnz2);
-                       long sizeofRowPtrArray  = getIntSizeOf(rows + 1);
-                       long sizeofColIndArray = getIntSizeOf(nnz2);
-                       long sizeofDescr = getIntSizeOf(4);
-                       // From the CUSPARSE documentation, the 
cusparseMatDescr in native code is represented as: 
-                       // typedef struct {
-                       //      cusparseMatrixType_t MatrixType;
-                       //      cusparseFillMode_t FillMode;
-                       //      cusparseDiagType_t DiagType;
-                       //      cusparseIndexBase_t IndexBase;
-                       // } cusparseMatDescr_t;
-                       long tot = sizeofValArray + sizeofRowPtrArray + 
sizeofColIndArray + sizeofDescr;
-                       return tot;
-               }
-               
-               /** 
-                * Factory method to allocate an empty CSR Sparse matrix on the 
GPU
-                * @param nnz2  number of non-zeroes
-                * @param rows  number of rows
-                * @return a {@link CSRPointer} instance that encapsulates the 
CSR matrix on GPU
-                * @throws DMLRuntimeException if DMLRuntimeException occurs
-                */
-               public static CSRPointer allocateEmpty(long nnz2, long rows) 
throws DMLRuntimeException {
-                       assert nnz2 > -1 : "Incorrect usage of internal API, 
number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
-                       CSRPointer r = new CSRPointer();
-                       r.nnz = nnz2;
-                       if(nnz2 == 0) {
-                               // The convention for an empty sparse matrix is 
to just have an instance of the CSRPointer object
-                               // with no memory allocated on the GPU.
-                               return r;
-                       }
-                       ensureFreeSpace(getDoubleSizeOf(nnz2) + 
getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
-                       // increment the cudaCount by 1 for the allocation of 
all 3 arrays
-                       r.val = allocate(null, getDoubleSizeOf(nnz2), 0);
-                       r.rowPtr = allocate(null, getIntSizeOf(rows + 1), 0);
-                       r.colInd = allocate(null, getIntSizeOf(nnz2), 1);
-                       return r;
-               }
-               
-               /**
-                * Static method to copy a CSR sparse matrix from Host to Device
-                * @param dest  [input] destination location (on GPU)
-                * @param rows  number of rows
-                * @param nnz   number of non-zeroes
-                * @param rowPtr        integer array of row pointers
-                * @param colInd        integer array of column indices
-                * @param values        double array of non zero values
-                */
-               public static void copyToDevice(CSRPointer dest, int rows, long 
nnz, int[] rowPtr, int[] colInd, double[] values) {
-                       CSRPointer r = dest;
-                       long t0=0;
-                       if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       r.nnz = nnz;
-                       cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), 
getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
-                       cudaMemcpy(r.colInd, Pointer.to(colInd), 
getIntSizeOf(nnz), cudaMemcpyHostToDevice);
-                       cudaMemcpy(r.val, Pointer.to(values), 
getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-t0);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaToDevCount.addAndGet(3);
-               }
-               
-               /**
-                * Static method to copy a CSR sparse matrix from Device to host
-                * @param src   [input] source location (on GPU)
-                * @param rows  [input] number of rows
-                * @param nnz   [input] number of non-zeroes
-                * @param rowPtr        [output] pre-allocated integer array of 
row pointers of size (rows+1)
-                * @param colInd        [output] pre-allocated integer array of 
column indices of size nnz
-                * @param values        [output] pre-allocated double array of 
values of size nnz
-                */
-               public static void copyToHost(CSRPointer src, int rows, long 
nnz, int[] rowPtr, int[] colInd, double[] values){
-                       CSRPointer r = src;
-                       long t0=0;
-                       if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, 
getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
-                       cudaMemcpy(Pointer.to(colInd), r.colInd, 
getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
-                       cudaMemcpy(Pointer.to(values), r.val, 
getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevCount.addAndGet(3);
-               }
-               
-               // 
==============================================================================================
-
-               // The following methods estimate the memory needed for sparse 
matrices that are
-               // results of operations on other sparse matrices using the 
cuSparse Library.
-               // The operation is C = op(A) binaryOperation op(B), C is the 
output and A & B are the inputs
-               // op = whether to transpose or not
-               // binaryOperation = For cuSparse, +, - are *(matmul) are 
supported
-
-               // From CuSparse Manual,
-               // Since A and B have different sparsity patterns, cuSPARSE 
adopts a two-step approach
-               // to complete sparse matrix C. In the first step, the user 
allocates csrRowPtrC of m+1
-               // elements and uses function cusparseXcsrgeamNnz() to 
determine csrRowPtrC
-               // and the total number of nonzero elements. In the second 
step, the user gathers nnzC
-               //(number of nonzero elements of matrix C) from either 
(nnzC=*nnzTotalDevHostPtr)
-               // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, 
csrColIndC of
-               // nnzC elements respectively, then finally calls function 
cusparse[S|D|C|Z]csrgeam()
-               // to complete matrix C.
-
-               /**
-                * Allocate row pointers of m+1 elements
-                * 
-                * @param handle        a valid {@link cusparseHandle}
-                * @param C                     Output matrix
-                * @param rowsC                 number of rows in C
-                * @throws DMLRuntimeException ?
-                */
-               private static void step1AllocateRowPointers(cusparseHandle 
handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
-                       cusparseSetPointerMode(handle, 
cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
-            cudaDeviceSynchronize();
-                       // Do not increment the cudaCount of allocations on GPU
-                       C.rowPtr = allocate(null, getIntSizeOf((long)rowsC+1), 
0);
-               }
-               
-               /**
-                * Determine total number of nonzero element for the 
cusparseDgeam  operation.
-                * This is done from either (nnzC=*nnzTotalDevHostPtr) or 
(nnzC=csrRowPtrC(m)-csrRowPtrC(0))
-                * 
-                * @param handle        a valid {@link cusparseHandle}
-                * @param A                     Sparse Matrix A on GPU
-                * @param B                     Sparse Matrix B on GPU
-                * @param C                     Output Sparse Matrix C on GPU
-                * @param m                     Rows in C
-                * @param n                     Columns in C
-                * @throws DMLRuntimeException ?
-                */
-               private static void step2GatherNNZGeam(cusparseHandle handle, 
CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws 
DMLRuntimeException {
-                       int[] CnnzArray = { -1 };
-                       cusparseXcsrgeamNnz(handle, m, n,  
-                                       A.descr, toIntExact(A.nnz), A.rowPtr, 
A.colInd, 
-                                       B.descr, toIntExact(B.nnz), B.rowPtr, 
B.colInd, 
-                                       C.descr, C.rowPtr, 
Pointer.to(CnnzArray));
-            cudaDeviceSynchronize();
-                       if (CnnzArray[0] != -1){
-                               C.nnz = CnnzArray[0];
-                       }
-                       else {
-                       int baseArray[] = { 0 };
-                       cudaMemcpy(Pointer.to(CnnzArray), 
C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-                   cudaMemcpy(Pointer.to(baseArray), C.rowPtr,                 
                                           getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-                   C.nnz = CnnzArray[0] - baseArray[0];
-                       }
-               }
-
-               /**
-                *      Determine total number of nonzero element for the 
cusparseDgemm operation.
-                *
-                * @param handle        a valid {@link cusparseHandle}
-                * @param A                     Sparse Matrix A on GPU
-                * @param transA        op - whether A is transposed
-                * @param B                     Sparse Matrix B on GPU
-                * @param transB        op - whether B is transposed
-                * @param C                     Output Sparse Matrix C on GPU
-                * @param m                     Number of rows of sparse matrix 
op ( A ) and C
-                * @param n                     Number of columns of sparse 
matrix op ( B ) and C
-                * @param k                     Number of columns/rows of 
sparse matrix op ( A ) / op ( B )
-                * @throws DMLRuntimeException ?
-                */
-               private static void step2GatherNNZGemm(cusparseHandle handle, 
CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, 
int k) throws DMLRuntimeException {
-                       int[] CnnzArray = { -1 };
-                       if (A.nnz >= Integer.MAX_VALUE || B.nnz >= 
Integer.MAX_VALUE) { 
-                               throw new DMLRuntimeException("Number of non 
zeroes is larger than supported by cuSparse"); 
-                       }
-                       cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, 
-                                       A.descr, toIntExact(A.nnz), A.rowPtr, 
A.colInd, 
-                                       B.descr, toIntExact(B.nnz), B.rowPtr, 
B.colInd, 
-                                       C.descr, C.rowPtr, 
Pointer.to(CnnzArray));
-            cudaDeviceSynchronize();
-                       if (CnnzArray[0] != -1){
-                               C.nnz = CnnzArray[0];
-                       }
-                       else {
-                       int baseArray[] = { 0 };
-                       cudaMemcpy(Pointer.to(CnnzArray), 
C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-                   cudaMemcpy(Pointer.to(baseArray), C.rowPtr,                 
                                           getIntSizeOf(1), 
cudaMemcpyDeviceToHost);
-                   C.nnz = CnnzArray[0] - baseArray[0];
-                       }
-               }
-
-               /**
-                * Allocate val and index pointers.
-                * 
-                * @param handle        a valid {@link cusparseHandle}
-                * @param C                     Output sparse matrix on GPU
-                * @throws DMLRuntimeException ?
-                */
-               private static void step3AllocateValNInd(cusparseHandle handle, 
CSRPointer C) throws DMLRuntimeException {
-                       // Increment cudaCount by one when all three arrays of 
CSR sparse array are allocated
-                       C.val = allocate(null, getDoubleSizeOf(C.nnz), 0);
-                       C.colInd = allocate(null, getIntSizeOf(C.nnz), 1);
-               }
-
-               // 
==============================================================================================
-
-
-               /**
-                * Estimates the number of non zero elements from the results 
of a sparse cusparseDgeam operation
-                * C = a op(A) + b op(B)
-                * @param handle        a valid {@link cusparseHandle}
-                * @param A                     Sparse Matrix A on GPU
-                * @param B                     Sparse Matrix B on GPU
-                * @param m                     Rows in A
-                * @param n                     Columns in Bs
-                * @return CSR (compressed sparse row) pointer
-                * @throws DMLRuntimeException if DMLRuntimeException occurs
-                */
-               public static CSRPointer allocateForDgeam(cusparseHandle 
handle, CSRPointer A, CSRPointer B, int m, int n) 
-                               throws DMLRuntimeException{
-                       if (A.nnz >= Integer.MAX_VALUE || B.nnz >= 
Integer.MAX_VALUE) { 
-                               throw new DMLRuntimeException("Number of non 
zeroes is larger than supported by cuSparse"); 
-                       }
-                       CSRPointer C = new CSRPointer();
-                       step1AllocateRowPointers(handle, C, m);
-                       step2GatherNNZGeam(handle, A, B, C, m, n);
-                       step3AllocateValNInd(handle, C);
-                       return C;
-               }
-               
-               /**
-                * Estimates the number of non-zero elements from the result of 
a sparse matrix multiplication C = A * B
-                * and returns the {@link CSRPointer} to C with the appropriate 
GPU memory.
-                * @param handle        a valid {@link cusparseHandle}
-                * @param A                     Sparse Matrix A on GPU
-                * @param transA        'T' if A is to be transposed, 'N' 
otherwise
-                * @param B                     Sparse Matrix B on GPU
-                * @param transB        'T' if B is to be transposed, 'N' 
otherwise
-                * @param m                     Rows in A
-                * @param n                     Columns in B
-                * @param k                     Columns in A / Rows in B
-                * @return a {@link CSRPointer} instance that encapsulates the 
CSR matrix on GPU
-                * @throws DMLRuntimeException if DMLRuntimeException occurs
-                */
-               public static CSRPointer 
allocateForMatrixMultiply(cusparseHandle handle, CSRPointer A, int transA, 
CSRPointer B, int transB, int m, int n, int k) 
-                               throws DMLRuntimeException{
-                       // Following the code example at 
http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
-                       // 
https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
-                       CSRPointer C = new CSRPointer();
-                       step1AllocateRowPointers(handle, C, m);
-                       step2GatherNNZGemm(handle, A, transA, B, transB, C, m, 
n, k);
-                       step3AllocateValNInd(handle, C);
-                       return C;
-               }
-               
-               /**
-                * Copies this CSR matrix on the GPU to a dense column-major 
matrix
-                * on the GPU. This is a temporary matrix for operations such 
as 
-                * cusparseDcsrmv.
-                * Since the allocated matrix is temporary, bookkeeping is not 
updated.
-                * The caller is responsible for calling "free" on the returned 
Pointer object
-                * @param cusparseHandle        a valid {@link cusparseHandle}
-                * @param cublasHandle          a valid {@link cublasHandle}
-                * @param rows          number of rows in this CSR matrix
-                * @param cols          number of columns in this CSR matrix
-                * @return                      A {@link Pointer} to the 
allocated dense matrix (in column-major format)
-                * @throws DMLRuntimeException if DMLRuntimeException occurs
-                */
-               public Pointer toColumnMajorDenseMatrix(cusparseHandle 
cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws 
DMLRuntimeException {
-                       long size = ((long)rows) * getDoubleSizeOf((long)cols);
-                       Pointer A = JCudaObject.allocate(size);
-                       // If this sparse block is empty, the allocated dense 
matrix, initialized to zeroes, will be returned.
-                       if (val != null && rowPtr != null && colInd != null && 
nnz > 0) {
-                               // Note: cusparseDcsr2dense method cannot 
handle empty blocks
-                               cusparseDcsr2dense(cusparseHandle, rows, cols, 
descr, val, rowPtr, colInd, A, rows);
-                cudaDeviceSynchronize();
-                       } else {
-                               LOG.warn("in CSRPointer, the values array, row 
pointers array or column indices array was null");
-                       }
-                       return A;
-               }
-               
-               /**
-                * Calls cudaFree lazily on the allocated {@link Pointer} 
instances
-                */
-               public void deallocate() {
-                       deallocate(false);
-               }
-
-               /**
-                * Calls cudaFree lazily or eagerly on the allocated {@link 
Pointer} instances
-                * @param eager whether to do eager or lazy cudaFrees
-                */
-               public void deallocate(boolean eager){
-                       if (nnz > 0) {
-                               cudaFreeHelper(val, eager);
-                               cudaFreeHelper(rowPtr, eager);
-                               cudaFreeHelper(colInd, eager);
-                       }
-               }
-       };
-       
-       private static long getDoubleSizeOf(long numElems) {
-               return numElems * ((long)jcuda.Sizeof.DOUBLE);
-       }
-       
-       private static long getIntSizeOf(long numElems) {
-               return numElems * ((long)jcuda.Sizeof.INT);
-       }
-       
-       public synchronized boolean isAllocated() {
-               return (jcudaDenseMatrixPtr != null || jcudaSparseMatrixPtr != 
null);
-       }
-       
-       /** Pointer to dense matrix */
-       public Pointer jcudaDenseMatrixPtr = null;
-       /** Pointer to sparse matrix */
-       public CSRPointer jcudaSparseMatrixPtr = null;
-
-       public long numBytes;                                                   
/** Number of bytes occupied by this block on GPU */
-
-       /**
-        * Initializes this JCudaObject with a {@link MatrixObject} instance 
which will contain metadata about the enclosing matrix block
-        * @param m
-        */
-       JCudaObject(MatrixObject m) {
-               super(m);
-       }
-
-       /**
-        * Allocates a sparse and empty {@link JCudaObject}
-        * This is the result of operations that are both non zero matrices.
-        * 
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public void allocateSparseAndEmpty() throws DMLRuntimeException{
-               setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(0, 
mat.getNumRows()));
-               setDeviceModify(0);
-       }
-
-
-       /**
-        * Allocates a dense matrix of size obtained from the attached matrix 
metadata
-        * and fills it up with a single value
-        * 
-        * @param v value to fill up the dense matrix
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public void allocateAndFillDense(double v) throws DMLRuntimeException {
-               long rows = mat.getNumRows();
-               long cols = mat.getNumColumns();
-               int numElems = toIntExact(rows * cols);
-               long size = getDoubleSizeOf(numElems);
-               setDenseMatrixCudaPointer(allocate(size));
-               setDeviceModify(size);
-               // The "fill" kernel is called which treats the matrix 
"jcudaDensePtr" like a vector and fills it with value "v"
-               LibMatrixCUDA.kernels.launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(numElems), 
jcudaDenseMatrixPtr, v, numElems);
-       }
-
-       /**
-        * If this {@link JCudaObject} is sparse and empty
-        * Being allocated is a prerequisite to being sparse and empty.
-        * 
-        * @return true if sparse and empty
-        */
-       public boolean isSparseAndEmpty() {
-               boolean isSparseAndAllocated = isAllocated()&& 
LibMatrixCUDA.isInSparseFormat(mat);
-               boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && 
jcudaSparseMatrixPtr.nnz == 0;
-               return isEmptyAndSparseAndAllocated;
-       }
-
-       @Override
-       public synchronized boolean acquireDeviceRead() throws 
DMLRuntimeException {
-               boolean transferred = false;
-               if(!isAllocated()) {
-                       copyFromHostToDevice();
-                       transferred = true;
-               } else {
-                       numLocks.addAndGet(1);
-               }
-               if(!isAllocated())
-                       throw new DMLRuntimeException("Expected device data to 
be allocated");
-               return transferred;
-       }
-       
-       @Override
-       public synchronized boolean acquireDeviceModifyDense() throws 
DMLRuntimeException {
-               boolean allocated = false;
-               if(!isAllocated()) {
-                       mat.setDirty(true);
-                       // Dense block, size = numRows * numCols
-                       allocateDenseMatrixOnDevice();
-                       allocated = true;
-                       synchronized(evictionLock) {
-                               JCudaContext.allocatedPointers.add(this);
-                       }
-               }
-               isDeviceCopyModified = true;
-               if(!isAllocated()) 
-                       throw new DMLRuntimeException("Expected device data to 
be allocated");
-               return allocated;
-       }
-       
-       @Override
-       public synchronized boolean acquireDeviceModifySparse() throws 
DMLRuntimeException {
-               boolean allocated = false;
-               isInSparseFormat = true;
-               if(!isAllocated()) {
-                       mat.setDirty(true);
-                       allocateSparseMatrixOnDevice();
-                       allocated = true;
-                       synchronized(evictionLock) {
-                               JCudaContext.allocatedPointers.add(this);
-                       }
-               }
-               isDeviceCopyModified = true;
-               if(!isAllocated()) 
-                       throw new DMLRuntimeException("Expected device data to 
be allocated");
-               return allocated;
-       }
-       
-       @Override
-       public synchronized boolean acquireHostRead() throws CacheException {
-               boolean copied = false;
-               if(isAllocated()) {
-                       try {
-                               if(isDeviceCopyModified) {
-                                       copyFromDeviceToHost();
-                                       copied = true;
-                               }
-                       } catch (DMLRuntimeException e) {
-                               throw new CacheException(e);
-                       }
-               }
-               else {
-                       throw new CacheException("Cannot perform 
acquireHostRead as the GPU data is not allocated:" + mat.getVarName());
-               }
-               return copied;
-       }
-       
-       /**
-        * updates the locks depending on the eviction policy selected
-        * @throws CacheException if there is no locked GPU Object
-        */
-       private void updateReleaseLocks() throws CacheException {
-               if(numLocks.addAndGet(-1) < 0) {
-            throw new CacheException("Redundant release of GPU object");
-               }
-               if(evictionPolicy == EvictionPolicy.LRU) {
-            timestamp.set(System.nanoTime());
-               }
-               else if(evictionPolicy == EvictionPolicy.LFU) {
-            timestamp.addAndGet(1);
-               }
-               else if(evictionPolicy == EvictionPolicy.MIN_EVICT) {
-            // Do Nothing
-               }
-               else {
-            throw new CacheException("The eviction policy is not supported:" + 
evictionPolicy.name());
-               }
-       }
-       
-       /**
-        * releases input allocated on GPU
-        * @throws CacheException if data is not allocated
-        */
-       public synchronized void releaseInput() throws CacheException {
-               updateReleaseLocks();
-               if(!isAllocated())
-                       throw new CacheException("Attempting to release an 
input before allocating it");
-       }
-
-       /**
-       @Override
-       void allocateMemoryOnDevice(long numElemToAllocate) throws 
DMLRuntimeException {
-               if(!isAllocated()) {
-                       long start = System.nanoTime();
-                       if(numElemToAllocate == -1 && 
LibMatrixCUDA.isInSparseFormat(mat)) {
-                               
setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), 
mat.getNumRows()));
-                               numBytes = 
CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
-                               
JCudaContext.deviceMemBytes.addAndGet(-numBytes);
-                               isInSparseFormat = true;
-                               //throw new DMLRuntimeException("Sparse format 
not implemented");
-                       } else if(numElemToAllocate == -1) {
-                               // Called for dense input
-                               setDenseMatrixCudaPointer(new Pointer());
-                               numBytes = 
mat.getNumRows()*getDoubleSizeOf(mat.getNumColumns());
-                               cudaMalloc(jcudaDenseMatrixPtr, numBytes);
-                               
JCudaContext.deviceMemBytes.addAndGet(-numBytes);
-                       }
-                       else {
-                               // Called for dense output
-                               setDenseMatrixCudaPointer(new Pointer());
-                               numBytes = getDoubleSizeOf(numElemToAllocate);
-                               if(numElemToAllocate <= 0 || numBytes <= 0)
-                                       throw new DMLRuntimeException("Cannot 
allocate dense matrix object with " + numElemToAllocate + " elements and size " 
+ numBytes);
-                               cudaMalloc(jcudaDenseMatrixPtr,  numBytes);
-                               
JCudaContext.deviceMemBytes.addAndGet(-numBytes);
-                       }
-
-                       
GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
-                       GPUStatistics.cudaAllocCount.addAndGet(1);
-
-               }
-       }
-        */
-
-       @Override
-       void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
-               assert !isAllocated() : "Internal error - trying to allocated 
dense matrix to a JCudaObject that is already allocated";
-               long rows = mat.getNumRows();
-               long cols = mat.getNumColumns();
-               assert rows > 0 : "Internal error - invalid number of rows when 
allocating dense matrix";
-               assert cols > 0 : "Internal error - invalid number of columns 
when allocating dense matrix;";
-        long size = getDoubleSizeOf(rows * cols);
-               Pointer tmp = allocate(size);
-               setDenseMatrixCudaPointer(tmp);
-               setDeviceModify(size);
-       }
-
-       @Override
-       void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
-               assert !isAllocated() : "Internal error = trying to allocated 
sparse matrix to a JCudaObject that is already allocated";
-               long rows = mat.getNumRows();
-               long nnz = mat.getNnz();
-               assert rows > 0 : "Internal error - invalid number of rows when 
allocating a sparse matrix";
-               assert nnz > 0 : "Internal error - invalid number of non zeroes 
when allocating a sparse matrix";
-               CSRPointer tmp = CSRPointer.allocateEmpty(nnz, rows);
-               setSparseMatrixCudaPointer(tmp);
-               long size = CSRPointer.estimateSize(nnz, rows);
-               setDeviceModify(size);
-       }
-
-       /**
-        * releases output allocated on GPU
-        * @throws CacheException if data is not allocated
-        */
-    @Override
-       public synchronized void releaseOutput() throws CacheException {
-               updateReleaseLocks();
-               isDeviceCopyModified = true;
-               if(!isAllocated())
-                       throw new CacheException("Attempting to release an 
output before allocating it");
-       }
-
-       @Override
-       public void setDeviceModify(long numBytes) {
-               this.numLocks.addAndGet(1);
-               this.numBytes = numBytes;
-               
((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(-numBytes);
-       }
-
-       @Override
-       void deallocateMemoryOnDevice(boolean eager) {
-               if(jcudaDenseMatrixPtr != null) {
-                       cudaFreeHelper(null, jcudaDenseMatrixPtr, eager);
-                       
((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
-               }
-               if (jcudaSparseMatrixPtr != null) {
-                       jcudaSparseMatrixPtr.deallocate(eager);
-                       
((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
-               }
-               jcudaDenseMatrixPtr = null;
-               jcudaSparseMatrixPtr = null;
-               if(tensorDescriptor != null) {
-                       cudnnDestroyTensorDescriptor(tensorDescriptor);
-                       tensorDescriptor = null;
-               }
-               numLocks.set(0);
-       }
-       
-       /** 
-        * Thin wrapper over {@link #evict(long)}
-        * @param size size to check
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       static void ensureFreeSpace(long size) throws DMLRuntimeException {
-               ensureFreeSpace(null, size);
-       }
-
-       /**
-        * Thin wrapper over {@link #evict(long)}
-        * @param instructionName instructionName name of the instruction for 
which performance measurements are made
-        * @param size size to check
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       static void ensureFreeSpace(String instructionName, long size) throws 
DMLRuntimeException {
-               if(size >= getAvailableMemory()) {
-                       evict(instructionName, size);
-               }
-       }
-       
-       @Override
-       void copyFromHostToDevice() 
-               throws DMLRuntimeException 
-       {
-               printCaller();
-               long start=0;
-               if (DMLScript.STATISTICS) start = System.nanoTime();
-               
-               MatrixBlock tmp = mat.acquireRead();
-               if(tmp.isInSparseFormat()) {
-                       
-                       int rowPtr[] = null;
-                       int colInd[] = null;
-                       double[] values = null;
-                       
-                       tmp.recomputeNonZeros();
-                       long nnz = tmp.getNonZeros();
-                       mat.getMatrixCharacteristics().setNonZeros(nnz);
-                       
-                       SparseBlock block = tmp.getSparseBlock();
-                       boolean copyToDevice = true;
-                       if(block == null && tmp.getNonZeros() == 0) {
-//                             // Allocate empty block --> not necessary
-//                             // To reproduce this, see 
org.apache.sysml.test.integration.applications.dml.ID3DMLTest
-//                             rowPtr = new int[0];
-//                             colInd = new int[0];
-//                             values = new double[0];
-                               copyToDevice = false;
-                       }
-                       else if(block == null && tmp.getNonZeros() != 0) {
-                               throw new DMLRuntimeException("Expected CP 
sparse block to be not null.");
-                       }
-                       else {
-                               // CSR is the preferred format for cuSparse GEMM
-                               // Converts MCSR and COO to CSR
-                               SparseBlockCSR csrBlock = null;
-                               long t0=0;
-                               if (block instanceof SparseBlockCSR){ 
-                                       csrBlock = (SparseBlockCSR)block;
-                               } else if (block instanceof SparseBlockCOO) {
-                                       // TODO - should we do this on the GPU 
using cusparse<t>coo2csr() ?
-                                       if (DMLScript.STATISTICS) t0 = 
System.nanoTime();
-                                       SparseBlockCOO cooBlock = 
(SparseBlockCOO)block;
-                                       csrBlock = new 
SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), 
cooBlock.indexes(), cooBlock.values());
-                                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-                                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseConversionCount.incrementAndGet();
-                               } else if (block instanceof SparseBlockMCSR) {
-                                       if (DMLScript.STATISTICS) t0 = 
System.nanoTime();
-                                       SparseBlockMCSR mcsrBlock = 
(SparseBlockMCSR)block;
-                                       csrBlock = new 
SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
-                                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
-                                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseConversionCount.incrementAndGet();
-                               } else {
-                                       throw new 
DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
-                               }
-                               rowPtr = csrBlock.rowPointers();
-                               colInd = csrBlock.indexes();
-                               values = csrBlock.values();     
-                       }
-                       allocateSparseMatrixOnDevice();
-                       synchronized(evictionLock) {
-                               JCudaContext.allocatedPointers.add(this);
-                       }
-                       if(copyToDevice) {
-                               CSRPointer.copyToDevice(jcudaSparseMatrixPtr, 
tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
-                       }
-               }
-               else {
-                       double[] data = tmp.getDenseBlock();
-                       
-                       if( data == null && tmp.getSparseBlock() != null )
-                               throw new DMLRuntimeException("Incorrect 
sparsity calculation");
-                       else if( data==null && tmp.getNonZeros() != 0 )
-                               throw new DMLRuntimeException("MatrixBlock is 
not allocated");
-                       else if( tmp.getNonZeros() == 0 )
-                               data = new 
double[tmp.getNumRows()*tmp.getNumColumns()];
-                       
-                       // Copy dense block
-                       allocateDenseMatrixOnDevice();
-                       synchronized(evictionLock) {
-                               JCudaContext.allocatedPointers.add(this);
-                       }
-                       cudaMemcpy(jcudaDenseMatrixPtr, Pointer.to(data), 
getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
-               }
-               
-               mat.release();
-
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaToDevCount.addAndGet(1);
-       }
-       
-       public static int toIntExact(long l) throws DMLRuntimeException {
-           if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
-               throw new DMLRuntimeException("Cannot be cast to int:" + l);
-           }
-           return (int) l;
-       }
-
-       @Override
-       protected void copyFromDeviceToHost() throws DMLRuntimeException {
-               if (jcudaDenseMatrixPtr != null && jcudaSparseMatrixPtr != 
null){
-                       throw new DMLRuntimeException("Invalid state : JCuda 
dense/sparse pointer are both allocated");
-               }
-
-               if(jcudaDenseMatrixPtr != null) {
-                       printCaller();
-                       long start=0;
-                       if (DMLScript.STATISTICS) start = System.nanoTime();
-                       MatrixBlock tmp = new 
MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), 
false);
-                       tmp.allocateDenseBlock();
-                       double [] data = tmp.getDenseBlock();
-                       
-                       cudaMemcpy(Pointer.to(data), jcudaDenseMatrixPtr, 
getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost);
-                       
-                       tmp.recomputeNonZeros();
-                       mat.acquireModify(tmp);
-                       mat.release();
-
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevCount.addAndGet(1);
-               }
-               else if (jcudaSparseMatrixPtr != null){
-                       printCaller();
-                       if(!LibMatrixCUDA.isInSparseFormat(mat))
-                               throw new DMLRuntimeException("Block not in 
sparse format on host yet the device sparse matrix pointer is not null");
-
-                       if(this.isSparseAndEmpty()){
-                               MatrixBlock tmp = new MatrixBlock();    // 
Empty Block
-                               mat.acquireModify(tmp);
-                               mat.release();
-                       } else {
-                               long start=0;
-                               if (DMLScript.STATISTICS) start = 
System.nanoTime();
-
-                               int rows = toIntExact(mat.getNumRows());
-                               int cols = toIntExact(mat.getNumColumns());
-                               int nnz = toIntExact(jcudaSparseMatrixPtr.nnz);
-                               int[] rowPtr = new int[rows + 1];
-                               int[] colInd = new int[nnz];
-                               double[] values = new double[nnz];
-                               CSRPointer.copyToHost(jcudaSparseMatrixPtr, 
rows, nnz, rowPtr, colInd, values);
-
-                               SparseBlockCSR sparseBlock = new 
SparseBlockCSR(rowPtr, colInd, values, nnz);
-                               MatrixBlock tmp = new MatrixBlock(rows, cols, 
nnz, sparseBlock);
-                               mat.acquireModify(tmp);
-                               mat.release();
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevCount.addAndGet(1);
-                       }
-               }
-               else {
-                       throw new DMLRuntimeException("Cannot copy from device 
to host as JCuda dense/sparse pointer is not allocated");
-               }
-               isDeviceCopyModified = false;
-       }
-
-       @Override
-       protected long getSizeOnDevice() throws DMLRuntimeException {
-               long GPUSize = 0;
-               long rlen = mat.getNumRows();
-               long clen = mat.getNumColumns();
-               long nnz = mat.getNnz();
-
-               if(LibMatrixCUDA.isInSparseFormat(mat)) {
-                       GPUSize = CSRPointer.estimateSize(nnz, rlen);
-               }
-               else {
-                       GPUSize = getDoubleSizeOf(rlen * clen);
-               }
-               return GPUSize;
-       }
-       
-       private String getClassAndMethod(StackTraceElement st) {
-               String [] str = st.getClassName().split("\\.");
-               return str[str.length - 1] + "." + st.getMethodName();
-       }
-       
-       /**
-        * Convenience debugging method.
-        * Checks {@link JCudaContext#DEBUG} flag before printing to System.out
-        */
-       private void printCaller() {
-               if(JCudaContext.DEBUG) {
-                       StackTraceElement[] st = 
Thread.currentThread().getStackTrace();
-                       String ret = getClassAndMethod(st[1]);
-                       for (int i = 2; i < st.length && i < 7; i++) {
-                               ret += "->" + getClassAndMethod(st[i]);
-                       }
-                       System.out.println("CALL_STACK:" + ret);
-               }
-       }
-       
-       /**
-        * Convenience method to directly examine the Sparse matrix on GPU
-        * 
-        * @return CSR (compressed sparse row) pointer
-        */
-       public CSRPointer getSparseMatrixCudaPointer() {
-               return jcudaSparseMatrixPtr;
-       }
-       
-       /**
-        * Convenience method to directly set the sparse matrix on GPU
-        * Make sure to call {@link #setDeviceModify(long)} after this to set 
appropriate state, if you are not sure what you are doing.
-        * Needed for operations like {@link 
JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, 
cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, 
Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
-        * @param sparseMatrixPtr CSR (compressed sparse row) pointer
-        */
-       public synchronized void setSparseMatrixCudaPointer(CSRPointer 
sparseMatrixPtr) {
-               this.jcudaSparseMatrixPtr = sparseMatrixPtr;
-               this.isInSparseFormat = true;
-               if(jcudaDenseMatrixPtr != null) {
-                       cudaFreeHelper(jcudaDenseMatrixPtr);
-                       jcudaDenseMatrixPtr = null;
-               }
-       }
-
-       /**
-        * Convenience method to directly set the dense matrix pointer on GPU
-        * Make sure to call {@link #setDeviceModify(long)} after this to set 
appropriate state, if you are not sure what you are doing.
-        * 
-        * @param densePtr dense pointer
-        */
-       public synchronized void setDenseMatrixCudaPointer(Pointer densePtr){
-               this.jcudaDenseMatrixPtr = densePtr;
-               this.isInSparseFormat = false;
-               if(jcudaSparseMatrixPtr != null) {
-                       jcudaSparseMatrixPtr.deallocate();
-                       jcudaSparseMatrixPtr = null;
-               }
-       }
-       
-       /**
-        * Converts this JCudaObject from dense to sparse format.
-        * 
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public void denseToSparse() throws DMLRuntimeException {
-               long t0=0;
-               if (DMLScript.STATISTICS) t0 = System.nanoTime();
-               cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle;
-               if(cusparseHandle == null)
-                       throw new DMLRuntimeException("Expected cusparse to be 
initialized");
-               int rows = toIntExact(mat.getNumRows());
-               int cols = toIntExact(mat.getNumColumns());
-               
-               if(jcudaDenseMatrixPtr == null || !isAllocated())
-                       throw new DMLRuntimeException("Expected allocated dense 
matrix before denseToSparse() call");
-
-               convertDensePtrFromRowMajorToColumnMajor();
-               
setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(cusparseHandle, 
rows, cols, jcudaDenseMatrixPtr));
-               // TODO: What if mat.getNnz() is -1 ?
-               numBytes = CSRPointer.estimateSize(mat.getNnz(), rows);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
-       }
-
-       /**
-        * Transposes a dense matrix on the GPU by calling the cublasDgeam 
operation
-        * @param densePtr      Pointer to dense matrix on the GPU
-        * @param m                     rows in ouput matrix
-        * @param n                     columns in output matrix
-        * @param lda           rows in input matrix
-        * @param ldc           columns in output matrix
-        * @return                      transposed matrix
-        * @throws DMLRuntimeException if operation failed
-        */
-       public static Pointer transpose(Pointer densePtr, int m, int n, int 
lda, int ldc) throws DMLRuntimeException {
-               Pointer alpha = LibMatrixCUDA.pointerTo(1.0);
-               Pointer beta = LibMatrixCUDA.pointerTo(0.0);
-               Pointer A = densePtr;
-               Pointer C = JCudaObject.allocate(((long)m)*getDoubleSizeOf(n));
-
-               // Transpose the matrix to get a dense matrix
-               JCublas2.cublasDgeam(LibMatrixCUDA.cublasHandle, CUBLAS_OP_T, 
CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc);
-               return C;
-       }
-
-       /**
-        * Convenience method. Converts Row Major Dense Matrix --> Column Major 
Dense Matrix
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       private void convertDensePtrFromRowMajorToColumnMajor() throws 
DMLRuntimeException {
-               int m = toIntExact(mat.getNumRows());
-               int n = toIntExact(mat.getNumColumns());
-               int lda = n;
-               int ldc = m;
-               if(!isAllocated()) {
-                       throw new DMLRuntimeException("Error in converting row 
major to column major : data is not allocated");
-               }
-
-               Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc);
-               cudaFreeHelper(jcudaDenseMatrixPtr);
-               setDenseMatrixCudaPointer(tmp);
-       }
-
-       private void convertDensePtrFromColMajorToRowMajor() throws 
DMLRuntimeException {
-               int n = toIntExact(mat.getNumRows());
-               int m = toIntExact(mat.getNumColumns());
-               int lda = n;
-           int ldc = m;
-               if(!isAllocated()) {
-                       throw new DMLRuntimeException("Error in converting 
column major to row major : data is not allocated");
-               }
-
-               Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc);
-               cudaFreeHelper(jcudaDenseMatrixPtr);
-               setDenseMatrixCudaPointer(tmp);
-       }
-       
-       /**
-        * Convert sparse to dense (Performs transpose, use 
sparseToColumnMajorDense if the kernel can deal with column major format)
-        * 
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public void sparseToDense() throws DMLRuntimeException {
-               sparseToDense(null);
-       }
-
-       /**
-        * Convert sparse to dense (Performs transpose, use 
sparseToColumnMajorDense if the kernel can deal with column major format)
-        * Also records per instruction invokation of sparseToDense.
-        * @param instructionName       Name of the instruction for which 
statistics are recorded in {@link GPUStatistics}
-        * @throws DMLRuntimeException ?
-        */
-       public void sparseToDense(String instructionName) throws 
DMLRuntimeException {
-               long start=0, end=0;
-               if (DMLScript.STATISTICS) start = System.nanoTime();
-               if(jcudaSparseMatrixPtr == null || !isAllocated())
-                       throw new DMLRuntimeException("Expected allocated 
sparse matrix before sparseToDense() call");
-
-               sparseToColumnMajorDense();
-               convertDensePtrFromColMajorToRowMajor();
-               if (DMLScript.STATISTICS) end = System.nanoTime();
-               if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
-       }
-       
-       /**
-        * More efficient method to convert sparse to dense but returns dense 
in column major format
-        * 
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public void sparseToColumnMajorDense() throws DMLRuntimeException {
-               if(jcudaSparseMatrixPtr == null || !isAllocated())
-                       throw new DMLRuntimeException("Expected allocated 
sparse matrix before sparseToDense() call");
-               
-               cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle;
-               if(cusparseHandle == null)
-                       throw new DMLRuntimeException("Expected cusparse to be 
initialized");
-               int rows = toIntExact(mat.getNumRows());
-               int cols = toIntExact(mat.getNumColumns());
-               
setDenseMatrixCudaPointer(jcudaSparseMatrixPtr.toColumnMajorDenseMatrix(cusparseHandle,
 null, rows, cols));
-               numBytes = ((long)rows)*getDoubleSizeOf(cols);
-       }
-       
-       /**
-        * Convenience method to convert a CSR matrix to a dense matrix on the 
GPU
-        * Since the allocated matrix is temporary, bookkeeping is not updated.
-        * Also note that the input dense matrix is expected to be in COLUMN 
MAJOR FORMAT
-        * Caller is responsible for deallocating memory on GPU.
-        * 
-        * @param cusparseHandle handle to cusparse library
-        * @param rows number of rows
-        * @param cols number of columns
-        * @param densePtr [in] dense matrix pointer on the GPU in row major
-        * @return CSR (compressed sparse row) pointer
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static CSRPointer 
columnMajorDenseToRowMajorSparse(cusparseHandle cusparseHandle, int rows, int 
cols, Pointer densePtr) throws DMLRuntimeException {
-               cusparseMatDescr matDescr = 
CSRPointer.getDefaultCuSparseMatrixDescriptor();
-               Pointer nnzPerRowPtr = null;
-               Pointer nnzTotalDevHostPtr = null;
-               
-               ensureFreeSpace(getIntSizeOf(rows + 1));
-               nnzPerRowPtr = allocate(getIntSizeOf(rows));
-               nnzTotalDevHostPtr = allocate(getIntSizeOf(1));
-               
-               // Output is in dense vector format, convert it to CSR
-               cusparseDnnz(cusparseHandle, 
cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, 
nnzPerRowPtr, nnzTotalDevHostPtr);
-               cudaDeviceSynchronize();
-               int[] nnzC = {-1};
-
-               long t2=0;
-               if (DMLScript.STATISTICS) t2 = System.nanoTime();
-               cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, 
getIntSizeOf(1), cudaMemcpyDeviceToHost);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
-               if (DMLScript.STATISTICS) 
GPUStatistics.cudaFromDevCount.addAndGet(1);
-               
-               if (nnzC[0] == -1){
-                       throw new DMLRuntimeException("cusparseDnnz did not 
calculate the correct number of nnz from the sparse-matrix vector mulitply on 
the GPU");
-               }
-               
-               CSRPointer C = CSRPointer.allocateEmpty(nnzC[0], rows);         
-               cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, 
densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
-               cudaDeviceSynchronize();
-
-               cudaFreeHelper(nnzPerRowPtr);
-               cudaFreeHelper(nnzTotalDevHostPtr);
-               
-               return C;
-       }
-
-
-       /** Map of free blocks allocate on GPU. maps size_of_block -> pointer 
on GPU */
-       static LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new 
LRUCacheMap<Long, LinkedList<Pointer>>();
-       /** To record size of allocated blocks */
-       static HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<Pointer, 
Long>();
-
-
-       /**
-        * Convenience method for {@link #allocate(String, long, int)}, 
defaults statsCount to 1.
-        * @param size size of data (in bytes) to allocate
-        * @return jcuda pointer
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static Pointer allocate(long size) throws DMLRuntimeException {
-               return allocate(null, size, 1);
-       }
-
-       /**
-        * Convenience method for {@link #allocate(String, long, int)}, 
defaults statsCount to 1.
-        * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
-        * @param size size of data (in bytes) to allocate
-        * @return jcuda pointer
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static Pointer allocate(String instructionName, long size) 
throws DMLRuntimeException {
-               return allocate(instructionName, size, 1);
-       }
-
-       /**
-        * Allocates temporary space on the device.
-        * Does not update bookkeeping.
-        * The caller is responsible for freeing up after usage.
-        * @param instructionName name of instruction for which to record per 
instruction performance statistics, null if don't want to record
-        * @param size                          Size of data (in bytes) to 
allocate
-        * @param statsCount    amount to increment the cudaAllocCount by
-        * @return jcuda Pointer
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static Pointer allocate(String instructionName, long size, int 
statsCount) throws DMLRuntimeException{
-               long t0=0, t1=0, end=0;
-               synchronized (JCudaContext.syncObj) {
-                       Pointer A;
-                       if (freeCUDASpaceMap.containsKey(size)) {
-                               if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-                               LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
-                               A = freeList.pop();
-                               if (freeList.isEmpty())
-                                       freeCUDASpaceMap.remove(size);
-                               if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
-                       } else {
-                               if (DMLScript.STATISTICS) t0 = 
System.nanoTime();
-                               ensureFreeSpace(instructionName, size);
-                               A = new Pointer();
-                               cudaMalloc(A, size);
-                               
((JCudaContext)(JCudaContext.currContext)).deviceMemBytes.addAndGet(size);
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
-                               if (DMLScript.STATISTICS) 
GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
-                               if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0);
-                       }
-                       // Set all elements to 0 since newly allocated space 
will contain garbage
-                       if (DMLScript.STATISTICS) t1 = System.nanoTime();
-                       cudaMemset(A, 0, size);
-                       if (DMLScript.STATISTICS) end = System.nanoTime();
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaMemSet0Count.getAndAdd(1);
-                       cudaBlockSizeMap.put(A, size);
-                       return A;
-               }
-       }
-
-       /**
-        * Does lazy cudaFree calls
-        * @param toFree {@link Pointer} instance to be freed
-        */
-       public static void cudaFreeHelper(final Pointer toFree) {
-               cudaFreeHelper(null, toFree, false);
-       }
-
-       /**
-        * does lazy/eager cudaFree calls
-        * @param toFree {@link Pointer} instance to be freed
-        * @param eager true if to be done eagerly
-        */
-       public static void cudaFreeHelper(final Pointer toFree, boolean eager) {
-               cudaFreeHelper(null, toFree, eager);
-       }
-
-       /**
-        * Does lazy cudaFree calls
-        * @param instructionName name of the instruction for which to record 
per instruction free time, null if do not want to record
-        * @param toFree {@link Pointer} instance to be freed
-        */
-       public static void cudaFreeHelper(String instructionName, final Pointer 
toFree) {
-               cudaFreeHelper(instructionName, toFree, false);
-       }
-
-       /**
-        * Does cudaFree calls, lazily
-        * @param instructionName name of the instruction for which to record 
per instruction free time, null if do not want to record
-        * @param toFree {@link Pointer} instance to be freed
-        * @param eager true if to be done eagerly
-        */
-       public static void cudaFreeHelper(String instructionName, final Pointer 
toFree, boolean eager){
-               long t0 = 0;
-               assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal 
state corrupted, cache block size map is not aware of a block it trying to free 
up";
-               long size = cudaBlockSizeMap.get(toFree);
-               if (eager) {
-                       if (DMLScript.STATISTICS) t0 = System.nanoTime();
-                       
((JCudaContext)(JCudaContext.currContext)).deviceMemBytes.addAndGet(-size);
-                       cudaFree(toFree);
-                       cudaBlockSizeMap.remove(toFree);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
-                       if (DMLScript.STATISTICS) 
GPUStatistics.cudaDeAllocCount.addAndGet(1);
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0);
-               } else {
-                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
-                       if (freeList == null) {
-                               freeList = new LinkedList<Pointer>();
-                               freeCUDASpaceMap.put(size, freeList);
-                       }
-                       freeList.add(toFree);
-               }
-       }
-
-
-       /**
-        * Gets the double array from GPU memory onto host memory and returns 
string.
-        * @param A Pointer to memory on device (GPU), assumed to point to a 
double array
-        * @param rows rows in matrix A
-     * @param cols columns in matrix A
-        * @return the debug string
-        * @throws DMLRuntimeException  if DMLRuntimeException occurs
-        */
-       public static String debugString(Pointer A, long rows, long cols) 
throws DMLRuntimeException {
-               StringBuffer sb = new StringBuffer();
-        int len = toIntExact(rows * cols);
-               double[] tmp = new double[len];
-               cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), 
cudaMemcpyDeviceToHost);
-        int k = 0;
-               for (int i=0; i<rows; i++){
-            for (int j=0; j<cols; j++){
-                          sb.append(tmp[k]).append(' ');
-               k++;
-            }
-            sb.append('\n');
-               }
-               return sb.toString();
-       }
-}

Reply via email to