http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java index d858b0b..b4cb87d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java @@ -18,34 +18,25 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; -import static jcuda.driver.JCudaDriver.cuCtxCreate; -import static jcuda.driver.JCudaDriver.cuCtxGetCurrent; -import static jcuda.driver.JCudaDriver.cuDeviceGet; -import static jcuda.driver.JCudaDriver.cuInit; import static jcuda.driver.JCudaDriver.cuLaunchKernel; import static jcuda.driver.JCudaDriver.cuModuleGetFunction; import static jcuda.driver.JCudaDriver.cuModuleLoadDataEx; -import static jcuda.driver.JCudaDriver.cuModuleUnload; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; -import jcuda.runtime.JCuda; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.io.IOUtilFunctions; -import jcuda.CudaException; import jcuda.Pointer; -import jcuda.driver.CUcontext; -import jcuda.driver.CUdevice; import jcuda.driver.CUfunction; import jcuda.driver.CUmodule; import jcuda.driver.CUresult; /** - * Utility class that allows LibMatrixCUDA as well as JCudaObject to invoke custom CUDA kernels. + * Utility class that allows LibMatrixCUDA as well as GPUObject to invoke custom CUDA kernels. * * The utility org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels simplifies the launching of the kernels. * For example: to launch a kernel @@ -54,70 +45,23 @@ import jcuda.driver.CUresult; */ public class JCudaKernels { - private static String ptxFileName = "/kernels/SystemML.ptx"; + private final static String ptxFileName = "/kernels/SystemML.ptx"; private HashMap<String, CUfunction> kernels = new HashMap<String, CUfunction>(); private CUmodule module; + private final int deviceNum; /** * Loads the kernels in the file ptxFileName. Though cubin files are also supported, we will stick with * ptx file as they are target-independent similar to Java's .class files. - * + * @param deviceNum the device number for which to initiate the driver API * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public JCudaKernels() throws DMLRuntimeException { - shutdown(); - initCUDA(); + JCudaKernels(int deviceNum) throws DMLRuntimeException { + this.deviceNum = deviceNum; module = new CUmodule(); // Load the kernels specified in the ptxFileName file checkResult(cuModuleLoadDataEx(module, initKernels(ptxFileName), 0, new int[0], Pointer.to(new int[0]))); } - - /** - * Initializes the JCuda driver API. Then it will try to attach to the - * current CUDA context. If no active CUDA context exists, then it will - * try to create one, for the device which is specified by the current - * deviceNumber. - * - * @throws DMLRuntimeException If it is neither possible to attach to an - * existing context, nor to create a new context. - */ - private static void initCUDA() throws DMLRuntimeException { - checkResult(cuInit(0)); - - // Try to obtain the current context - CUcontext context = new CUcontext(); - checkResult(cuCtxGetCurrent(context)); - - // If the context is 'null', then a new context - // has to be created. - CUcontext nullContext = new CUcontext(); - if (context.equals(nullContext)) { - createContext(); - } - } - - /** - * Tries to create a context for device 'deviceNumber'. - * @throws DMLRuntimeException - * - * @throws CudaException If the device can not be - * accessed or the context can not be created - */ - private static void createContext() throws DMLRuntimeException { - int deviceNumber = 0; - CUdevice device = new CUdevice(); - checkResult(cuDeviceGet(device, deviceNumber)); - CUcontext context = new CUcontext(); - checkResult(cuCtxCreate(context, 0, device)); - } - - /** - * Performs cleanup actions such as unloading the module - */ - public void shutdown() { - if(module != null) - cuModuleUnload(module); - } /** * Setups the kernel parameters and launches the kernel using cuLaunchKernel API. @@ -167,7 +111,7 @@ public class JCudaKernels { config.gridDimX, config.gridDimY, config.gridDimZ, config.blockDimX, config.blockDimY, config.blockDimZ, config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null)); - JCuda.cudaDeviceSynchronize(); + //JCuda.cudaDeviceSynchronize(); } public static void checkResult(int cuResult) throws DMLRuntimeException {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java deleted file mode 100644 index bc0031b..0000000 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java +++ /dev/null @@ -1,1330 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sysml.runtime.instructions.gpu.context; - -import jcuda.Pointer; -import jcuda.jcublas.JCublas2; -import jcuda.jcublas.cublasHandle; -import jcuda.jcudnn.cudnnTensorDescriptor; -import jcuda.jcusparse.JCusparse; -import jcuda.jcusparse.cusparseDirection; -import jcuda.jcusparse.cusparseHandle; -import jcuda.jcusparse.cusparseMatDescr; -import jcuda.jcusparse.cusparsePointerMode; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.caching.CacheException; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; -import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; -import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; -import org.apache.sysml.runtime.matrix.data.MatrixBlock; -import org.apache.sysml.runtime.matrix.data.SparseBlock; -import org.apache.sysml.runtime.matrix.data.SparseBlockCOO; -import org.apache.sysml.runtime.matrix.data.SparseBlockCSR; -import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR; -import org.apache.sysml.utils.GPUStatistics; -import org.apache.sysml.utils.LRUCacheMap; - -import java.util.HashMap; -import java.util.LinkedList; - -import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T; -import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor; -import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor; -import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor; -import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW; -import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE; -import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr; -import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense; -import static jcuda.jcusparse.JCusparse.cusparseDdense2csr; -import static jcuda.jcusparse.JCusparse.cusparseDnnz; -import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase; -import static jcuda.jcusparse.JCusparse.cusparseSetMatType; -import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode; -import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz; -import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz; -import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO; -import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL; -import static jcuda.runtime.JCuda.cudaDeviceSynchronize; -import static jcuda.runtime.JCuda.cudaFree; -import static jcuda.runtime.JCuda.cudaMalloc; -import static jcuda.runtime.JCuda.cudaMemcpy; -import static jcuda.runtime.JCuda.cudaMemset; -import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; -import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; - -/** - * Handle to a matrix block on the GPU - */ -public class JCudaObject extends GPUObject { - - private static final Log LOG = LogFactory.getLog(JCudaObject.class.getName()); - - // An optional tensor descriptor (and shape) that can be set by a tensor instruction such as convolution, maxpooling - // and exploited by a subsequent non-tensor instruction such as relu - private cudnnTensorDescriptor tensorDescriptor = null; - private int [] tensorShape = null; - - /** - * Returns a previously allocated tensor shape or null - * @return int array of four elements or null - */ - public int [] getTensorShape() { - return tensorShape; - } - - /** - * Returns a previously allocated tensor descriptor or null - * @return cudnn tensor descriptor - */ - public cudnnTensorDescriptor getTensorDescriptor() { - return tensorDescriptor; - } - - /** - * Returns a previously allocated or allocates and returns a tensor descriptor - * @param N number of images - * @param C number of channels - * @param H height - * @param W width - * @return cudnn tensor descriptor - */ - public cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) { - if(tensorDescriptor == null) { - tensorDescriptor = new cudnnTensorDescriptor(); - cudnnCreateTensorDescriptor(tensorDescriptor); - cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W); - tensorShape = new int[4]; - tensorShape[0] = N; - tensorShape[1] = C; - tensorShape[2] = H; - tensorShape[3] = W; - } - return tensorDescriptor; - } - - /** - * Compressed Sparse Row (CSR) format for CUDA - * Generalized matrix multiply is implemented for CSR format in the cuSparse library among other operations - */ - public static class CSRPointer { - - public static cusparseMatDescr matrixDescriptor; - - /** - * @return Singleton default matrix descriptor object - * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO) - */ - public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor(){ - if (matrixDescriptor == null){ - // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java - matrixDescriptor = new cusparseMatDescr(); - cusparseCreateMatDescr(matrixDescriptor); - cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO); - } - return matrixDescriptor; - } - - private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004; - - /** - * Default constructor to help with Factory method {@link #allocateEmpty(long, long)} - */ - private CSRPointer() { - val = new Pointer(); - rowPtr = new Pointer(); - colInd = new Pointer(); - allocateMatDescrPointer(); - } - - /** Number of non zeroes */ - public long nnz; - /** double array of non zero values */ - public Pointer val; - /** integer array of start of all rows and end of last row + 1 */ - public Pointer rowPtr; - /** integer array of nnz values' column indices */ - public Pointer colInd; - /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */ - public cusparseMatDescr descr; - - /** - * Check for ultra sparsity - * - * @param rows number of rows - * @param cols number of columns - * @return true if ultra sparse - */ - public boolean isUltraSparse(int rows, int cols) { - double sp = ((double)nnz/rows/cols); - return sp<ULTRA_SPARSITY_TURN_POINT; - } - - /** - * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL, - * the default that works for DGEMM. - */ - private void allocateMatDescrPointer() { - this.descr = getDefaultCuSparseMatrixDescriptor(); - } - - /** - * Estimate the size of a CSR matrix in GPU memory - * Size of pointers is not needed and is not added in - * @param nnz2 number of non zeroes - * @param rows number of rows - * @return size estimate - */ - public static long estimateSize(long nnz2, long rows) { - long sizeofValArray = getDoubleSizeOf(nnz2); - long sizeofRowPtrArray = getIntSizeOf(rows + 1); - long sizeofColIndArray = getIntSizeOf(nnz2); - long sizeofDescr = getIntSizeOf(4); - // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: - // typedef struct { - // cusparseMatrixType_t MatrixType; - // cusparseFillMode_t FillMode; - // cusparseDiagType_t DiagType; - // cusparseIndexBase_t IndexBase; - // } cusparseMatDescr_t; - long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr; - return tot; - } - - /** - * Factory method to allocate an empty CSR Sparse matrix on the GPU - * @param nnz2 number of non-zeroes - * @param rows number of rows - * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer allocateEmpty(long nnz2, long rows) throws DMLRuntimeException { - assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU"; - CSRPointer r = new CSRPointer(); - r.nnz = nnz2; - if(nnz2 == 0) { - // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object - // with no memory allocated on the GPU. - return r; - } - ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2)); - // increment the cudaCount by 1 for the allocation of all 3 arrays - r.val = allocate(null, getDoubleSizeOf(nnz2), 0); - r.rowPtr = allocate(null, getIntSizeOf(rows + 1), 0); - r.colInd = allocate(null, getIntSizeOf(nnz2), 1); - return r; - } - - /** - * Static method to copy a CSR sparse matrix from Host to Device - * @param dest [input] destination location (on GPU) - * @param rows number of rows - * @param nnz number of non-zeroes - * @param rowPtr integer array of row pointers - * @param colInd integer array of column indices - * @param values double array of non zero values - */ - public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) { - CSRPointer r = dest; - long t0=0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - r.nnz = nnz; - cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice); - cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); - cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice); - if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3); - } - - /** - * Static method to copy a CSR sparse matrix from Device to host - * @param src [input] source location (on GPU) - * @param rows [input] number of rows - * @param nnz [input] number of non-zeroes - * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1) - * @param colInd [output] pre-allocated integer array of column indices of size nnz - * @param values [output] pre-allocated double array of values of size nnz - */ - public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values){ - CSRPointer r = src; - long t0=0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3); - } - - // ============================================================================================== - - // The following methods estimate the memory needed for sparse matrices that are - // results of operations on other sparse matrices using the cuSparse Library. - // The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs - // op = whether to transpose or not - // binaryOperation = For cuSparse, +, - are *(matmul) are supported - - // From CuSparse Manual, - // Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach - // to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1 - // elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC - // and the total number of nonzero elements. In the second step, the user gathers nnzC - //(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr) - // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of - // nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam() - // to complete matrix C. - - /** - * Allocate row pointers of m+1 elements - * - * @param handle a valid {@link cusparseHandle} - * @param C Output matrix - * @param rowsC number of rows in C - * @throws DMLRuntimeException ? - */ - private static void step1AllocateRowPointers(cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException { - cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST); - cudaDeviceSynchronize(); - // Do not increment the cudaCount of allocations on GPU - C.rowPtr = allocate(null, getIntSizeOf((long)rowsC+1), 0); - } - - /** - * Determine total number of nonzero element for the cusparseDgeam operation. - * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) - * - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param B Sparse Matrix B on GPU - * @param C Output Sparse Matrix C on GPU - * @param m Rows in C - * @param n Columns in C - * @throws DMLRuntimeException ? - */ - private static void step2GatherNNZGeam(cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException { - int[] CnnzArray = { -1 }; - cusparseXcsrgeamNnz(handle, m, n, - A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, - B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, - C.descr, C.rowPtr, Pointer.to(CnnzArray)); - cudaDeviceSynchronize(); - if (CnnzArray[0] != -1){ - C.nnz = CnnzArray[0]; - } - else { - int baseArray[] = { 0 }; - cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - C.nnz = CnnzArray[0] - baseArray[0]; - } - } - - /** - * Determine total number of nonzero element for the cusparseDgemm operation. - * - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param transA op - whether A is transposed - * @param B Sparse Matrix B on GPU - * @param transB op - whether B is transposed - * @param C Output Sparse Matrix C on GPU - * @param m Number of rows of sparse matrix op ( A ) and C - * @param n Number of columns of sparse matrix op ( B ) and C - * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B ) - * @throws DMLRuntimeException ? - */ - private static void step2GatherNNZGemm(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException { - int[] CnnzArray = { -1 }; - if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { - throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); - } - cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, - A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, - B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, - C.descr, C.rowPtr, Pointer.to(CnnzArray)); - cudaDeviceSynchronize(); - if (CnnzArray[0] != -1){ - C.nnz = CnnzArray[0]; - } - else { - int baseArray[] = { 0 }; - cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost); - cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - C.nnz = CnnzArray[0] - baseArray[0]; - } - } - - /** - * Allocate val and index pointers. - * - * @param handle a valid {@link cusparseHandle} - * @param C Output sparse matrix on GPU - * @throws DMLRuntimeException ? - */ - private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException { - // Increment cudaCount by one when all three arrays of CSR sparse array are allocated - C.val = allocate(null, getDoubleSizeOf(C.nnz), 0); - C.colInd = allocate(null, getIntSizeOf(C.nnz), 1); - } - - // ============================================================================================== - - - /** - * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation - * C = a op(A) + b op(B) - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param B Sparse Matrix B on GPU - * @param m Rows in A - * @param n Columns in Bs - * @return CSR (compressed sparse row) pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer allocateForDgeam(cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n) - throws DMLRuntimeException{ - if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) { - throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse"); - } - CSRPointer C = new CSRPointer(); - step1AllocateRowPointers(handle, C, m); - step2GatherNNZGeam(handle, A, B, C, m, n); - step3AllocateValNInd(handle, C); - return C; - } - - /** - * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B - * and returns the {@link CSRPointer} to C with the appropriate GPU memory. - * @param handle a valid {@link cusparseHandle} - * @param A Sparse Matrix A on GPU - * @param transA 'T' if A is to be transposed, 'N' otherwise - * @param B Sparse Matrix B on GPU - * @param transB 'T' if B is to be transposed, 'N' otherwise - * @param m Rows in A - * @param n Columns in B - * @param k Columns in A / Rows in B - * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer allocateForMatrixMultiply(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k) - throws DMLRuntimeException{ - // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at - // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java - CSRPointer C = new CSRPointer(); - step1AllocateRowPointers(handle, C, m); - step2GatherNNZGemm(handle, A, transA, B, transB, C, m, n, k); - step3AllocateValNInd(handle, C); - return C; - } - - /** - * Copies this CSR matrix on the GPU to a dense column-major matrix - * on the GPU. This is a temporary matrix for operations such as - * cusparseDcsrmv. - * Since the allocated matrix is temporary, bookkeeping is not updated. - * The caller is responsible for calling "free" on the returned Pointer object - * @param cusparseHandle a valid {@link cusparseHandle} - * @param cublasHandle a valid {@link cublasHandle} - * @param rows number of rows in this CSR matrix - * @param cols number of columns in this CSR matrix - * @return A {@link Pointer} to the allocated dense matrix (in column-major format) - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException { - long size = ((long)rows) * getDoubleSizeOf((long)cols); - Pointer A = JCudaObject.allocate(size); - // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned. - if (val != null && rowPtr != null && colInd != null && nnz > 0) { - // Note: cusparseDcsr2dense method cannot handle empty blocks - cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows); - cudaDeviceSynchronize(); - } else { - LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null"); - } - return A; - } - - /** - * Calls cudaFree lazily on the allocated {@link Pointer} instances - */ - public void deallocate() { - deallocate(false); - } - - /** - * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances - * @param eager whether to do eager or lazy cudaFrees - */ - public void deallocate(boolean eager){ - if (nnz > 0) { - cudaFreeHelper(val, eager); - cudaFreeHelper(rowPtr, eager); - cudaFreeHelper(colInd, eager); - } - } - }; - - private static long getDoubleSizeOf(long numElems) { - return numElems * ((long)jcuda.Sizeof.DOUBLE); - } - - private static long getIntSizeOf(long numElems) { - return numElems * ((long)jcuda.Sizeof.INT); - } - - public synchronized boolean isAllocated() { - return (jcudaDenseMatrixPtr != null || jcudaSparseMatrixPtr != null); - } - - /** Pointer to dense matrix */ - public Pointer jcudaDenseMatrixPtr = null; - /** Pointer to sparse matrix */ - public CSRPointer jcudaSparseMatrixPtr = null; - - public long numBytes; /** Number of bytes occupied by this block on GPU */ - - /** - * Initializes this JCudaObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block - * @param m - */ - JCudaObject(MatrixObject m) { - super(m); - } - - /** - * Allocates a sparse and empty {@link JCudaObject} - * This is the result of operations that are both non zero matrices. - * - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public void allocateSparseAndEmpty() throws DMLRuntimeException{ - setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(0, mat.getNumRows())); - setDeviceModify(0); - } - - - /** - * Allocates a dense matrix of size obtained from the attached matrix metadata - * and fills it up with a single value - * - * @param v value to fill up the dense matrix - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public void allocateAndFillDense(double v) throws DMLRuntimeException { - long rows = mat.getNumRows(); - long cols = mat.getNumColumns(); - int numElems = toIntExact(rows * cols); - long size = getDoubleSizeOf(numElems); - setDenseMatrixCudaPointer(allocate(size)); - setDeviceModify(size); - // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v" - LibMatrixCUDA.kernels.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), jcudaDenseMatrixPtr, v, numElems); - } - - /** - * If this {@link JCudaObject} is sparse and empty - * Being allocated is a prerequisite to being sparse and empty. - * - * @return true if sparse and empty - */ - public boolean isSparseAndEmpty() { - boolean isSparseAndAllocated = isAllocated()&& LibMatrixCUDA.isInSparseFormat(mat); - boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && jcudaSparseMatrixPtr.nnz == 0; - return isEmptyAndSparseAndAllocated; - } - - @Override - public synchronized boolean acquireDeviceRead() throws DMLRuntimeException { - boolean transferred = false; - if(!isAllocated()) { - copyFromHostToDevice(); - transferred = true; - } else { - numLocks.addAndGet(1); - } - if(!isAllocated()) - throw new DMLRuntimeException("Expected device data to be allocated"); - return transferred; - } - - @Override - public synchronized boolean acquireDeviceModifyDense() throws DMLRuntimeException { - boolean allocated = false; - if(!isAllocated()) { - mat.setDirty(true); - // Dense block, size = numRows * numCols - allocateDenseMatrixOnDevice(); - allocated = true; - synchronized(evictionLock) { - JCudaContext.allocatedPointers.add(this); - } - } - isDeviceCopyModified = true; - if(!isAllocated()) - throw new DMLRuntimeException("Expected device data to be allocated"); - return allocated; - } - - @Override - public synchronized boolean acquireDeviceModifySparse() throws DMLRuntimeException { - boolean allocated = false; - isInSparseFormat = true; - if(!isAllocated()) { - mat.setDirty(true); - allocateSparseMatrixOnDevice(); - allocated = true; - synchronized(evictionLock) { - JCudaContext.allocatedPointers.add(this); - } - } - isDeviceCopyModified = true; - if(!isAllocated()) - throw new DMLRuntimeException("Expected device data to be allocated"); - return allocated; - } - - @Override - public synchronized boolean acquireHostRead() throws CacheException { - boolean copied = false; - if(isAllocated()) { - try { - if(isDeviceCopyModified) { - copyFromDeviceToHost(); - copied = true; - } - } catch (DMLRuntimeException e) { - throw new CacheException(e); - } - } - else { - throw new CacheException("Cannot perform acquireHostRead as the GPU data is not allocated:" + mat.getVarName()); - } - return copied; - } - - /** - * updates the locks depending on the eviction policy selected - * @throws CacheException if there is no locked GPU Object - */ - private void updateReleaseLocks() throws CacheException { - if(numLocks.addAndGet(-1) < 0) { - throw new CacheException("Redundant release of GPU object"); - } - if(evictionPolicy == EvictionPolicy.LRU) { - timestamp.set(System.nanoTime()); - } - else if(evictionPolicy == EvictionPolicy.LFU) { - timestamp.addAndGet(1); - } - else if(evictionPolicy == EvictionPolicy.MIN_EVICT) { - // Do Nothing - } - else { - throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name()); - } - } - - /** - * releases input allocated on GPU - * @throws CacheException if data is not allocated - */ - public synchronized void releaseInput() throws CacheException { - updateReleaseLocks(); - if(!isAllocated()) - throw new CacheException("Attempting to release an input before allocating it"); - } - - /** - @Override - void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException { - if(!isAllocated()) { - long start = System.nanoTime(); - if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) { - setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows())); - numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows()); - JCudaContext.deviceMemBytes.addAndGet(-numBytes); - isInSparseFormat = true; - //throw new DMLRuntimeException("Sparse format not implemented"); - } else if(numElemToAllocate == -1) { - // Called for dense input - setDenseMatrixCudaPointer(new Pointer()); - numBytes = mat.getNumRows()*getDoubleSizeOf(mat.getNumColumns()); - cudaMalloc(jcudaDenseMatrixPtr, numBytes); - JCudaContext.deviceMemBytes.addAndGet(-numBytes); - } - else { - // Called for dense output - setDenseMatrixCudaPointer(new Pointer()); - numBytes = getDoubleSizeOf(numElemToAllocate); - if(numElemToAllocate <= 0 || numBytes <= 0) - throw new DMLRuntimeException("Cannot allocate dense matrix object with " + numElemToAllocate + " elements and size " + numBytes); - cudaMalloc(jcudaDenseMatrixPtr, numBytes); - JCudaContext.deviceMemBytes.addAndGet(-numBytes); - } - - GPUStatistics.cudaAllocTime.addAndGet(System.nanoTime()-start); - GPUStatistics.cudaAllocCount.addAndGet(1); - - } - } - */ - - @Override - void allocateDenseMatrixOnDevice() throws DMLRuntimeException { - assert !isAllocated() : "Internal error - trying to allocated dense matrix to a JCudaObject that is already allocated"; - long rows = mat.getNumRows(); - long cols = mat.getNumColumns(); - assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix"; - assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix;"; - long size = getDoubleSizeOf(rows * cols); - Pointer tmp = allocate(size); - setDenseMatrixCudaPointer(tmp); - setDeviceModify(size); - } - - @Override - void allocateSparseMatrixOnDevice() throws DMLRuntimeException { - assert !isAllocated() : "Internal error = trying to allocated sparse matrix to a JCudaObject that is already allocated"; - long rows = mat.getNumRows(); - long nnz = mat.getNnz(); - assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix"; - assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix"; - CSRPointer tmp = CSRPointer.allocateEmpty(nnz, rows); - setSparseMatrixCudaPointer(tmp); - long size = CSRPointer.estimateSize(nnz, rows); - setDeviceModify(size); - } - - /** - * releases output allocated on GPU - * @throws CacheException if data is not allocated - */ - @Override - public synchronized void releaseOutput() throws CacheException { - updateReleaseLocks(); - isDeviceCopyModified = true; - if(!isAllocated()) - throw new CacheException("Attempting to release an output before allocating it"); - } - - @Override - public void setDeviceModify(long numBytes) { - this.numLocks.addAndGet(1); - this.numBytes = numBytes; - ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(-numBytes); - } - - @Override - void deallocateMemoryOnDevice(boolean eager) { - if(jcudaDenseMatrixPtr != null) { - cudaFreeHelper(null, jcudaDenseMatrixPtr, eager); - ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes); - } - if (jcudaSparseMatrixPtr != null) { - jcudaSparseMatrixPtr.deallocate(eager); - ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes); - } - jcudaDenseMatrixPtr = null; - jcudaSparseMatrixPtr = null; - if(tensorDescriptor != null) { - cudnnDestroyTensorDescriptor(tensorDescriptor); - tensorDescriptor = null; - } - numLocks.set(0); - } - - /** - * Thin wrapper over {@link #evict(long)} - * @param size size to check - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - static void ensureFreeSpace(long size) throws DMLRuntimeException { - ensureFreeSpace(null, size); - } - - /** - * Thin wrapper over {@link #evict(long)} - * @param instructionName instructionName name of the instruction for which performance measurements are made - * @param size size to check - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - static void ensureFreeSpace(String instructionName, long size) throws DMLRuntimeException { - if(size >= getAvailableMemory()) { - evict(instructionName, size); - } - } - - @Override - void copyFromHostToDevice() - throws DMLRuntimeException - { - printCaller(); - long start=0; - if (DMLScript.STATISTICS) start = System.nanoTime(); - - MatrixBlock tmp = mat.acquireRead(); - if(tmp.isInSparseFormat()) { - - int rowPtr[] = null; - int colInd[] = null; - double[] values = null; - - tmp.recomputeNonZeros(); - long nnz = tmp.getNonZeros(); - mat.getMatrixCharacteristics().setNonZeros(nnz); - - SparseBlock block = tmp.getSparseBlock(); - boolean copyToDevice = true; - if(block == null && tmp.getNonZeros() == 0) { -// // Allocate empty block --> not necessary -// // To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest -// rowPtr = new int[0]; -// colInd = new int[0]; -// values = new double[0]; - copyToDevice = false; - } - else if(block == null && tmp.getNonZeros() != 0) { - throw new DMLRuntimeException("Expected CP sparse block to be not null."); - } - else { - // CSR is the preferred format for cuSparse GEMM - // Converts MCSR and COO to CSR - SparseBlockCSR csrBlock = null; - long t0=0; - if (block instanceof SparseBlockCSR){ - csrBlock = (SparseBlockCSR)block; - } else if (block instanceof SparseBlockCOO) { - // TODO - should we do this on the GPU using cusparse<t>coo2csr() ? - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - SparseBlockCOO cooBlock = (SparseBlockCOO)block; - csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values()); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet(); - } else if (block instanceof SparseBlockMCSR) { - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block; - csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size())); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseConversionCount.incrementAndGet(); - } else { - throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations"); - } - rowPtr = csrBlock.rowPointers(); - colInd = csrBlock.indexes(); - values = csrBlock.values(); - } - allocateSparseMatrixOnDevice(); - synchronized(evictionLock) { - JCudaContext.allocatedPointers.add(this); - } - if(copyToDevice) { - CSRPointer.copyToDevice(jcudaSparseMatrixPtr, tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values); - } - } - else { - double[] data = tmp.getDenseBlock(); - - if( data == null && tmp.getSparseBlock() != null ) - throw new DMLRuntimeException("Incorrect sparsity calculation"); - else if( data==null && tmp.getNonZeros() != 0 ) - throw new DMLRuntimeException("MatrixBlock is not allocated"); - else if( tmp.getNonZeros() == 0 ) - data = new double[tmp.getNumRows()*tmp.getNumColumns()]; - - // Copy dense block - allocateDenseMatrixOnDevice(); - synchronized(evictionLock) { - JCudaContext.allocatedPointers.add(this); - } - cudaMemcpy(jcudaDenseMatrixPtr, Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice); - } - - mat.release(); - - if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime()-start); - if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(1); - } - - public static int toIntExact(long l) throws DMLRuntimeException { - if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { - throw new DMLRuntimeException("Cannot be cast to int:" + l); - } - return (int) l; - } - - @Override - protected void copyFromDeviceToHost() throws DMLRuntimeException { - if (jcudaDenseMatrixPtr != null && jcudaSparseMatrixPtr != null){ - throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated"); - } - - if(jcudaDenseMatrixPtr != null) { - printCaller(); - long start=0; - if (DMLScript.STATISTICS) start = System.nanoTime(); - MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); - tmp.allocateDenseBlock(); - double [] data = tmp.getDenseBlock(); - - cudaMemcpy(Pointer.to(data), jcudaDenseMatrixPtr, getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost); - - tmp.recomputeNonZeros(); - mat.acquireModify(tmp); - mat.release(); - - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1); - } - else if (jcudaSparseMatrixPtr != null){ - printCaller(); - if(!LibMatrixCUDA.isInSparseFormat(mat)) - throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null"); - - if(this.isSparseAndEmpty()){ - MatrixBlock tmp = new MatrixBlock(); // Empty Block - mat.acquireModify(tmp); - mat.release(); - } else { - long start=0; - if (DMLScript.STATISTICS) start = System.nanoTime(); - - int rows = toIntExact(mat.getNumRows()); - int cols = toIntExact(mat.getNumColumns()); - int nnz = toIntExact(jcudaSparseMatrixPtr.nnz); - int[] rowPtr = new int[rows + 1]; - int[] colInd = new int[nnz]; - double[] values = new double[nnz]; - CSRPointer.copyToHost(jcudaSparseMatrixPtr, rows, nnz, rowPtr, colInd, values); - - SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz); - MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock); - mat.acquireModify(tmp); - mat.release(); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1); - } - } - else { - throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated"); - } - isDeviceCopyModified = false; - } - - @Override - protected long getSizeOnDevice() throws DMLRuntimeException { - long GPUSize = 0; - long rlen = mat.getNumRows(); - long clen = mat.getNumColumns(); - long nnz = mat.getNnz(); - - if(LibMatrixCUDA.isInSparseFormat(mat)) { - GPUSize = CSRPointer.estimateSize(nnz, rlen); - } - else { - GPUSize = getDoubleSizeOf(rlen * clen); - } - return GPUSize; - } - - private String getClassAndMethod(StackTraceElement st) { - String [] str = st.getClassName().split("\\."); - return str[str.length - 1] + "." + st.getMethodName(); - } - - /** - * Convenience debugging method. - * Checks {@link JCudaContext#DEBUG} flag before printing to System.out - */ - private void printCaller() { - if(JCudaContext.DEBUG) { - StackTraceElement[] st = Thread.currentThread().getStackTrace(); - String ret = getClassAndMethod(st[1]); - for (int i = 2; i < st.length && i < 7; i++) { - ret += "->" + getClassAndMethod(st[i]); - } - System.out.println("CALL_STACK:" + ret); - } - } - - /** - * Convenience method to directly examine the Sparse matrix on GPU - * - * @return CSR (compressed sparse row) pointer - */ - public CSRPointer getSparseMatrixCudaPointer() { - return jcudaSparseMatrixPtr; - } - - /** - * Convenience method to directly set the sparse matrix on GPU - * Make sure to call {@link #setDeviceModify(long)} after this to set appropriate state, if you are not sure what you are doing. - * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)} - * @param sparseMatrixPtr CSR (compressed sparse row) pointer - */ - public synchronized void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) { - this.jcudaSparseMatrixPtr = sparseMatrixPtr; - this.isInSparseFormat = true; - if(jcudaDenseMatrixPtr != null) { - cudaFreeHelper(jcudaDenseMatrixPtr); - jcudaDenseMatrixPtr = null; - } - } - - /** - * Convenience method to directly set the dense matrix pointer on GPU - * Make sure to call {@link #setDeviceModify(long)} after this to set appropriate state, if you are not sure what you are doing. - * - * @param densePtr dense pointer - */ - public synchronized void setDenseMatrixCudaPointer(Pointer densePtr){ - this.jcudaDenseMatrixPtr = densePtr; - this.isInSparseFormat = false; - if(jcudaSparseMatrixPtr != null) { - jcudaSparseMatrixPtr.deallocate(); - jcudaSparseMatrixPtr = null; - } - } - - /** - * Converts this JCudaObject from dense to sparse format. - * - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public void denseToSparse() throws DMLRuntimeException { - long t0=0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle; - if(cusparseHandle == null) - throw new DMLRuntimeException("Expected cusparse to be initialized"); - int rows = toIntExact(mat.getNumRows()); - int cols = toIntExact(mat.getNumColumns()); - - if(jcudaDenseMatrixPtr == null || !isAllocated()) - throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call"); - - convertDensePtrFromRowMajorToColumnMajor(); - setSparseMatrixCudaPointer(columnMajorDenseToRowMajorSparse(cusparseHandle, rows, cols, jcudaDenseMatrixPtr)); - // TODO: What if mat.getNnz() is -1 ? - numBytes = CSRPointer.estimateSize(mat.getNnz(), rows); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.addAndGet(1); - } - - /** - * Transposes a dense matrix on the GPU by calling the cublasDgeam operation - * @param densePtr Pointer to dense matrix on the GPU - * @param m rows in ouput matrix - * @param n columns in output matrix - * @param lda rows in input matrix - * @param ldc columns in output matrix - * @return transposed matrix - * @throws DMLRuntimeException if operation failed - */ - public static Pointer transpose(Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException { - Pointer alpha = LibMatrixCUDA.pointerTo(1.0); - Pointer beta = LibMatrixCUDA.pointerTo(0.0); - Pointer A = densePtr; - Pointer C = JCudaObject.allocate(((long)m)*getDoubleSizeOf(n)); - - // Transpose the matrix to get a dense matrix - JCublas2.cublasDgeam(LibMatrixCUDA.cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(), lda, C, ldc); - return C; - } - - /** - * Convenience method. Converts Row Major Dense Matrix --> Column Major Dense Matrix - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException { - int m = toIntExact(mat.getNumRows()); - int n = toIntExact(mat.getNumColumns()); - int lda = n; - int ldc = m; - if(!isAllocated()) { - throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated"); - } - - Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc); - cudaFreeHelper(jcudaDenseMatrixPtr); - setDenseMatrixCudaPointer(tmp); - } - - private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException { - int n = toIntExact(mat.getNumRows()); - int m = toIntExact(mat.getNumColumns()); - int lda = n; - int ldc = m; - if(!isAllocated()) { - throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated"); - } - - Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc); - cudaFreeHelper(jcudaDenseMatrixPtr); - setDenseMatrixCudaPointer(tmp); - } - - /** - * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) - * - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public void sparseToDense() throws DMLRuntimeException { - sparseToDense(null); - } - - /** - * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format) - * Also records per instruction invokation of sparseToDense. - * @param instructionName Name of the instruction for which statistics are recorded in {@link GPUStatistics} - * @throws DMLRuntimeException ? - */ - public void sparseToDense(String instructionName) throws DMLRuntimeException { - long start=0, end=0; - if (DMLScript.STATISTICS) start = System.nanoTime(); - if(jcudaSparseMatrixPtr == null || !isAllocated()) - throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); - - sparseToColumnMajorDense(); - convertDensePtrFromColMajorToRowMajor(); - if (DMLScript.STATISTICS) end = System.nanoTime(); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.addAndGet(1); - } - - /** - * More efficient method to convert sparse to dense but returns dense in column major format - * - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public void sparseToColumnMajorDense() throws DMLRuntimeException { - if(jcudaSparseMatrixPtr == null || !isAllocated()) - throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call"); - - cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle; - if(cusparseHandle == null) - throw new DMLRuntimeException("Expected cusparse to be initialized"); - int rows = toIntExact(mat.getNumRows()); - int cols = toIntExact(mat.getNumColumns()); - setDenseMatrixCudaPointer(jcudaSparseMatrixPtr.toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols)); - numBytes = ((long)rows)*getDoubleSizeOf(cols); - } - - /** - * Convenience method to convert a CSR matrix to a dense matrix on the GPU - * Since the allocated matrix is temporary, bookkeeping is not updated. - * Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT - * Caller is responsible for deallocating memory on GPU. - * - * @param cusparseHandle handle to cusparse library - * @param rows number of rows - * @param cols number of columns - * @param densePtr [in] dense matrix pointer on the GPU in row major - * @return CSR (compressed sparse row) pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static CSRPointer columnMajorDenseToRowMajorSparse(cusparseHandle cusparseHandle, int rows, int cols, Pointer densePtr) throws DMLRuntimeException { - cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor(); - Pointer nnzPerRowPtr = null; - Pointer nnzTotalDevHostPtr = null; - - ensureFreeSpace(getIntSizeOf(rows + 1)); - nnzPerRowPtr = allocate(getIntSizeOf(rows)); - nnzTotalDevHostPtr = allocate(getIntSizeOf(1)); - - // Output is in dense vector format, convert it to CSR - cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr); - cudaDeviceSynchronize(); - int[] nnzC = {-1}; - - long t2=0; - if (DMLScript.STATISTICS) t2 = System.nanoTime(); - cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2); - if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(1); - - if (nnzC[0] == -1){ - throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU"); - } - - CSRPointer C = CSRPointer.allocateEmpty(nnzC[0], rows); - cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd); - cudaDeviceSynchronize(); - - cudaFreeHelper(nnzPerRowPtr); - cudaFreeHelper(nnzTotalDevHostPtr); - - return C; - } - - - /** Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU */ - static LRUCacheMap<Long, LinkedList<Pointer>> freeCUDASpaceMap = new LRUCacheMap<Long, LinkedList<Pointer>>(); - /** To record size of allocated blocks */ - static HashMap<Pointer, Long> cudaBlockSizeMap = new HashMap<Pointer, Long>(); - - - /** - * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. - * @param size size of data (in bytes) to allocate - * @return jcuda pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static Pointer allocate(long size) throws DMLRuntimeException { - return allocate(null, size, 1); - } - - /** - * Convenience method for {@link #allocate(String, long, int)}, defaults statsCount to 1. - * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record - * @param size size of data (in bytes) to allocate - * @return jcuda pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static Pointer allocate(String instructionName, long size) throws DMLRuntimeException { - return allocate(instructionName, size, 1); - } - - /** - * Allocates temporary space on the device. - * Does not update bookkeeping. - * The caller is responsible for freeing up after usage. - * @param instructionName name of instruction for which to record per instruction performance statistics, null if don't want to record - * @param size Size of data (in bytes) to allocate - * @param statsCount amount to increment the cudaAllocCount by - * @return jcuda Pointer - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static Pointer allocate(String instructionName, long size, int statsCount) throws DMLRuntimeException{ - long t0=0, t1=0, end=0; - synchronized (JCudaContext.syncObj) { - Pointer A; - if (freeCUDASpaceMap.containsKey(size)) { - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); - A = freeList.pop(); - if (freeList.isEmpty()) - freeCUDASpaceMap.remove(size); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); - } else { - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - ensureFreeSpace(instructionName, size); - A = new Pointer(); - cudaMalloc(A, size); - ((JCudaContext)(JCudaContext.currContext)).deviceMemBytes.addAndGet(size); - if (DMLScript.STATISTICS) GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaAllocCount.getAndAdd(statsCount); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0); - } - // Set all elements to 0 since newly allocated space will contain garbage - if (DMLScript.STATISTICS) t1 = System.nanoTime(); - cudaMemset(A, 0, size); - if (DMLScript.STATISTICS) end = System.nanoTime(); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); - if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1); - if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1); - cudaBlockSizeMap.put(A, size); - return A; - } - } - - /** - * Does lazy cudaFree calls - * @param toFree {@link Pointer} instance to be freed - */ - public static void cudaFreeHelper(final Pointer toFree) { - cudaFreeHelper(null, toFree, false); - } - - /** - * does lazy/eager cudaFree calls - * @param toFree {@link Pointer} instance to be freed - * @param eager true if to be done eagerly - */ - public static void cudaFreeHelper(final Pointer toFree, boolean eager) { - cudaFreeHelper(null, toFree, eager); - } - - /** - * Does lazy cudaFree calls - * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record - * @param toFree {@link Pointer} instance to be freed - */ - public static void cudaFreeHelper(String instructionName, final Pointer toFree) { - cudaFreeHelper(instructionName, toFree, false); - } - - /** - * Does cudaFree calls, lazily - * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record - * @param toFree {@link Pointer} instance to be freed - * @param eager true if to be done eagerly - */ - public static void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager){ - long t0 = 0; - assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; - long size = cudaBlockSizeMap.get(toFree); - if (eager) { - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - ((JCudaContext)(JCudaContext.currContext)).deviceMemBytes.addAndGet(-size); - cudaFree(toFree); - cudaBlockSizeMap.remove(toFree); - if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaDeAllocCount.addAndGet(1); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t0); - } else { - LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); - if (freeList == null) { - freeList = new LinkedList<Pointer>(); - freeCUDASpaceMap.put(size, freeList); - } - freeList.add(toFree); - } - } - - - /** - * Gets the double array from GPU memory onto host memory and returns string. - * @param A Pointer to memory on device (GPU), assumed to point to a double array - * @param rows rows in matrix A - * @param cols columns in matrix A - * @return the debug string - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException { - StringBuffer sb = new StringBuffer(); - int len = toIntExact(rows * cols); - double[] tmp = new double[len]; - cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost); - int k = 0; - for (int i=0; i<rows; i++){ - for (int j=0; j<cols; j++){ - sb.append(tmp[k]).append(' '); - k++; - } - sb.append('\n'); - } - return sb.toString(); - } -}