Repository: systemml Updated Branches: refs/heads/master 323dd72a8 -> 6de8f051d
[SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse conversion 1. The goal of this PR is not to improve performance (for example: by considering the cost of sparse-to-dense vs FLOPs required given a memory budget) but instead to minimize sparse-to-dense conversion in the GPU matrix multiplication operator. 2. If matmult uses unnecessary sparse-to-dense conversion, then we run the risk of one of two situations: - In the best case, some of the matmult operations won't be pushed to the GPU under the worst-case memory budget. - On the other hand, if these conversions are not accounted for, they may cause OOMs. 3. Every operator (except dense-sparse matrix multiplication) uses only the memory allocated to the input and output matrices. 4. Since there is no CuSPARSE kernel for the dense-sparse matrix multiplication operator, we either have to transpose the output after performing sparse-dense matrix multiplication or perform dense-dense matrix multiplication after converting the sparse input to dense format. Closes #686. 
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6de8f051 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6de8f051 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6de8f051 Branch: refs/heads/master Commit: 6de8f051daaefdab403c0edd3c7beb30c9619033 Parents: 323dd72 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Fri Oct 20 12:29:20 2017 -0800 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Fri Oct 20 13:29:20 2017 -0700 ---------------------------------------------------------------------- .../java/org/apache/sysml/hops/AggBinaryOp.java | 30 +- .../gpu/AggregateBinaryGPUInstruction.java | 3 +- .../instructions/gpu/GPUInstruction.java | 1 + .../instructions/gpu/context/CSRPointer.java | 7 +- .../instructions/gpu/context/GPUObject.java | 2 +- .../runtime/matrix/data/LibMatrixCUDA.java | 541 +------------------ .../runtime/matrix/data/LibMatrixCuMatMult.java | 480 ++++++++++++++++ .../test/gpu/MatrixMultiplicationOpTest.java | 98 +++- 8 files changed, 581 insertions(+), 581 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/hops/AggBinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java index cfa99a4..d733d6a 100644 --- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java @@ -371,40 +371,16 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop double ret = 0; if (isGPUEnabled()) { - // In GPU Mode, intermediate memory is only needed in case of one of the matrix blocks is sparse - // When sparse block is converted to dense and a dense MM takes place, we need (dim1 * dim2) - // When dense block is converted to 
sparse and a sparse MM takes place, we need (dim1 * dim2 * 2) - Hop in1 = _input.get(0); Hop in2 = _input.get(1); double in1Sparsity = OptimizerUtils.getSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz()); double in2Sparsity = OptimizerUtils.getSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz()); - boolean in1Sparse = in1Sparsity < MatrixBlock.SPARSITY_TURN_POINT; boolean in2Sparse = in2Sparsity < MatrixBlock.SPARSITY_TURN_POINT; - - boolean in1UltraSparse = in1Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT; - boolean in2UltraSparse = in2Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT; - - // For Matmult X * Y, if X is sparse, Y is dense, X is converted to dense - // If X is ultrasparse, Y is converted to sparse - if (in1Sparse ^ in2Sparse) { // one sparse, one dense - if (in1Sparse) { - if (in1UltraSparse) { - ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz()); - } else { - ret += OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz()); - } - } else if (in2Sparse) { - if (in2UltraSparse) { - ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz()); - } else { - ret += OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz()); - } - } - + if(in1Sparse && !in2Sparse) { + // Only in sparse-dense cases, we need additional memory budget for GPU + ret += OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0); } - } //account for potential final dense-sparse transformation (worst-case sparse representation) http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java index 
dfc000f..29a1ead 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java @@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.LibMatrixCuMatMult; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator; import org.apache.sysml.runtime.matrix.operators.AggregateOperator; @@ -94,7 +95,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction { int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns()); ec.setMetaData(_output.getName(), rlen, clen); - LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); + LibMatrixCuMatMult.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed); //release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index ed2a4a2..9dd163d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -78,6 +78,7 @@ public abstract class GPUInstruction extends Instruction { public final static String 
MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = "Mdmdm"; // time spent in matrix mult of dense matrices public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = "Msmdv"; // time spent in matrix mult of sparse matrix and dense vector public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm"; // time spent in matrix mult of sparse matrices + public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB = "Msmdm"; // time spent in matrix mult of sparse matrix and dense matrix public final static String MISC_TIMER_SYRK_LIB = "Msyrk"; // time spent in symmetric rank-k update // Other BLAS instructions http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index 5a6e21c..7176a9c 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -37,7 +37,9 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.utils.GPUStatistics; +import org.apache.sysml.utils.Statistics; import jcuda.Pointer; import jcuda.Sizeof; @@ -495,11 +497,13 @@ public class CSRPointer { * @param cublasHandle a valid {@link cublasHandle} * @param rows number of rows in this CSR matrix * @param cols number of columns in this CSR matrix + * @param instName name of the invoking instruction to record {@link Statistics}. 
* @return A {@link Pointer} to the allocated dense matrix (in column-major format) * @throws DMLRuntimeException if DMLRuntimeException occurs */ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, - int cols) throws DMLRuntimeException { + int cols, String instName) throws DMLRuntimeException { + long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0; LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext()); long size = ((long) rows) * getDoubleSizeOf((long) cols); @@ -512,6 +516,7 @@ public class CSRPointer { } else { LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null"); } + if (GPUStatistics.DISPLAY_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); return A; } http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index feb34bc..06327db 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -458,7 +458,7 @@ public class GPUObject { throw new DMLRuntimeException("Expected cusparse to be initialized"); int rows = toIntExact(mat.getNumRows()); int cols = toIntExact(mat.getNumColumns()); - setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols)); + setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null)); } /** 
http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index f4a00ab..7e25299 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -22,18 +22,11 @@ package org.apache.sysml.runtime.matrix.data; import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N; import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T; import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc; -import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm; -import static jcuda.jcusparse.JCusparse.cusparseDcsrmv; -import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE; -import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE; import static jcuda.runtime.JCuda.cudaMemcpy; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; -import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; @@ -175,11 +168,11 @@ public class LibMatrixCUDA { } - private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{ + protected static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{ return gCtx.getCusparseHandle(); } - private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException { 
+ protected static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException { return gCtx.getCublasHandle(); } @@ -410,7 +403,7 @@ public class LibMatrixCUDA { throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); if(isInSparseFormat(gCtx, left)) { // For sparse TSMM, invoke matmult (TODO: possible performance improvement) - matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed); + LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed); return; } @@ -481,534 +474,6 @@ public class LibMatrixCUDA { //******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/ //********************************************************************/ - //********************************************************************/ - //***************** MATRIX MULTIPLY Functions ************************/ - //********************************************************************/ - - /** - * Matrix multiply on GPU - * Examines sparsity and shapes and routes call to appropriate method - * from cuBLAS or cuSparse - * C = op(A) x op(B) - * <p> - * Memory Requirements - - * Both dense - inputs, output, no intermediate - * Both sparse - inputs, output, no intermediate - * One sparse, one dense - inputs, output, intermediates - (input_dim1 * input_dim2) OR (input_dim1 * input_dim2 + input in sparse format) - * - * @param ec Current {@link ExecutionContext} instance - * @param gCtx a valid {@link GPUContext} - * @param instName name of the invoking instruction to record{@link Statistics}. 
- * @param left Matrix A - * @param right Matrix B - * @param outputName Name of the output matrix C (in code generated after LOP layer) - * @param isLeftTransposed op for A, transposed or not - * @param isRightTransposed op for B, tranposed or not - * @throws DMLRuntimeException if DMLRuntimeException occurs - * @return output of matrix multiply - */ - public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { - if (ec.getGPUContext(0) != gCtx) - throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - if(LOG.isTraceEnabled()) { - LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx); - } - if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated()) - throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated()); - - boolean bothDense = !left.getGPUObject(gCtx).isSparse() && !right.getGPUObject(gCtx).isSparse(); - boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse(); - - MatrixObject output = ec.getMatrixObject(outputName); - - long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows(); - long outCLen = isRightTransposed ? 
right.getNumRows() : right.getNumColumns(); - - if (bothDense) { // Dense C = Dense A * Dense B - // For both dense, do cuBLAS - getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix - denseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed); - } - else if (bothSparse){ // Sparse C = Sparse A * Sparse B - ec.allocateGPUMatrixObject(outputName, outRLen, outCLen); - bothSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed); - } - else { // Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B - // Convert the dense to sparse and use the cusparseDcsrgemm routine - ec.allocateGPUMatrixObject(outputName, outRLen, outCLen); - eitherSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed); - } - - return output; - } - - /** - * One of the matrices is sparse, the other dense - * C = op(A) x op(B) - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param output allocated output object for C on host to which GPU output will be attached - * @param left Matrix A on host - * @param right Matrix B on host - * @param isLeftTransposed op for A, tranposed or not - * @param isRightTransposed op for B, transposed or not - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { - - int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ; - int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns()); - int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns()); - int k1 = toInt(isRightTransposed ? 
right.getNumColumns() : right.getNumRows()); - if(k != k1) - throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1); - - if(m == -1 || n == -1 || k == -1) - throw new DMLRuntimeException("Incorrect dimensions"); - - - if (left.getGPUObject(gCtx).isSparse()) { - // Left sparse, right dense - sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k); - } else { - // Left dense, right sparse - denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k); - } - } - - /** - * C = op(A) * op(B) where A is dense and B is sparse - * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked - * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked. - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param left {@link MatrixObject} of A - * @param right {@link MatrixObject} of B - * @param output {@link MatrixObject} of the output matrix C - * @param isLeftTransposed whether matrix A needs to be transposed - * @param isRightTransposed whether matrix B needs to be transposed - * @param m ? - * @param n ? - * @param k ? 
- * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output, - boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) - throws DMLRuntimeException { - // right sparse, left dense - CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - Pointer ADense = getDensePointer(gCtx, left, instName); - if (B.isUltraSparse(k, n)){ - if(LOG.isTraceEnabled()) { - LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx); - } - - // Convert left to CSR and do cuSparse matmul - int rowsA = (int)left.getNumRows(); - int colsA = (int)left.getNumColumns(); - - long t0=0,t1=0, t2=0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0); - - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1); - - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1); - sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k); - - if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); - A.deallocate(); - gCtx.cudaFreeHelper(AT); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); - - } else { - if(LOG.isTraceEnabled()) { - LOG.trace(" GPU : Convert d M %*% sp M --> d M 
%*% d M" + ", GPUContext=" + gCtx); - } - // Convert right to dense and do a cuBlas matmul - // BDenseTransposed is a column major matrix - // Note the arguments to denseDenseMatmult to accommodate for this. - long t0=0, t1=0; - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns()); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0); - - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix - if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1); - Pointer C = getDensePointer(gCtx, output, instName); - denseDenseMatmult(gCtx, instName, C, - toInt(left.getNumRows()), toInt(left.getNumColumns()), - toInt(right.getNumColumns()), toInt(right.getNumRows()), - isLeftTransposed, !isRightTransposed, - ADense, BDenseTransposed); - - gCtx.cudaFreeHelper(instName, BDenseTransposed); - } - } - - /** - * * C = op(A) * op(B) where A is sparse and B is dense - * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked - * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked. - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. 
- * @param output the output matrix object - * @param left matrix A - * @param right matrix B - * @param isLeftTransposed if A needs to be transposed - * @param isRightTransposed if B needs to be transposed - * @param m ? - * @param n ? - * @param k ? - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) - throws DMLRuntimeException { - CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - Pointer BDense = getDensePointer(gCtx, right, instName); - - if (n == 1){ - // Sparse Matrix - Dense Vector multiply - sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns()); - - } else { - - long t0=0, t1=0, t2=0; - // Sparse Matrix Dense Matrix multiply - if (A.isUltraSparse(m, k)){ - if(LOG.isTraceEnabled()) { - LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx); - } - // Convert right to CSR and do cuSparse matmul - int rowsB = (int)right.getNumRows(); - int colsB = (int)right.getNumColumns(); - - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0); - - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1); - - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0); - if (DMLScript.STATISTICS) 
GPUStatistics.cudaDenseToSparseCount.add(1); - - sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k); - - if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); - B.deallocate(); - gCtx.cudaFreeHelper(BT); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); - - } else { - if(LOG.isTraceEnabled()) { - LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx); - } - // Convert left to dense and do a cuBlas matmul - // ADenseTransposed is a column major matrix - // Note the arguments to denseDenseMatmult to accommodate for this. - if (DMLScript.STATISTICS) t0 = System.nanoTime(); - Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns()); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0); - - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix - if (allocated && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1); - - Pointer C = getDensePointer(gCtx, output, instName); - denseDenseMatmult(gCtx, instName, C, - toInt(left.getNumColumns()), toInt(left.getNumRows()), - toInt(right.getNumRows()), toInt(right.getNumColumns()), - !isLeftTransposed, isRightTransposed, - ADenseTransposed, BDense); - - gCtx.cudaFreeHelper(instName, ADenseTransposed); - } - } - } - - /** - * C = op(A) x B - * A is a sparse matrix, B is 
a dense vector - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param output allocated output on the host, to which the GPU output C will be attached - * @param A sparse matrix A on the GPU - * @param B_dense dense matrix/vector B on the GPU - * @param isATranposed op for A, tranposed or not - * @param m number of rows in A (not op(A)) - * @param k number of cols in A or number of rows in B (not op(A) or op(B)) - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, - int m, int k) throws DMLRuntimeException { - if(LOG.isTraceEnabled()) { - LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx); - } - int transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - long size = m * Sizeof.DOUBLE; - if (isATranposed){ - size = k * Sizeof.DOUBLE; - transA = CUSPARSE_OPERATION_TRANSPOSE; - } - Pointer C_dense = gCtx.allocate(instName, (int)size); - long t1=0; - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int)A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense); - //cudaDeviceSynchronize; // Since cusparseDcsrmv is asynchronously executed - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1); - - output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense); - } - - /** - * Sparse C = Sparse op(A) * Sparse op(B) - * Reroutes call to sparse matrix-vector mult if needed - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param output ? - * @param instName name of the invoking instruction to record{@link Statistics}. - * @param left ? 
- * @param right ? - * @param isLeftTransposed ? - * @param isRightTransposed ? - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { - int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ; - int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns()); - int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns()); - int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows()); - if(k != k1) - throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1); - - if(m == -1 || n == -1 || k == -1) - throw new DMLRuntimeException("Incorrect dimensions"); - - CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - - // TODO if (m == 1) { // Vector-matrix multiplication - - if (!isRightTransposed && right.getNumColumns() == 1){ // Matrix-Vector multiplication - sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B); - } else { // Matrix-Matrix multiplication - sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k); - } - } - - /** - * Does a sparse matrix-vector multiply. - * C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1. - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. 
- * @param output allocated output object C to which the GPU output matrix will be attached - * @param isATranposed if A is to be transposed or not (the op in op(A)) - * @param m number of rows in A (not op(A)) - * @param n number of cols in A (not op(A)) - * @param k number of rows in B, (cols in B is assumed to be 1) - * @param A left sparse matrix on GPU - * @param B right sparse vector on GPU - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k, - CSRPointer A, CSRPointer B) throws DMLRuntimeException { - long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); - sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, k); - } - - /** - * Does a sparse-sparse Matrix multiply - * C = op(A) x op(B), A, B are sparse matrices - * - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. 
- * @param A left sparse matrix on GPU - * @param B right sparse matrix on GPU - * @param output allocated output object on host to which the GPU output matrix will be attached - * @param isLeftTransposed op for A - to be transposed or not - * @param isRightTransposed op for B - * @param m number of rows in op(A) - * @param n number of cols in op(B) - * @param k number of cols in op(A) or rows in op(B) - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output, - boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException { - if(LOG.isTraceEnabled()) { - LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx); - } - - int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - - long t0=0, t1=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB, m, n, k); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0); - - output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); - - if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); - cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k, - A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, - B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd, - C.descr, C.val, C.rowPtr, C.colInd); - //cudaDeviceSynchronize; - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1); - } - - /** - * Dense dense matrix multiply - * C = op(A) * op(B), A and B are dense matrices - * - * @param gCtx 
a valid {@link GPUContext} - * @param instName name of the invoking instruction to record{@link Statistics}. - * @param output output object C on host with GPU data allocated - * @param left left matrix A (in row-major order) - * @param right right matrix B (in row-major order) - * @param isLeftTransposed op for A, transposed or not - * @param isRightTransposed op for B, transposed or not - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { - - Pointer leftPtr = getDensePointer(gCtx, left, instName); - Pointer rightPtr = getDensePointer(gCtx, right, instName); - - int leftRows = toInt(left.getNumRows()); - int leftCols = toInt(left.getNumColumns()); - int rightRows = toInt(right.getNumRows()); - int rightCols = toInt(right.getNumColumns()); - Pointer C = getDensePointer(gCtx, output, instName); - denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed, - leftPtr, rightPtr); - } - - /** - * Dense-dense matrix multiply - * C = op(A) * op(B), A and B are dense matrices - * On the host, the matrices are in row-major format; cuBLAS expects them in column-major format. - * What we have as input is t(A) and t(B), t(X) = transpose of X. - * We do t(B) %*% t(A) to get t(C); - * If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format. - * What we really want is t(C). This we already have as the result of t(B) %*% t(A). - * - * @param gCtx a valid {@link GPUContext} - * @param instName name of the invoking instruction to record{@link Statistics}. 
- * @param output output allocated on GPU in column major format - * @param leftRows1 number of rows in A - * @param leftCols1 number of cols in A - * @param rightRows1 number of rows in B - * @param rightCols1 number of cols in B - * @param isLeftTransposed1 op for A, transposed or not - * @param isRightTransposed1 op for B, transposed or not - * @param leftPtr A allocated on the GPU in row-major format - * @param rightPtr B allocated on the GPU in row-major format - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, - int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) - throws DMLRuntimeException { - if(LOG.isTraceEnabled()) { - LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx); - } - - Pointer A = rightPtr; - Pointer B = leftPtr; - - // To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects) - int leftRows = rightCols1; - int leftCols = rightRows1; - int rightRows = leftCols1; - int rightCols = leftRows1; - - boolean isLeftTransposed = isRightTransposed1; - boolean isRightTransposed = isLeftTransposed1; - - // Note: the dimensions are swapped - int m = isLeftTransposed ? leftCols : leftRows ; - int n = isRightTransposed ? rightRows : rightCols; - int k = isLeftTransposed ? leftRows : leftCols; - int k1 = isRightTransposed ? rightCols : rightRows; - if(k != k1) - throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1); - - if(m == -1 || n == -1 || k == -1) - throw new DMLRuntimeException("Incorrect dimensions"); - - double[] one = { 1 }; - double[] zero = { 0 }; - - //int lda = leftRows; - //int ldb = leftCols; - int lda = isLeftTransposed ? k : m; - int ldb = isRightTransposed ? n : k; - int ldc = m; - - int transa = isLeftTransposed ? 
cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N; - int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N; - - long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - Pointer C = output; - if (m == 1 && n == 1){ - // Vector product - LOG.debug(" GPU Dense-dense Vector Product"); - double[] result = {0}; - JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result)); - // By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST. - // This means that scalar values passed are on host (as opposed to on device). - // The result is copied from the host back to the device so that the rest of - // infrastructure can treat it uniformly. - cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0); - } else if (m == 1) { - // Vector-matrix multiply - LOG.debug(" GPU Dense Vector-Matrix Multiply"); - transb = isRightTransposed ? 
cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; - JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0); - } else if (n == 1){ - // Matrix-vector multiply - LOG.debug(" GPU Dense Matrix-Vector Multiply"); - JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0); - } else { - LOG.debug(" GPU Dense-Dense Matrix Multiply "); - JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0); - } - } - - //********************************************************************/ - //***************** END OF MATRIX MULTIPLY Functions *****************/ - //********************************************************************/ - //********************************************************************/ //**************** UNARY AGGREGATE Functions ************************/ http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java new file mode 100644 index 0000000..21a2a35 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java 
@@ -0,0 +1,480 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE; +import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE; +import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; +import jcuda.Pointer; +import jcuda.Sizeof; +import jcuda.jcublas.JCublas2; +import jcuda.jcublas.cublasHandle; +import jcuda.jcublas.cublasOperation; +import jcuda.jcusparse.JCusparse; +import jcuda.jcusparse.cusparseHandle; +import jcuda.runtime.JCuda; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.utils.GPUStatistics; 
+import org.apache.sysml.utils.Statistics; + +public class LibMatrixCuMatMult extends LibMatrixCUDA { + + private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName()); + + private static class CuMatMultParameters { + /* + * For the operation, C = op(A) %*% op(B), the below parameters are used + * to invoke the corresponding kernels in CuBLAS and CuSPARSE. + * + * All the below values have to be valid or else this class has to throw + * an exception. No special values like -1 for unknowns allowed. + */ + public int m; // number of rows of matrix op(A) and C. + public int n; // number of columns of matrix op(B) and C. + public int k; // number of columns of op(A) and rows of op(B). + public int lda; // leading dimension of two-dimensional array used to + // store the matrix A. + public int ldb; // leading dimension of two-dimensional array used to + // store matrix B. + public int ldc; // leading dimension of a two-dimensional array used to + // store the matrix C. 
+ public long leftNumRows; // number of rows of A + public long leftNumCols; // number of cols of A + public long rightNumRows; // number of rows of B + public long rightNumCols; // number of cols of B + private boolean isLeftTransposed; // is op(A) = t(A) + private boolean isRightTransposed; // is op(B) = t(B) + + public CuMatMultParameters(long leftNumRows1, long leftNumCols1, long rightNumRows1, long rightNumCols1, + boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException { + leftNumRows = leftNumRows1; + leftNumCols = leftNumCols1; + rightNumRows = rightNumRows1; + rightNumCols = rightNumCols1; + isLeftTransposed = isLeftTransposed1; + isRightTransposed = isRightTransposed1; + setDimensions(); + } + + public void rowToColumnMajor() throws DMLRuntimeException { + // To compensate for the input matrices being in row-major format + // instead of column-major (the way cublas expects) + isRightTransposed = swap(isLeftTransposed, isLeftTransposed = isRightTransposed); + rightNumCols = swap(leftNumRows, leftNumRows = rightNumCols); + rightNumRows = swap(leftNumCols, leftNumCols = rightNumRows); + setDimensions(); + } + + private void validate() throws DMLRuntimeException { + int k1 = toInt(isRightTransposed ? rightNumCols : rightNumRows); + if (k != k1) + throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1 + " [" + leftNumRows + "," + + leftNumCols + "," + rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " " + + isRightTransposed); + } + + private void setDimensions() throws DMLRuntimeException { + // Validate the dimensions + m = toInt(isLeftTransposed ? leftNumCols : leftNumRows); + n = toInt(isRightTransposed ? rightNumRows : rightNumCols); + k = toInt(isLeftTransposed ? leftNumRows : leftNumCols); + lda = isLeftTransposed ? k : m; + ldb = isRightTransposed ? 
n : k; + ldc = m; + if (m == -1 || n == -1 || k == -1) + throw new DMLRuntimeException("Incorrect dimensions"); + } + } + + /** + * Matrix multiply on GPU Examines sparsity and shapes and routes call to + * appropriate method from cuBLAS or cuSparse C = op(A) x op(B) + * + * The user is expected to call + * ec.releaseMatrixOutputForGPUInstruction(outputName); + * + * @param ec + * Current {@link ExecutionContext} instance + * @param gCtx + * a valid {@link GPUContext} + * @param instName + * name of the invoking instruction to record{@link Statistics}. + * @param left + * Matrix A + * @param right + * Matrix B + * @param outputName + * Name of the output matrix C (in code generated after LOP + * layer) + * @param isLeftTransposed + * op for A, transposed or not + * @param isRightTransposed + * op for B, tranposed or not + * @throws DMLRuntimeException + * if DMLRuntimeException occurs + * @return output of matrix multiply + */ + public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, + MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed) + throws DMLRuntimeException { + boolean isM1Sparse = isInSparseFormat(gCtx, left); + boolean isM2Sparse = isInSparseFormat(gCtx, right); + MatrixObject output = ec.getMatrixObject(outputName); + long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows(); + long outCLen = isRightTransposed ? 
right.getNumRows() : right.getNumColumns(); + + CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(), + right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed); + + if (isM1Sparse && isM2Sparse) { + // ------------------------------------------------------------------------------------- + // sparse-sparse matrix multiplication + params.validate(); + int transa = cusparseOp(isLeftTransposed); + int transb = cusparseOp(isRightTransposed); + + // Step 1: Allocate output => sparse format + ec.allocateGPUMatrixObject(outputName, outRLen, outCLen); + + // Step 2: Get the handles to sparse/dense pointers for left, right + // and output + CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); + CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); + long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb, + params.m, params.n, params.k); + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, + System.nanoTime() - t0); + + // Step 3: Invoke the kernel + long t1 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr, + (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, + C.val, C.rowPtr, C.colInd); + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, + System.nanoTime() - t1); + output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); + // ------------------------------------------------------------------------------------- + } else if (!isM1Sparse && isM2Sparse) { + // ------------------------------------------------------------------------------------- + // dense-sparse matrix multiplication + // Step 1: Allocate output => dense format + getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); + + // Step 2: Get the handles to sparse/dense pointers for left, right + // and output + Pointer A = getDensePointer(gCtx, left, instName); + CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); + Pointer C = getDensePointer(gCtx, output, instName); + + // Step 3: Invoke the kernel + denseSparseMatMult(getCusparseHandle(gCtx), instName, C, A, B, params); + // ------------------------------------------------------------------------------------- + } else if (isM1Sparse && !isM2Sparse) { + // ------------------------------------------------------------------------------------- + // sparse-dense matrix multiplication + // Step 1: Allocate output => dense format + getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); + + // Step 2: Get the handles to sparse/dense pointers for left, right + // and output + CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); + Pointer B = getDensePointer(gCtx, right, instName); + Pointer C = getDensePointer(gCtx, output, instName); + + // Step 3: Invoke the kernel + sparseDenseMatMult(gCtx, instName, C, A, B, 
left.getNumRows(), left.getNumColumns(), right.getNumRows(), + right.getNumColumns(), outRLen, outCLen, isLeftTransposed, isRightTransposed); + // ------------------------------------------------------------------------------------- + } else { + // ------------------------------------------------------------------------------------- + // dense-dense matrix multiplication + // Step 1: Allocate output => dense format + getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); + + // Step 2: Get the handles to sparse/dense pointers for left, right + // and output + Pointer A = getDensePointer(gCtx, left, instName); + Pointer B = getDensePointer(gCtx, right, instName); + Pointer C = getDensePointer(gCtx, output, instName); + + // Step 3: Invoke the kernel + denseDenseMatMult(getCublasHandle(gCtx), instName, C, A, B, params); + // ------------------------------------------------------------------------------------- + } + return output; + } + + /** + * Internal method to invoke the appropriate CuSPARSE kernel for matrix + * multiplication for operation: C = op(A) * op(B) This assumes B and C are + * allocated in dense row-major format and A is sparse. + * + * Other than input and output, this method requires additional memory = + * outRLen * outCLen * Sizeof.DOUBLE + * + * @param gCtx + * a valid {@link GPUContext} + * @param instName + * name of the invoking instruction to record{@link Statistics}. 
+ * @param C + * output matrix pointer + * @param A + * left matrix pointer + * @param B + * right matrix pointer + * @param leftNumRows + * number of rows of A + * @param leftNumColumns + * number of cols of A + * @param rightNumRows + * number of rows of B + * @param rightNumColumns + * number of cols of B + * @param outRLen + * number of rows of C + * @param outCLen + * number of cols of C + * @param isLeftTransposed + * is op(A) = t(A) + * @param isRightTransposed + * is op(B) = t(B) + * @throws DMLRuntimeException + * if error + */ + private static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B, + long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen, + boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { + // t(C) = t(B) %*% t(A) + Pointer output = null; + if (outRLen != 1 && outCLen != 1) { + output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE); + } else { + // no transpose required for vector output + output = C; + } + CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows, + leftNumColumns, !isRightTransposed, !isLeftTransposed); + denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params); + if (outRLen != 1 && outCLen != 1) { + // Transpose: C = t(output) + long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T, + toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(), + toInt(outRLen), C, toInt(outCLen)); + if (!DMLScript.EAGER_CUDA_FREE) + JCuda.cudaDeviceSynchronize(); + gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE); + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() + - t0); + } + } + + /** + * Internal method to invoke the appropriate CuSPARSE kernel for matrix + * multiplication for operation: C = op(A) * op(B) This assumes B and C are + * allocated in dense row-major format and A is sparse. + * + * @param handle + * cusparse handle + * @param instName + * name of the invoking instruction to record{@link Statistics}. + * @param C + * output matrix pointer + * @param A + * left matrix pointer + * @param B + * right matrix pointer + * @param param + * BLAS parameters + * @throws DMLRuntimeException + * if error + */ + private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B, + CuMatMultParameters param) throws DMLRuntimeException { + long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + String kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB; + // Ignoring sparse vector dense matrix multiplication and dot product + boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed) + || (param.leftNumCols == 1 && param.isLeftTransposed); + if (isVector) { + LOG.debug(" GPU Sparse-Dense Matrix Vector "); + int m = toInt(param.rightNumRows); + int n = toInt(param.rightNumCols); + int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed)); + JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A, + zero(), C); + kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB; + } else { + int m = toInt(param.rightNumRows); + int k = toInt(param.rightNumCols); + param.rowToColumnMajor(); + param.validate(); + int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed)); + int transb = cusparseOp(param.isRightTransposed); + LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) "); + JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val, + B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc); + } + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0); + } + + /** + * Internal method to invoke the appropriate CuBLAS kernel for matrix + * multiplication for operation: C = op(A) * op(B) This assumes A, B and C + * are allocated in dense format. The caller is expected to invoke + * params.rowToColumnMajor(). + * + * @param handle + * cublas handle + * @param instName + * name of the invoking instruction to record{@link Statistics}. 
+ * @param C + * output matrix pointer + * @param A + * left matrix pointer + * @param B + * right matrix pointer + * @param param + * BLAS parameters + * @throws DMLRuntimeException + * if error + */ + private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B, + CuMatMultParameters param) throws DMLRuntimeException { + long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + String kernel = null; + param.rowToColumnMajor(); + param.validate(); + int transa = cublasOp(param.isLeftTransposed); + int transb = cublasOp(param.isRightTransposed); + B = swap(A, A = B); + if (param.m == 1 && param.n == 1) { + // Vector product + LOG.debug(" GPU Dense-dense Vector Product"); + double[] result = { 0 }; + JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result)); + // By default in CuBlas V2, cublas pointer mode is set to + // CUBLAS_POINTER_MODE_HOST. + // This means that scalar values passed are on host (as opposed to + // on device). + // The result is copied from the host back to the device so that the + // rest of + // infrastructure can treat it uniformly. + cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice); + kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB; + } else if (param.m == 1) { + // Vector-matrix multiply + LOG.debug(" GPU Dense Vector-Matrix Multiply"); + transb = reverseCublasOp(transb); + int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n; + int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k; + JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1); + kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB; + } else if (param.n == 1) { + // Matrix-vector multiply + LOG.debug(" GPU Dense Matrix-Vector Multiply"); + int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? 
param.m : param.k; + int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m; + JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1); + kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB; + } else { + LOG.debug(" GPU Dense-Dense Matrix Multiply "); + JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb, + zero(), C, param.ldc); + kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB; + } + if (GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0); + } + + // Convenient methods to swap two values + // Usage: y = swap(x, x=y); + private static long swap(long x, long y) { + return x; + } + + private static boolean swap(boolean x, boolean y) { + return x; + } + + private static Pointer swap(Pointer x, Pointer y) { + return x; + } + + /** + * Convenient wrapper to return appropriate cuSPARSE trans value + * + * @param isTransposed + * is op(input) = t(input) + * @return CUSPARSE_OPERATION_TRANSPOSE or CUSPARSE_OPERATION_NON_TRANSPOSE + */ + private static int cusparseOp(boolean isTransposed) { + return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + } + + /** + * Convenient wrapper to return appropriate cuBLAS trans value + * + * @param isTransposed + * is op(input) = t(input) + * @return CUBLAS_OP_T or CUBLAS_OP_N + */ + private static int cublasOp(boolean isTransposed) { + return isTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N; + } + + /** + * Flips the cuBLAS trans value + * + * @param trans + * can be CUBLAS_OP_T or CUBLAS_OP_N + * @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T + */ + private static int reverseCublasOp(int trans) { + return trans == cublasOperation.CUBLAS_OP_T ? 
cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; + } + + /** + * Flips the cuSPARSE trans value + * + * @param trans + * can be CUSPARSE_OPERATION_NON_TRANSPOSE or + * CUSPARSE_OPERATION_TRANSPOSE + * @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is + * CUSPARSE_OPERATION_TRANSPOSE else CUSPARSE_OPERATION_TRANSPOSE + */ + private static int reverseCusparseOp(int trans) { + return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java index 81bc254..d983716 100644 --- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java +++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java @@ -50,9 +50,81 @@ public class MatrixMultiplicationOpTest extends GPUTests { public void matrixMatrixTest1() { String scriptStr = "O = X %*% Y"; - int[] X1 = { 1, 128, 513, 1024 }; - int[] X2 = { 128, 512, 1024 }; - int[] Y2 = { 1, 128, 513, 1024 }; + int[] X1 = { 1, 128, 1024 }; + int[] X2 = { 1, 128, 1024 }; + int[] Y2 = { 1, 128, 1024 }; + double[] SX = { 0.0, 0.03, 0.3, 0.9 }; + double[] SY = { 0.0, 0.03, 0.3, 0.9 }; + + for (int x1 = 0; x1 < X1.length; x1++) { + for (int x2 = 0; x2 < X2.length; x2++) { + int y1 = x2; + for (int y2 = 0; y2 < Y2.length; y2++) { + for (int sx = 0; sx < SX.length; sx++) { + for (int sy = 0; sy < SY.length; sy++) { + assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]); + } + } + } + } + } + } + + @Test + public void commonCaseMLMatrixMatrixTest1() { + String scriptStr = "O = X %*% Y"; + + int[] X1 = { 1000000 }; + int[] X2 = { 1000 }; + int[] Y2 = { 1, 20 }; 
+ double[] SX = { 0.0, 0.03, 0.3 }; + double[] SY = { 0.0, 0.03, 0.3, 0.9 }; + + for (int x1 = 0; x1 < X1.length; x1++) { + for (int x2 = 0; x2 < X2.length; x2++) { + int y1 = x2; + for (int y2 = 0; y2 < Y2.length; y2++) { + for (int sx = 0; sx < SX.length; sx++) { + for (int sy = 0; sy < SY.length; sy++) { + assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]); + } + } + } + } + } + } + + @Test + public void commonCaseDLMatrixMatrixTest1() { + String scriptStr = "O = X %*% Y"; + + int[] X1 = { 100 }; + int[] X2 = { 600, 900 }; + int[] Y2 = { 205800 }; + double[] SX = { 0.0, 0.03, 0.3 }; + double[] SY = { 0.0, 0.03, 0.3, 0.9 }; + + for (int x1 = 0; x1 < X1.length; x1++) { + for (int x2 = 0; x2 < X2.length; x2++) { + int y1 = x2; + for (int y2 = 0; y2 < Y2.length; y2++) { + for (int sx = 0; sx < SX.length; sx++) { + for (int sy = 0; sy < SY.length; sy++) { + assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]); + } + } + } + } + } + } + + @Test + public void commonCaseDLMatrixMatrixTest2() { + String scriptStr = "O = X %*% Y"; + + int[] X1 = { 64 }; + int[] X2 = { 196608 }; + int[] Y2 = { 512 }; double[] SX = { 0.0, 0.03, 0.3, 0.9 }; double[] SY = { 0.0, 0.03, 0.3, 0.9 }; @@ -74,9 +146,9 @@ public class MatrixMultiplicationOpTest extends GPUTests { public void matrixMatrixTest2() { String scriptStr = "O = X %*% t(Y)"; - int[] X1 = { 1, 128, 513, 1024 }; - int[] X2 = { 128, 512, 1024 }; - int[] Y1 = { 1, 128, 513, 1024 }; + int[] X1 = { 1, 128, 1024 }; + int[] X2 = { 1, 128, 1024 }; + int[] Y1 = { 1, 128, 1024 }; double[] SX = { 0.0, 0.03, 0.3, 0.9 }; double[] SY = { 0.0, 0.03, 0.3, 0.9 }; @@ -98,9 +170,9 @@ public class MatrixMultiplicationOpTest extends GPUTests { public void matrixMatrixTest3() { String scriptStr = "O = t(X) %*% Y"; - int[] X1 = { 1, 128, 513, 1024 }; - int[] X2 = { 128, 512, 1024 }; - int[] Y2 = { 1, 128, 513, 1024 }; + int[] X1 = { 1, 128, 1024 }; + int[] X2 = { 1, 128, 1024 }; + 
int[] Y2 = { 1, 128, 1024 }; double[] SX = { 0.0, 0.03, 0.3, 0.9 }; double[] SY = { 0.0, 0.03, 0.3, 0.9 }; @@ -122,9 +194,9 @@ public class MatrixMultiplicationOpTest extends GPUTests { public void matrixMatrixTest4() { String scriptStr = "O = t(X) %*% t(Y)"; - int[] X1 = { 1, 128, 513, 1024 }; - int[] X2 = { 128, 512, 1024 }; - int[] Y1 = { 1, 128, 513, 1024 }; + int[] X1 = { 1, 128, 1024 }; + int[] X2 = { 1, 128, 1024 }; + int[] Y1 = { 1, 128, 1024 }; double[] SX = { 0.0, 0.03, 0.3, 0.9 }; double[] SY = { 0.0, 0.03, 0.3, 0.9 }; @@ -146,7 +218,7 @@ public class MatrixMultiplicationOpTest extends GPUTests { public void transposeSelfMatrixMultiply() { String scriptStr = "O = t(X) %*% X"; - int[] sizes = { 1, 128, 512, 1024, 2049 }; + int[] sizes = { 1, 128, 1024 }; double[] sparsities = { 0.0, 0.03, 0.3, 0.9 }; for (int i = 0; i < sizes.length; i++) {