http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 217acd6..d02a875 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -28,6 +28,7 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; @@ -333,11 +334,11 @@ public class LibMatrixCUDA { Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); long t1=0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols)); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); } @@ -367,7 +368,7 @@ public class LibMatrixCUDA { Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType); reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols); reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW)); - gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tmp, gCtx.EAGER_CUDA_FREE); } /** @@ -402,11 +403,11 @@ public class LibMatrixCUDA { Pointer biasPointer = bias.getGPUObject(gCtx).getDensePointer(); Pointer outputPointer = outputBlock.getGPUObject(gCtx).getDensePointer(); long t1 = 0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_multiply", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ)); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -454,11 +455,11 @@ public class LibMatrixCUDA { } int PQ = cols / k; long t1 = 0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_add", ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), image, bias, output, rows, cols, PQ); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -525,13 +526,13 @@ public class LibMatrixCUDA { long t0=0, t1=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0); - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); copyUpperToLowerTriangle(gCtx, instName, output); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1); } /** @@ -740,7 +741,7 @@ public class LibMatrixCUDA { default: throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared"); } - gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tmp, gCtx.EAGER_CUDA_FREE); break; } case OP_MEAN:{ @@ -853,7 +854,7 @@ public class LibMatrixCUDA { ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp); - gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tmpRow, gCtx.EAGER_CUDA_FREE); break; } @@ -871,15 +872,15 @@ public class LibMatrixCUDA { ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp); - gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tmpCol, gCtx.EAGER_CUDA_FREE); break; } default: throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance"); } - gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE); - gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tmp, gCtx.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tmp2, gCtx.EAGER_CUDA_FREE); break; } case OP_MAXINDEX : { @@ -938,24 +939,24 @@ public class LibMatrixCUDA { long t1=0,t2=0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n); //cudaDeviceSynchronize; - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1); int s = blocks; while (s > 1) { tmp = getKernelParamsForReduceAll(gCtx, s); blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2]; - if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t2 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2); s = (s + (threads*2-1)) / (threads*2); } double[] result = {-1f}; cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false); - gCtx.cudaFreeHelper(instName, tempOut, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tempOut, gCtx.EAGER_CUDA_FREE); return result[0]; } @@ -978,11 +979,11 @@ public class LibMatrixCUDA { int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); //cudaDeviceSynchronize; - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0); } @@ -1005,11 +1006,11 @@ public class LibMatrixCUDA { int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); //cudaDeviceSynchronize; - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0); } /** @@ -1329,11 +1330,11 @@ public class LibMatrixCUDA { int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0; int size = rlenA * clenA; long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("matrix_scalar_op", ExecutionConfig.getConfigForSimpleVectorOperations(size), a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0); } /** @@ -1431,11 +1432,11 @@ public class LibMatrixCUDA { LOG.trace("GPU : matrix_matrix_cellwise_op" + ", GPUContext=" + gCtx); } long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn)); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0); } /** @@ -1525,11 +1526,11 @@ public class LibMatrixCUDA { int rlen = toInt(out.getNumRows()); int clen = toInt(out.getNumColumns()); long t0 = 0; - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); int size = rlen * clen; getCudaKernels(gCtx).launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size), A, constant, size); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0); } } @@ -1544,10 +1545,10 @@ public class LibMatrixCUDA { */ public static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) { long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); int size = rlen * clen * sizeOfDataType; cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0); } /** @@ -1629,19 +1630,19 @@ public class LibMatrixCUDA { // Invoke cuSparse when either are in sparse format // Perform sparse-sparse dgeam if (!isInSparseFormat(gCtx, in1)) { - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); in1.getGPUObject(gCtx).denseToSparse(); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0); } CSRPointer A = in1.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); if (!isInSparseFormat(gCtx, in2)) { - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); in2.getGPUObject(gCtx).denseToSparse(); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0); } @@ -1664,21 +1665,21 @@ public class LibMatrixCUDA { "Transpose in cusparseDcsrgeam not supported for sparse matrices on GPU"); } - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); CSRPointer C = CSRPointer.allocateForDgeam(gCtx, getCusparseHandle(gCtx), A, B, m, n); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1); out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); //long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows()); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr, B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd); //cudaDeviceSynchronize; - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0); } @@ -1705,9 +1706,9 @@ public class LibMatrixCUDA { getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix Pointer C = getDensePointer(gCtx, out, instName); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0); } } @@ -1736,9 +1737,9 @@ public class LibMatrixCUDA { int m = toInt(numRowsA); int n = lda; int ldc = m; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_T, CUBLAS_OP_T, m, n, one(), A, lda, zero(), A, ldb, C, ldc); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0); } @@ -1846,7 +1847,7 @@ public class LibMatrixCUDA { */ protected static void sliceDenseDense(GPUContext gCtx, String instName, Pointer inPointer, Pointer outPointer, int rl, int ru, int cl, int cu, int inClen) { - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; long retClen = cu - cl + 1; if (inClen == retClen) { cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * sizeOfDataType), (ru - rl + 1) * inClen @@ -1856,7 +1857,7 @@ public class LibMatrixCUDA { getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)), inPointer, outPointer, rl, ru, cl, cu, inClen, retRlen, retClen); } - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0); } /** @@ -1879,7 +1880,7 @@ public class LibMatrixCUDA { if(size == 0) return; int retRlen = ru - rl + 1; - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; int retClen = cu - cl + 1; String kernel = null; String timer = null; @@ -1901,7 +1902,7 @@ public class LibMatrixCUDA { // We can generalize this later to output sparse matrix. getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu, retClen); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, timer, System.nanoTime() - t0); } /** @@ -1946,11 +1947,11 @@ public class LibMatrixCUDA { int maxRows = toInt(Math.max(rowsA, rowsB)); int maxCols = toInt(Math.max(colsA, colsB)); - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx) .launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1); } @@ -1981,11 +1982,11 @@ public class LibMatrixCUDA { int maxRows = Math.max(rowsA, rowsB); int maxCols = Math.max(colsA, colsB); - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx) .launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, rowsA, colsA, rowsB, colsB); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1); } @@ -2315,10 +2316,10 @@ public class LibMatrixCUDA { Pointer output = getDensePointer(gCtx, out, instName); Pointer input = getDensePointer(gCtx, in1, instName); int size = toInt(in1.getNumColumns() * in1.getNumRows()); - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernel, ExecutionConfig.getConfigForSimpleVectorOperations(size), input, output, size); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, kernelTimer, System.nanoTime() - t1); } } @@ -2356,13 +2357,13 @@ public class LibMatrixCUDA { // becomes // C <- A // C <- alpha*B + C - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); cudaMemcpy(C, A, n*((long)sizeOfDataType), cudaMemcpyDeviceToDevice); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1); - if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t2 = System.nanoTime(); cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2); } else { if(LOG.isTraceEnabled()) { @@ -2372,12 +2373,12 @@ public class LibMatrixCUDA { // Matrix-Vector daxpy // Note: Vector-Matrix operation is not supported // daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); int rlenA = toInt(in1.getNumRows()); int clenA = toInt(in1.getNumColumns()); int rlenB = toInt(in2.getNumRows()); int clenB = toInt(in2.getNumColumns()); getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), A, B, constant, C, rlenA, clenA, rlenB, clenB); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1); } } @@ -2424,20 +2425,20 @@ public class LibMatrixCUDA { // convert dense matrices to row major // Operation in cuSolver and cuBlas are for column major dense matrices // and are destructive to the original input - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); GPUObject ATobj = (GPUObject) Aobj.clone(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); ATobj.denseRowMajorToColumnMajor(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); Pointer A = ATobj.getDensePointer(); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); GPUObject bTobj = (GPUObject) bobj.clone(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); bTobj.denseRowMajorToColumnMajor(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); Pointer b = bTobj.getDensePointer(); @@ -2446,18 +2447,18 @@ public class LibMatrixCUDA { // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1 // step 3: query working space of geqrf and ormqr - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); int[] lwork = {0}; cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0); // step 4: compute QR factorization Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType); Pointer tau = gCtx.allocate(instName, m * sizeOfDataType); Pointer devInfo = gCtx.allocate(instName, Sizeof.INT); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0); int[] qrError = {-1}; cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); @@ -2466,34 +2467,34 @@ public class LibMatrixCUDA { } // step 5: compute Q^T*B - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0); cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); if (qrError[0] != 0) { throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); } // step 6: compute x = R \ Q^T*B - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, n, 1, dataTypePointerTo(1.0), A, m, b, m); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); bTobj.denseColumnMajorToRowMajor(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0); // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash // There is an avoidable copy happening here MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1); cudaMemcpy(out.getGPUObject(gCtx).getDensePointer(), bTobj.getDensePointer(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice); - gCtx.cudaFreeHelper(instName, work, DMLScript.EAGER_CUDA_FREE); - gCtx.cudaFreeHelper(instName, tau, DMLScript.EAGER_CUDA_FREE); - ATobj.clearData(instName, DMLScript.EAGER_CUDA_FREE); - bTobj.clearData(instName, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, work, gCtx.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, tau, gCtx.EAGER_CUDA_FREE); + ATobj.clearData(instName, gCtx.EAGER_CUDA_FREE); + bTobj.clearData(instName, gCtx.EAGER_CUDA_FREE); //debugPrintMatrix(b, n, 1); } @@ -2514,10 +2515,10 @@ public class LibMatrixCUDA { */ public static MatrixObject getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, String name, long numRows, long numCols) { long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); Pair<MatrixObject, Boolean> mb = ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols); if (mb.getValue()) - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); } @@ -2535,10 +2536,10 @@ public class LibMatrixCUDA { */ private static MatrixObject getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long numCols, long nnz, String instName, String name) { long t0=0; - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); Pair<MatrixObject, Boolean> mb = ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz); if (mb.getValue()) - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0); return mb.getKey(); }
http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java index c6abbfe..d3b5984 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java @@ -49,7 +49,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; @@ -136,7 +136,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { private static Pointer denseIm2col(GPUContext gCtx, String instName, MatrixObject image, boolean isSparseImage, long N, long C, long H, long W, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) { Pointer im2colPointer = null; - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; if(isSparseImage) { CSRPointer inPointer = getSparsePointer(gCtx, image, instName); if(inPointer.nnz < 0) { @@ -147,7 +147,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { getCudaKernels(gCtx).launchKernel("sparse_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(inPointer.nnz)), inPointer.val, inPointer.rowPtr, inPointer.colInd, im2colPointer, inPointer.nnz, N, C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_IM2COL_KERNEL, System.nanoTime() - t1); } else @@ -159,7 +159,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { getCudaKernels(gCtx).launchKernel("dense_dense_im2col", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*C*H*W)), imagePointer, im2colPointer, N*C*H*W, C*H*W, H*W, W, R, S, P, Q, P*Q, R*S, N*P*Q, stride_h, stride_w, pad_h, pad_w); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_IM2COL_KERNEL, System.nanoTime() - t1); } return im2colPointer; @@ -220,16 +220,16 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType); LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false); - gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, im2colPointer, gCtx.EAGER_CUDA_FREE); // Perform reorg_knpq a reorg operation of matmultOutputPointer matrix with dimensions [K, NPQ] // and return a matrix dstPointer with dimensions [N, KPQ] - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; getCudaKernels(gCtx).launchKernel("reorg_knpq", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(NKPQ)), matmultOutputPointer, dstPointer, NKPQ, NPQ, KPQ, P*Q); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_REORG_KNPQ_KERNEL, System.nanoTime() - t1); - gCtx.cudaFreeHelper(instName, matmultOutputPointer, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, matmultOutputPointer, gCtx.EAGER_CUDA_FREE); } else { // Filter and output are accounted as dense in the memory estimation for conv2d @@ -357,13 +357,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { } try { long t1 = 0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(), algo.nchwTensorDesc, image, algo.filterDesc, filter, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nkpqTensorDesc, output); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t1); if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + cudnnStatus.stringFor(status)); @@ -438,9 +438,9 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { // Perform one-input conv2dBackwardFilter Pointer tempdwPointer = gCtx.allocate(instName, KCRS*sizeOfDataType); for(int n = 0; n < N; n++) { - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType); - if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); + if(ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); // Perform one-input conv2dBackwardFilter cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, algo); getCudaKernels(gCtx).launchKernel("inplace_add", @@ -475,10 +475,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx); } try { - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), algo.nchwTensorDesc, imagePointer, algo.nkpqTensorDesc, doutPointer, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.filterDesc, dwPointer); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t1); if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -578,10 +578,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx); } try { - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), algo.filterDesc, w, algo.nkpqTensorDesc, dy, algo.convDesc, algo.algo, algo.workSpace, algo.sizeInBytes, zero(), algo.nchwTensorDesc, dx); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t1); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -653,11 +653,11 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { LibMatrixCuDNNPoolingDescriptors.cudnnPoolingDescriptors(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, poolingType)) { long t1=0,t2=0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); - if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) t2 = System.nanoTime(); int status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } @@ -752,20 +752,20 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { long t1=0, t2=0, t3=0; int status; if(!isMaxPoolOutputProvided) { - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); long numBytes = N*C*P*Q*sizeOfDataType; y = gCtx.allocate(instName, numBytes); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); - if (DMLScript.FINEGRAINED_STATISTICS) t2 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) t2 = System.nanoTime(); status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } } - if (DMLScript.FINEGRAINED_STATISTICS) t3 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t3 = System.nanoTime(); status = cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), desc.dxDesc, dx); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); @@ -775,10 +775,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { } finally { long t4=0; - if (DMLScript.FINEGRAINED_STATISTICS) t4 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t4 = System.nanoTime(); if(!isMaxPoolOutputProvided) - gCtx.cudaFreeHelper(instName, y, DMLScript.EAGER_CUDA_FREE); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4); + gCtx.cudaFreeHelper(instName, y, gCtx.EAGER_CUDA_FREE); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4); } } @@ -795,18 +795,18 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { cudnnCreateActivationDescriptor(activationDescriptor); double dummy = -1; cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy); - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor, one(), srcTensorDesc, srcData, zero(), dstTensorDesc, dstData); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0); } catch (CudaException e) { throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e); } finally { long t1=0; - if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime(); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1); + if (ConfigurationManager.isFinegrainedStatistics()) t1 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1); } } @@ -831,11 +831,11 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { } // Invokes relu(double* A, double* ret, int rlen, int clen) Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu - if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("relu", ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)), srcData, dstData, toInt(N), toInt(CHW)); - if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0); } else { cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor(); @@ -910,13 +910,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { } if(return_sequences) { - gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, hyPointer, gCtx.EAGER_CUDA_FREE); Pointer sysmlYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M); LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output", ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M), sysmlYPointer, cudnnYPointer, N, T, M, N*T*M); } - gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, cudnnYPointer, gCtx.EAGER_CUDA_FREE); } public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName, @@ -966,7 +966,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { // ---------------------- algo.workSpace, algo.sizeInBytes, algo.reserveSpace, algo.reserveSpaceSizeInBytes); - gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, dy, gCtx.EAGER_CUDA_FREE); ec.releaseMatrixInputForGPUInstruction(dcyName); ec.releaseMatrixOutputForGPUInstruction(dhxName); ec.releaseMatrixOutputForGPUInstruction(dcxName); @@ -976,7 +976,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D), smlDx, cudnnDx, N, D, T*D, N*T*D); ec.releaseMatrixOutputForGPUInstruction(dxName); - gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, cudnnDx, gCtx.EAGER_CUDA_FREE); // ------------------------------------------------------------------------------------------- Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType); @@ -991,12 +991,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)), getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M); - gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, cudnnDwPointer, gCtx.EAGER_CUDA_FREE); ec.releaseMatrixOutputForGPUInstruction(dwName); ec.releaseMatrixOutputForGPUInstruction(dbName); // ------------------------------------------------------------------------------------------- - gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, yPointer, gCtx.EAGER_CUDA_FREE); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java index f70b453..dbad80c 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java @@ -19,7 +19,7 @@ package org.apache.sysml.runtime.matrix.data; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; @@ -86,7 +86,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab @Override public void close() { long t3 = 0; - if (DMLScript.FINEGRAINED_STATISTICS) t3 = System.nanoTime(); + if (ConfigurationManager.isFinegrainedStatistics()) t3 = System.nanoTime(); if(nchwTensorDesc != null) cudnnDestroyTensorDescriptor(nchwTensorDesc); if(nkpqTensorDesc != null) @@ -97,12 +97,12 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab cudnnDestroyConvolutionDescriptor(convDesc); if(sizeInBytes != 0) { try { - gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, workSpace, gCtx.EAGER_CUDA_FREE); } catch (DMLRuntimeException e) { throw new RuntimeException(e); } } - if(DMLScript.FINEGRAINED_STATISTICS) + if(ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } @@ -130,7 +130,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionForwardAlgorithm( GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) { - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); int[] algos = {-1}; @@ -144,7 +144,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; } @@ -173,7 +173,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab public static LibMatrixCuDNNConvolutionAlgorithm cudnnGetConvolutionBackwardFilterAlgorithm( GPUContext gCtx, String instName, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q, long workspaceLimit) { - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); @@ -190,7 +190,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); return ret; } @@ -229,7 +229,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; } else { - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; int[] algos = {-1}; long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)}; jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm( @@ -242,7 +242,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab ret.workSpace = gCtx.allocate(instName, sizeInBytesArray[0]); ret.sizeInBytes = sizeInBytesArray[0]; ret.algo = algos[0]; - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); } return ret; http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java index 0130aa6..81a703d 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java @@ -21,7 +21,7 @@ package org.apache.sysml.runtime.matrix.data; import static jcuda.runtime.JCuda.cudaMemset; import jcuda.Pointer; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; @@ -59,10 +59,10 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java public Pointer getNthRow(int n) { if(isInputInSparseFormat) { jcuda.runtime.JCuda.cudaDeviceSynchronize(); - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; cudaMemset(outPointer, 0, numColumns*sizeOfDataType); jcuda.runtime.JCuda.cudaDeviceSynchronize(); - if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); + if(ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0); LibMatrixCUDA.sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), numColumns); } else { http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java index 8ebc4e0..7b2c601 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java @@ -32,7 +32,6 @@ import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT; import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL; import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD; -import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; @@ -301,7 +300,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { } if(sizeInBytes != 0) { try { - gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, workSpace, gCtx.EAGER_CUDA_FREE); } catch (DMLRuntimeException e) { throw new RuntimeException(e); } @@ -309,7 +308,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { workSpace = null; if(reserveSpaceSizeInBytes != 0) { try { - gCtx.cudaFreeHelper(instName, reserveSpace, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, reserveSpace, gCtx.EAGER_CUDA_FREE); } catch (DMLRuntimeException e) { throw new RuntimeException(e); } @@ -317,7 +316,7 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable { reserveSpace = null; if(dropOutSizeInBytes != 0) { try { - gCtx.cudaFreeHelper(instName, dropOutStateSpace, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, dropOutStateSpace, gCtx.EAGER_CUDA_FREE); } catch (DMLRuntimeException e) { throw new RuntimeException(e); } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java index 18739a8..9833456 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java @@ -26,7 +26,7 @@ import jcuda.Pointer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; @@ -161,19 +161,19 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { // and output CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb, params.m, params.n, params.k); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0); // Step 3: Invoke the kernel - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr, (int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd); - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1); output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); @@ -279,14 +279,14 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params); if (outRLen != 1 && outCLen != 1) { // Transpose: C = t(output) - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T, toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(), toInt(outRLen), C, toInt(outCLen)); - if (!DMLScript.EAGER_CUDA_FREE) + if (!gCtx.EAGER_CUDA_FREE) JCuda.cudaDeviceSynchronize(); - gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE); - if (DMLScript.FINEGRAINED_STATISTICS) + gCtx.cudaFreeHelper(instName, output, gCtx.EAGER_CUDA_FREE); + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0); } @@ -312,7 +312,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { */ private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B, CuMatMultParameters param) { - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; String kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB; // Ignoring sparse vector dense matrix multiplication and dot product boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed) @@ -336,7 +336,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc); } - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0); } @@ -361,7 +361,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { */ private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B, CuMatMultParameters param) { - long t0 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; String kernel = null; param.rowToColumnMajor(); param.validate(); @@ -403,7 +403,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA { zero(), C, param.ldc); kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB; } - if (DMLScript.FINEGRAINED_STATISTICS) + if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0); } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index 0c6f41a..4569dbe 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -28,7 +28,7 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.functionobjects.KahanPlus; @@ -91,7 +91,7 @@ public class LibMatrixDNN { static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0); public static void appendStatistics(StringBuilder sb) { - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" + conv2dDenseCount.get() + "/" + conv2dBwdFilterDenseCount.get() + "/" @@ -238,7 +238,7 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Incorrect dout dimensions in pooling_backward:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q); } - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { boolean isSparse = (poolType == PoolingType.MAX) ? (input.isInSparseFormat() || dout.isInSparseFormat()) : dout.isInSparseFormat(); if(isSparse) maxPoolBwdSparseCount.addAndGet(1); @@ -780,7 +780,7 @@ public class LibMatrixDNN { if(params.stride_h <= 0 || params.stride_w <= 0) throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { if(filter.isInSparseFormat() || dout.isInSparseFormat()) { conv2dBwdDataSparseCount.addAndGet(1); } @@ -805,7 +805,7 @@ public class LibMatrixDNN { if(params.stride_h <= 0 || params.stride_w <= 0) throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { if(input.isInSparseFormat() || dout.isInSparseFormat()) { conv2dBwdFilterSparseCount.addAndGet(1); } @@ -831,7 +831,7 @@ public class LibMatrixDNN { if(params.stride_h <= 0 || params.stride_w <= 0) throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { if(input.isInSparseFormat() || filter.isInSparseFormat()) { conv2dSparseCount.addAndGet(1); } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java index b938a0a..982949f 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java @@ -22,7 +22,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.concurrent.Callable; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180.Rotate180Worker; @@ -172,16 +172,16 @@ public class LibMatrixDNNConv2d MatrixBlock outMM = new MatrixBlock(K, PQ, _params.output.sparse); long time1 = 0; long time2 = 0; for(int n = _rl; n < _ru; n++) { - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; LibMatrixDNNIm2Col.im2col(_params.input1, outIm2col, n, _params, false); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t2 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; // filter %*% _im2ColOutBlock => matMultOutBlock outMM.reset(outMM.rlen, outMM.clen, _params.output.sparse); LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, false, true, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t3 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { time1 += t2 - t1; time2 += t3 - t2; } @@ -195,7 +195,7 @@ public class LibMatrixDNNConv2d _params.bias.getDenseBlockValues(), K, PQ); } - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); } @@ -416,20 +416,20 @@ public class LibMatrixDNNConv2d // rotate180(dout[n,]) => dout_reshaped rotate180Worker.execute(n, 0); // dout_reshaped %*% filter => temp - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; outMM.reset(PQ, CRS, false); LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, !outRotate.sparse, false, _params); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t2 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; // col2im(temp) => output[n,] LibMatrixDNNIm2Col.col2imOverSingleImage(n, outMM, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t3 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { time1 += t2 - t1; time2 += t3 - t2; } } - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1); LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2); } @@ -512,24 +512,24 @@ public class LibMatrixDNNConv2d rotate180Worker.execute(n, 0); // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; LibMatrixDNNIm2Col.im2col(_params.input1, im2ColOutBlock, n, _params, false); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t2 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; outMM.reset(CRS, K, false); LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, !im2ColOutBlock.sparse, !outRotate.sparse, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t3 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; if( !outMM.isEmptyBlock() ) //accumulate row results LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { time1 += t2 - t1; time2 += t3 - t2; } } inplaceTransAdd(partRet, _params); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); } @@ -562,27 +562,27 @@ public class LibMatrixDNNConv2d rotate180Worker.execute(n, 0); // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t1 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; LibMatrixDNNIm2Col.im2col(_params.input1, im2ColOutBlock, n, _params, true); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t2 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; outMM.reset(K, CRS, false); //Timing time = new Timing(true); LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, outMM, !outRotate.sparse, !im2ColOutBlock.sparse, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + long t3 = ConfigurationManager.isFinegrainedStatistics() ? System.nanoTime() : 0; if( !outMM.isEmptyBlock() ) //accumulate row results LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { time1 += t2 - t1; time2 += t3 - t2; } } //no need to transpose because t(t(out)) cancel out inplaceAdd(partRet, _params); - if(DMLScript.FINEGRAINED_STATISTICS) { + if(ConfigurationManager.isFinegrainedStatistics()) { LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java index aa5ba86..2cb64c2 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java @@ -26,7 +26,6 @@ import java.util.stream.IntStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.OptimizerUtils; @@ -87,7 +86,7 @@ public class LibMatrixNative { ret.sparse = false; ret.allocateDenseBlock(); - long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; boolean rccode = false; if( isSinglePrecision() ) { FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true); @@ -102,7 +101,7 @@ public class LibMatrixNative ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); } if (rccode) { - if(DMLScript.STATISTICS) { + if(ConfigurationManager.isStatistics()) { Statistics.nativeLibMatrixMultTime += System.nanoTime() - start; Statistics.numNativeLibMatrixMultCalls.increment(); } @@ -158,7 +157,7 @@ public class LibMatrixNative params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) { setNumThreads(params); - long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; int nnz = 0; if(params.bias == null) { nnz = NativeHelper.conv2dDense(input.getDenseBlockValues(), filter.getDenseBlockValues(), @@ -195,7 +194,7 @@ public class LibMatrixNative } //post processing and error handling if(nnz != -1) { - if(DMLScript.STATISTICS) { + if(ConfigurationManager.isStatistics()) { Statistics.nativeConv2dTime += System.nanoTime() - start; Statistics.numNativeConv2dCalls.increment(); } @@ -234,13 +233,13 @@ public class LibMatrixNative params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) { setNumThreads(params); - long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; int nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(), outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, params.P, params.Q, params.numThreads); if(nnz != -1) { - if(DMLScript.STATISTICS) { + if(ConfigurationManager.isStatistics()) { Statistics.nativeConv2dBwdFilterTime += System.nanoTime() - start; Statistics.numNativeConv2dBwdFilterCalls.increment(); } @@ -270,13 +269,13 @@ public class LibMatrixNative params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) { setNumThreads(params); - long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; int nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(), outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, params.P, params.Q, params.numThreads); if(nnz != -1) { - if(DMLScript.STATISTICS) { + if(ConfigurationManager.isStatistics()) { Statistics.nativeConv2dBwdDataTime += System.nanoTime() - start; Statistics.numNativeConv2dBwdDataCalls.increment(); } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java index 942b56b..044e943 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java @@ -29,7 +29,7 @@ import java.util.stream.IntStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.utils.GPUStatistics; @@ -168,7 +168,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions @Override public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) { - long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; // We invoke transfer matrix from device to host in two cases: // 1. During eviction of unlocked matrices // 2. During acquireHostRead @@ -182,7 +182,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions Pointer deviceDoubleData = gCtx.allocate(instName, ((long)dest.length)*Sizeof.DOUBLE); LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length); cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost); - gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, deviceDoubleData, gCtx.EAGER_CUDA_FREE); } else { LOG.debug("Potential OOM: Allocated additional space on host in deviceToHost"); @@ -190,11 +190,11 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost); LibMatrixNative.fromFloatBuffer(floatData, dest); } - if(DMLScript.STATISTICS) { + if(ConfigurationManager.isStatistics()) { long totalTime = System.nanoTime() - t0; GPUStatistics.cudaFloat2DoubleTime.add(totalTime); GPUStatistics.cudaFloat2DoubleCount.add(1); - if(DMLScript.FINEGRAINED_STATISTICS && instName != null) + if(ConfigurationManager.isFinegrainedStatistics() && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, totalTime); } } @@ -203,12 +203,12 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) { LOG.debug("Potential OOM: Allocated additional space in hostToDevice"); // TODO: Perform conversion on GPU using double2float and float2double kernels - long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; + long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; if(PERFORM_CONVERSION_ON_DEVICE) { Pointer deviceDoubleData = gCtx.allocate(instName, ((long)src.length)*Sizeof.DOUBLE); cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice); LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length); - gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, deviceDoubleData, gCtx.EAGER_CUDA_FREE); } else { FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*src.length).order(ByteOrder.nativeOrder()).asFloatBuffer(); @@ -216,11 +216,11 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice); } - if(DMLScript.STATISTICS) { + if(ConfigurationManager.isStatistics()) { long totalTime = System.nanoTime() - t0; GPUStatistics.cudaDouble2FloatTime.add(totalTime); GPUStatistics.cudaDouble2FloatCount.add(1); - if(DMLScript.FINEGRAINED_STATISTICS && instName != null) + if(ConfigurationManager.isFinegrainedStatistics() && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, totalTime); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/runtime/util/ProgramConverter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/ProgramConverter.java b/src/main/java/org/apache/sysml/runtime/util/ProgramConverter.java index 21e6bd3..8289c30 100644 --- a/src/main/java/org/apache/sysml/runtime/util/ProgramConverter.java +++ b/src/main/java/org/apache/sysml/runtime/util/ProgramConverter.java @@ -688,7 +688,7 @@ public class ProgramConverter builder.append(NEWLINE); //handle additional configurations - builder.append(CONF_STATS + "=" + DMLScript.STATISTICS); + builder.append(CONF_STATS + "=" + ConfigurationManager.isStatistics()); builder.append(COMPONENTS_DELIM); builder.append(NEWLINE); @@ -753,7 +753,7 @@ public class ProgramConverter sb.append( NEWLINE ); //handle additional configurations - sb.append( CONF_STATS + "=" + DMLScript.STATISTICS ); + sb.append( CONF_STATS + "=" + ConfigurationManager.isStatistics() ); sb.append( COMPONENTS_DELIM ); sb.append( NEWLINE ); @@ -1727,7 +1727,7 @@ public class ProgramConverter private static void parseAndSetAdditionalConfigurations(String conf) { String[] statsFlag = conf.split("="); - DMLScript.STATISTICS = Boolean.parseBoolean(statsFlag[1]); + ConfigurationManager.setStatistics(Boolean.parseBoolean(statsFlag[1])); } ////////// http://git-wip-us.apache.org/repos/asf/systemml/blob/ae268a9e/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java index fcbc4c4..e748057 100644 --- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -28,7 +28,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.LongAdder; -import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.ConfigurationManager; /** * Measures performance numbers when GPU mode is enabled @@ -155,7 +155,7 @@ public class GPUStatistics { */ public synchronized static void maintainCPMiscTimes( String instructionName, String miscTimer, long timeNanos, long incrementCount) { - if (!(DMLScript.FINEGRAINED_STATISTICS)) + if (!(ConfigurationManager.isFinegrainedStatistics())) return; HashMap<String, Long> miscTimesMap = _cpInstMiscTime.get(instructionName);