[SYSTEMML-540] Refactored LibMatrixDNN to reduce branch mispredictions and instruction cache misses - Bugfix for empty filter in conv2d_bias_add - Improved performance of sparse maxpooling.
Closes #520. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/19eed8f3 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/19eed8f3 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/19eed8f3 Branch: refs/heads/master Commit: 19eed8f3858d7daad1c549b548b7de4ff270def8 Parents: 28c92b9 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Mon May 29 15:21:22 2017 -0800 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Mon May 29 16:21:22 2017 -0700 ---------------------------------------------------------------------- docs/python-reference.md | 8 +- scripts/nn/test/compare_backends/compare.dml | 5 +- .../cp/ConvolutionCPInstruction.java | 20 +- .../matrix/data/ConvolutionParameters.java | 3 +- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 995 ++----------------- .../LibMatrixDNNConv2dBackwardDataHelper.java | 112 +++ .../LibMatrixDNNConv2dBackwardFilterHelper.java | 138 +++ .../matrix/data/LibMatrixDNNConv2dHelper.java | 224 +++++ .../runtime/matrix/data/LibMatrixDNNHelper.java | 541 ++++++++++ .../matrix/data/LibMatrixDNNIm2ColHelper.java | 386 +++++++ .../data/LibMatrixDNNPoolingBackwardHelper.java | 212 ++++ .../matrix/data/LibMatrixDNNPoolingHelper.java | 143 +++ .../data/LibMatrixDNNRotate180Helper.java | 107 ++ 13 files changed, 1951 insertions(+), 943 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/docs/python-reference.md ---------------------------------------------------------------------- diff --git a/docs/python-reference.md b/docs/python-reference.md index 2ebfc38..a847964 100644 --- a/docs/python-reference.md +++ b/docs/python-reference.md @@ -189,14 +189,10 @@ method as DataFrame or NumPy array. 
### Support for NumPy's universal functions -The matrix class also supports most of NumPy's universal functions (i.e. ufuncs). -The current version of NumPy explicitly disables overriding ufunc, but this should be enabled in next release. -Until then to test above code, please use: +The matrix class also supports most of NumPy's universal functions (i.e. ufuncs): ```bash -git clone https://github.com/niketanpansare/numpy.git -cd numpy -python setup.py install +pip install --ignore-installed 'numpy>=1.13.0rc2' ``` This will enable NumPy's functions to invoke matrix class: http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/scripts/nn/test/compare_backends/compare.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/compare_backends/compare.dml b/scripts/nn/test/compare_backends/compare.dml index f87c472..7205631 100644 --- a/scripts/nn/test/compare_backends/compare.dml +++ b/scripts/nn/test/compare_backends/compare.dml @@ -22,7 +22,10 @@ X = read($1) Y = read($2) msg = ifdef($3, " ") -eps = 1e-3 +eps = 1e-6 +# Normalize X and Y +X = X / max(X) +Y = Y / max(Y) num_mismatch = sum(abs(X - Y) > eps) if(num_mismatch > 0) { print("---------------------------------------------------\nERROR: >>>>>>>>> The results don't match(num_mismatch:" + num_mismatch + "): " + msg + "\n---------------------------------------------------") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 1331d64..840b39e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -218,13 +218,11 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction .getLongValue(); } - @SuppressWarnings("unused") public void processReluBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException { // (X > 0) * dout MatrixBlock input = ec.getMatrixInput(input1.getName()); MatrixBlock dout = ec.getMatrixInput(_in2.getName()); - MatrixBlock outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), - LibMatrixDNN.SUPPORTS_SPARSE_OUTPUTS && (input.isInSparseFormat() || dout.isInSparseFormat())); + MatrixBlock outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), (input.isInSparseFormat() || dout.isInSparseFormat())); if( !input.isEmpty() && !dout.isEmpty() ) { outputBlock.allocateDenseOrSparseBlock(); @@ -383,12 +381,26 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) { MatrixBlock filter = ec.getMatrixInput(_in3.getName()); MatrixBlock bias = ec.getMatrixInput(_in2.getName()); - if((filter.isEmpty() || matBlock.isEmpty()) && bias.isEmpty()) { + if(bias.getNumRows() != params.K || bias.getNumColumns() != 1) { + throw new DMLRuntimeException("Incorrect shape of bias matrix: [" + bias.getNumRows() + " " + bias.getNumColumns() + "]. 
" + + "Expected: [" + params.K + ", 1]"); + } + boolean isOutputConvEmpty = filter.isEmpty() || matBlock.isEmpty(); + if(isOutputConvEmpty && bias.isEmpty()) { + // bias_add(empty mb, empty mb) = empty mb outputBlock = new MatrixBlock(N, K*P*Q, true); } + else if(isOutputConvEmpty && !bias.isEmpty()) { + // Add bias to empty output block + // bias_add(empty mb, bias) + outputBlock = getDenseOutputBlock(N, K*P*Q); + for(int n = 0; n < params.N; n++) + ConvolutionUtils.fillBias(bias, outputBlock.getDenseBlock(), n, n+1, params.N, params.K, params.P*params.Q); + } else { outputBlock = getDenseOutputBlock(N, K*P*Q); if(!bias.isEmpty()) { + // Handle situation where both input and filter are non empty, but bias is empty params.bias = bias; } if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat()) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java index 11e74ca..a24a736 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java @@ -58,7 +58,8 @@ public class ConvolutionParameters implements Serializable { } public String toString() { - return "(" + N + " " + C + " " + H + " " + W + " " + K + " " + R + " " + S + ")"; + return "(NCHW=[" + N + " " + C + " " + H + " " + W + "], KCRS=[" + K + " " + R + " " + S + "], stride=[" + stride_h + "," + stride_w + + "], pad=[" + pad_h + "," + pad_w + "])"; } public ConvolutionParameters(long N, long C, long H, long W, http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index ab82697..30b8b64 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -19,10 +19,8 @@ package org.apache.sysml.runtime.matrix.data; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -36,10 +34,9 @@ import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; -import org.apache.sysml.utils.NativeHelper; import org.apache.sysml.utils.Statistics; -/** +/* * This class allows users to invoke deep learning related operations * (such as conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, maxpooling_backward, bias_add) * using multiple threads. @@ -47,23 +44,32 @@ import org.apache.sysml.utils.Statistics; * The methods accept the input matrices as MatrixBlock and the parameters using ConvolutionParameters. * * To run in single thread, please set ConvolutionParameters.numThreads to 1. + * + * DESIGN: + * + * 1. LibMatrixDNN contains the user-facing methods for deep learning related operations. + * 2. The deep learning tasks are executed in parallel using java's ExecutorService. The key pattern + * followed by the above mentioned functions are as follows: + * execute(LibMatrixDNNHelper.get__Workers(params), params); + * 3. LibMatrixDNN's execute() method ensures the creation and shutdown of the ExecutorService. + * 4. 
LibMatrixDNNHelper.get__Workers creates appropriate workers based on the runtime characteristics of + * the input data (for example: input activations, filter, dout, ...). For code maintenance, these workers + * are placed in the respective LibMatrixDNN__Helper files. + * 5. The above mentioned workers may also use additional workers such as im2col and rotate180. + * We have created similar get__Workers methods to return the appropriate worker based on the + * runtime characteristics. + * 6. As opposed to earlier implementation, this design reduces branch misprediction as well + * as instruction cache misses. It also allows us to experiment with new operators for different + * data characteristics without affecting the performance of other operators. + * 7. This class assumes that the caller (for CP ConvolutionCPInstruction) deals with the empty block cases. + * */ public class LibMatrixDNN { protected static final Log LOG = LogFactory.getLog(LibMatrixDNN.class.getName()); //library configurations and external contracts - public static final boolean SUPPORTS_SPARSE_OUTPUTS = false; //operations able to handle sparse outputs - private static final boolean ALLOW_MULTI_THREADED_OPS = true; //enable multi-threading in cp - private static final int NUM_TASK_FACTOR = 2; //number of tasks is vcores scaled by this factor public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output - - private enum TaskType { - MaxPooling_Forward, MaxPooling_Backward, MaxPooling_Relu_Backward, - // Alternate approaches that we tried but the performance was unsatisfactory be included: direct, non-looped im2col - LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData, - ReluBackward - } // ------------------------------------------------------------------------------------------------ private static AtomicLong conv2dSparseCount = new AtomicLong(0); @@ -76,12 +82,12 @@ public class LibMatrixDNN { private static AtomicLong im2colDenseCount = new 
AtomicLong(0); private static AtomicLong maxPoolBwdSparseCount = new AtomicLong(0); private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0); - private static AtomicLong loopedConvMatMultTime = new AtomicLong(0); - private static AtomicLong loopedConvIm2ColTime = new AtomicLong(0); - private static AtomicLong loopedConvBwdFilterMatMultTime = new AtomicLong(0); - private static AtomicLong loopedConvBwdFilterIm2ColTime = new AtomicLong(0); - private static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0); - private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0); + static AtomicLong loopedConvMatMultTime = new AtomicLong(0); + static AtomicLong loopedConvIm2ColTime = new AtomicLong(0); + static AtomicLong loopedConvBwdFilterMatMultTime = new AtomicLong(0); + static AtomicLong loopedConvBwdFilterIm2ColTime = new AtomicLong(0); + static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0); + static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0); public static void appendStatistics(StringBuilder sb) { if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { @@ -128,8 +134,8 @@ public class LibMatrixDNN { } // Commonly used operators - private static BinaryOperator _binaryElementWiseAddition = null; - private static BinaryOperator _binaryElementWiseMultiplication = null; + static BinaryOperator _binaryElementWiseAddition = null; + static BinaryOperator _binaryElementWiseMultiplication = null; static { try { _binaryElementWiseAddition = InstructionUtils.parseBinaryOperator("+"); @@ -158,7 +164,7 @@ public class LibMatrixDNN { if(isEligibleForConv2dSparse(params)) Statistics.numNativeSparseConv2dCalls.increment(); - runConvTask(TaskType.LoopedIm2ColConv2d, params); + execute(LibMatrixDNNHelper.getConv2dWorkers(params), params); //post-processing: maintain nnz outputBlock.recomputeNonZeros(); @@ -179,7 +185,7 @@ public class LibMatrixDNN { if(isEligibleForConv2dBackwardDataDense(params)) 
Statistics.numNativeSparseConv2dBwdDataCalls.increment(); - runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params); + execute(LibMatrixDNNHelper.getConv2dBackwardDataWorkers(params), params); //post-processing: maintain nnz outputBlock.recomputeNonZeros(); @@ -200,7 +206,7 @@ public class LibMatrixDNN { if(isEligibleForConv2dBackwardFilterSparseDense(params)) Statistics.numNativeSparseConv2dBwdFilterCalls.increment(); - runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params); + execute(LibMatrixDNNHelper.getConv2dBackwardFilterWorkers(params), params); //post-processing: maintain nnz outputBlock.recomputeNonZeros(); @@ -239,10 +245,6 @@ public class LibMatrixDNN { conv2dBwdDataDenseCount.addAndGet(1); } } - - int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - if (!(ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1)) - params.numThreads = 1; } static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { @@ -270,89 +272,6 @@ public class LibMatrixDNN { } } - /** - * Performs the operation for(e : elem) ret += t(e) in a cache-conscious manner - * by sequentially aggregating for(e : elem) tmp += e and finally transposing - * ret = t(tmp). 
- * - * @param ret left and output matrix - * @param elem array of right untransposed matrices (expected in dense format) - * @param params convolution parameters - * @throws DMLRuntimeException in case of unsupported inputs or output - */ - private static void elementWiseInPlaceTransposedAddition(MatrixBlock ret, MatrixBlock[] elem) - throws DMLRuntimeException - { - //sanity checks non-empty and dense inputs / dense output - if( elem == null || elem.length==0 ) - throw new DMLRuntimeException("Empty input not supported."); - for( MatrixBlock e : elem ) - if( e.isInSparseFormat() ) - throw new DMLRuntimeException("Sparse input format not supported."); - if( ret.isInSparseFormat() ) - throw new DMLRuntimeException("Sparse output format not supported."); - - //Step 1: aggregate partial blocks without transpose - MatrixBlock tmpAgg = elem[0]; - double[] tmp = tmpAgg.denseBlock; - for( int k=1; k<elem.length; k++ ) { - double[] tmp2 = elem[k].denseBlock; - for( int i=0; i<tmp.length; i++ ) - tmp[i] += tmp2[i]; - } - - //Step 2: cache-conscious transpose to output - tmpAgg.setNonZeros(-1); //avoid early abort - LibMatrixReorg.transpose(tmpAgg, ret); - } - - private static void doLoopedIm2ColConv2dBwdData(int n, MatrixBlock dout_reshaped, ConvolutionParameters params) throws DMLRuntimeException { - MatrixBlock filter = params.input1; - MatrixBlock dout = params.input2; - doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true); - - MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false); - long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; - singleThreadedMatMult(dout_reshaped, filter, temp, true, false, params); - long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; - doCol2imOverSingleImage(n, temp, params); - long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ; - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { - loopedConvBwdDataMatMultTime.addAndGet(t2-t1); - loopedConvBwdDataCol2ImTime.addAndGet(t3-t2); - } - } - - private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n, - MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params, double [] tempIm2ColArr) throws DMLRuntimeException { - long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; - doIm2col(n, im2ColOutBlock, params, tempIm2ColArr); - long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; - - doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true); - - MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false); - long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; - singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, params); - long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ; - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { - loopedConvBwdFilterMatMultTime.addAndGet(t4-t3); - loopedConvBwdFilterIm2ColTime.addAndGet(t2-t1); - } - if(!temp.isEmptyBlock()) { - // partialRetBlock is size: [params.C*params.R*params.S, params.K] - ConvolutionUtils.binaryOperationInPlace(temp, partialRetBlock.getDenseBlock(), 0, params.K, 0, params.C*params.R*params.S, - _binaryElementWiseAddition); - } - return partialRetBlock; - } - - private static void computeTensorIndexes(int j, int [] ret, int H, int W) throws DMLRuntimeException { - ret[0] = j / (H*W); - ret[1] = (j - ret[0]*(H*W))/W; - ret[2] = j % W; - } - static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = input; params.input2 = filter; @@ -379,76 +298,6 @@ public class LibMatrixDNN { } } - // Single-threaded matrix multiplication - private static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, - boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException { - if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) { - if(recomputeNNZM1) - m1.recomputeNonZeros(); - if(recomputeNNZM2) - m2.recomputeNonZeros(); - LibMatrixMult.matrixMult(m1, m2, ret, false); - } - else { - ret.sparse = false; - if(ret.getDenseBlock() == null) - ret.allocateDenseBlock(); - NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, - ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1); - ret.recomputeNonZeros(); - } - } - - private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params, double [] temp) throws DMLRuntimeException { - long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; - doIm2col(n, im2ColOutBlock, params, temp); - long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0; - - MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false); - singleThreadedMatMult(params.input2, im2ColOutBlock, matMultOutBlock, false, true, params); - - long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; - - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { - loopedConvIm2ColTime.addAndGet(t2 - t1); - loopedConvMatMultTime.addAndGet(t3 - t2); - } - - // ----------------------------------------------------------------------------- - // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware. - // This is not required in Native implementation - int destPos = n*params.K*params.P*params.Q; - int length = params.K*params.P*params.Q; - if(!matMultOutBlock.isEmptyBlock()) { - if(matMultOutBlock.isInSparseFormat()) { - // Copy the sparse matrix matMultOutBlock of shape [K X PQ] to - // params.output.denseBlock + destPos - final int outOffset = n*params.K*params.P*params.Q; - final int PQ = params.P*params.Q; - for(int k = 0; k < matMultOutBlock.getNumRows(); k++) { - if( !matMultOutBlock.sparseBlock.isEmpty(k) ) { - int apos = matMultOutBlock.sparseBlock.pos(k); - int alen = matMultOutBlock.sparseBlock.size(k); - int[] aix = matMultOutBlock.sparseBlock.indexes(k); - double[] avals = matMultOutBlock.sparseBlock.values(k); - for(int j = apos; j < apos+alen; j++) { - int pqIndex = aix[j]; - params.output.denseBlock[outOffset + k*PQ + pqIndex ] = avals[j]; - } - } - } - } - else - System.arraycopy(matMultOutBlock.denseBlock, 0, params.output.denseBlock, destPos, length); - } - // ----------------------------------------------------------------------------- - - // Recomputing nnz is not required for each individual im2col as it is invoked by outer public methods (i.e. conv2d. 
- //post-processing: maintain nnz - // params.output.recomputeNonZeros(); - } - - /** * This method computes the backpropogation errors for previous layer of maxpooling operation * @@ -485,10 +334,8 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Sparse maxpooling_backward is not supported"); fillIndexesArray(params); - if(performReluBackward) - runConvTask(TaskType.MaxPooling_Relu_Backward, params); - else - runConvTask(TaskType.MaxPooling_Backward, params); + + execute(LibMatrixDNNHelper.getMaxPoolingBackwardWorkers(params, performReluBackward), params); //post-processing: maintain nnz outputBlock.recomputeNonZeros(); @@ -521,211 +368,6 @@ public class LibMatrixDNN { } } - private static void doPoolingBackward(int n, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - double [] inputArray = null; - if (!params.input1.isInSparseFormat()) - inputArray = params.input1.getDenseBlock(); - double [] doutArray = null; - if (!params.input2.isInSparseFormat()) - doutArray = params.input2.getDenseBlock(); - double [] outputArray = null; - if (!params.output.isInSparseFormat()) - outputArray = params.output.getDenseBlock(); - else - throw new DMLRuntimeException("Only dense output supported for pooling_backward"); - - if(inputArray != null) { - if(doutArray != null) - doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params, performReluBackward); - else - doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params, performReluBackward); - } - else { - if(doutArray != null) - doPoolingBackwardSparseDense(n, doutArray, outputArray, params, performReluBackward); - else - doPoolingBackwardSparseSparse(n, outputArray, params, performReluBackward); - } - } - - private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - if (!params.input1.isInSparseFormat()) - throw 
new DMLRuntimeException("Incorrect usage: Call optimized versions"); - - for (int c = 0; c < params.C; c++) { - for (int p = 0; p < params.P; p++) { - for (int q = 0; q < params.Q; q++) { - double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q + p * params.Q + q]; - if(inVal != 0) { - final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward); - if(maxIndex != -1) - outputArray[maxIndex] += inVal; - } - } - } - } - } - - private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - if (!params.input1.isInSparseFormat()) - throw new DMLRuntimeException("Incorrect usage: Call optimized versions"); - - if( !params.input2.sparseBlock.isEmpty(n) ) { - int [] tensorIndexes = new int[3]; - int apos = params.input2.sparseBlock.pos(n); - int alen = params.input2.sparseBlock.size(n); - int[] aix = params.input2.sparseBlock.indexes(n); - double[] avals = params.input2.sparseBlock.values(n); - for(int j = apos; j < apos+alen; j++) { - computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q); - int c = tensorIndexes[0]; - int p = tensorIndexes[1]; - int q = tensorIndexes[2]; - final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward); - if(maxIndex != -1) - outputArray[maxIndex] += avals[j]; - } - } - } - - private static void doPoolingBackwardDenseSparse(int n, double [] inputArray, - MatrixBlock dout, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - if( !dout.sparseBlock.isEmpty(n) ) { - int [] tensorIndexes = new int[3]; - int apos = dout.sparseBlock.pos(n); - int alen = dout.sparseBlock.size(n); - int[] aix = dout.sparseBlock.indexes(n); 
- double[] avals = dout.sparseBlock.values(n); - for(int j = apos; j < apos+alen; j++) { - computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q); - int c = tensorIndexes[0]; - int p = tensorIndexes[1]; - int q = tensorIndexes[2]; - final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward); - if(maxIndex != -1) - outputArray[maxIndex] += avals[j]; - } - } - } - - private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray, - double [] outputArray, ConvolutionParameters params, boolean performReluBackward) { - for (int c = 0; c < params.C; c++) { - final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - final int outputOffset = n*params.C*params.P*params.Q + c*params.P*params.Q; - - for (int p = 0; p < params.P; p++) { - for (int q = 0; q < params.Q; q++) { - int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward); - if(maxIndex != -1) - outputArray[maxIndex] += doutArray[outputOffset + p * params.Q + q]; - } - } - } - } - - /** - * Returns the index of cell with maximum value. 
This method is optimized for sparse input - * - * @param p output feature map height - * @param q output feature map width - * @param inputOffset offset to be used for input index - * @param n number of images - * @param c number of channels - * @param input input matrix - * @param params convolution parameters - * @param performReluBackward perform ReLU on input - * @return index of the cell with maximum value - * @throws DMLRuntimeException if error occurs - */ - private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - if(!input.isInSparseFormat()) - throw new DMLRuntimeException("Incorrect usage: Only sparse format supported"); - - int [] tensorIndexes = new int[3]; - - int start_index_h = params.start_indexes_h[p]; - int end_index_h = params.end_indexes_h[p]; - int start_index_w = params.start_indexes_w[q]; - int end_index_w = params.end_indexes_w[q]; - - int maxIndex = -1; - double maxVal = -Double.MAX_VALUE; - - // Note: We do not treat pad as zero and hence we don't do: - // maxVal = 0 - // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W - - // input.isEmptyBlock() check is done by the caller - if( !input.sparseBlock.isEmpty(n) ) { - // Find maxIndex - int apos = input.sparseBlock.pos(n); - int alen = input.sparseBlock.size(n); - int[] aix = input.sparseBlock.indexes(n); - double[] avals = input.sparseBlock.values(n); - for(int j=apos; j<apos+alen; j++) { - computeTensorIndexes(aix[j], tensorIndexes, params.H, params.W); - if(c != tensorIndexes[0]) - continue; - int h = tensorIndexes[1]; - int w = tensorIndexes[2]; - if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) { - double val = performReluBackward && avals[j] < 0 ? 
0 : avals[j]; - if(maxVal < val) { - maxIndex = inputOffset + h*params.W + w; - maxVal = val; - } - } - } - } - else { - maxIndex = inputOffset; - } - return maxIndex; - } - - /** - * Returns the index of cell with maximum value. This method is optimized for dense input - * - * @param p output feature map height - * @param q output feature map width - * @param inputOffset offset to be used for input index - * @param inputArray input array - * @param params convolution parameters - * @param performReluBackward perform ReLU backward - * @return index of cell with maximum value - */ - private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) { - int start_index_h = params.start_indexes_h[p]; - int end_index_h = params.end_indexes_h[p]; - int start_index_w = params.start_indexes_w[q]; - int end_index_w = params.end_indexes_w[q]; - - int maxIndex = -1; - double maxVal = -Double.MAX_VALUE; - - // Note: We do not treat pad as zero and hence we don't do: - // maxVal = 0 - // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W - - // Find maxIndex - double currDoutVal = -1; - for (int h = start_index_h; h < end_index_h; h++) { - for (int w = start_index_w; w < end_index_w; w++) { - currDoutVal = inputArray[inputOffset + h*params.W + w]; - currDoutVal = performReluBackward && currDoutVal < 0 ? 
0 : currDoutVal; - if(maxVal < currDoutVal) { - maxIndex = inputOffset + h*params.W + w; - maxVal = currDoutVal; - } - } - } - return maxIndex; - } - /** * This method computes the backpropagation errors for previous layer of relu operation * @@ -746,37 +388,12 @@ public class LibMatrixDNN { input.getNumRows() + " != " + dout.getNumRows() + " || " + input.getNumColumns() + " != " + dout.getNumColumns()); } - runConvTask(TaskType.ReluBackward, params); - - //note: no post-processing as nnz maintained per task - } - - private static long doReluBackward(ConvolutionParameters params, int rl, int ru) throws DMLRuntimeException { - // (X > 0) * dout - double [] outputArray = params.output.getDenseBlock(); - int numOutCols = params.input1.getNumColumns(); - - if(!params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) { - double [] inputArr = params.input1.getDenseBlock(); - double [] doutArr = params.input2.getDenseBlock(); - for(int i = rl*numOutCols; i < ru*numOutCols; i++) { - outputArray[i] = inputArr[i] > 0 ? 
doutArr[i] : 0; - } - } - else { - // Perform (X > 0) - ConvolutionUtils.scalarOperations(params.input1, outputArray, rl*numOutCols, numOutCols, rl, ru, - InstructionUtils.parseScalarBinaryOperator(">", false, 0)); - // Then perform (X > 0) * dout - ConvolutionUtils.binaryOperationInPlace(params.input2, outputArray, rl*numOutCols, numOutCols, rl, ru, - _binaryElementWiseMultiplication); - } + execute(LibMatrixDNNHelper.getReluBackwardWorkers(params), params); - //post-processing: maintain nnz - return params.output.recomputeNonZeros(rl, ru-1, 0, numOutCols-1); + // post-processing: maintain nnz + outputBlock.recomputeNonZeros(); } - /** * Performs the operation corresponding to the DML script: * ones = matrix(1, rows=1, cols=Hout*Wout) @@ -883,539 +500,55 @@ public class LibMatrixDNN { } fillIndexesArray(params); - runConvTask(TaskType.MaxPooling_Forward, params); - - //post-processing: maintain nnz - outputBlock.recomputeNonZeros(); - } - - private static void doPooling(int n, ConvolutionParameters params) throws DMLRuntimeException { - double [] inputArray = null; - if (!params.input1.isInSparseFormat()) - inputArray = params.input1.getDenseBlock(); - double [] outputArray = null; - if (!params.output.isInSparseFormat()) - outputArray = params.output.getDenseBlock(); - else - throw new DMLRuntimeException("Expected the output to be allocated in dense format"); - - final int inOffset = n*params.C*params.H*params.W; - int out_index = n*params.C*params.P*params.Q; - final int HW = params.H*params.W; - if(inputArray != null) { - for (int c = 0; c < params.C; c++) { - final int inOffset1 = inOffset + c*HW; - for (int p = 0; p < params.P; p++) { - for (int q = 0; q < params.Q; q++, out_index++) { - for (int h = params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) { - for (int w = params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) { - outputArray[out_index] = Math.max(outputArray[out_index], inputArray[inOffset1 + h*params.W + w]); - } - } - } - } - } 
- } - else { - // TODO: Optimize sparse maxpooling - // Low priority after adding fused relu_maxpooling operator as output of conv2d expected to be dense - for (int c = 0; c < params.C; c++) { - for (int p = 0; p < params.P; p++) { - for (int q = 0; q < params.Q; q++, out_index++) { - for (int h = params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) { - for (int w = params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) { - outputArray[out_index] = Math.max(outputArray[out_index], params.input1.quickGetValue(n, c*HW + h*params.W + w)); - } - } - } - } - } - } - } - - private static void doRotate180(int inputN, int outputN, MatrixBlock input, - double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) throws DMLRuntimeException { - double [] inputArray = null; - if (!input.isInSparseFormat()) - inputArray = input.getDenseBlock(); - if(outputArray == null) - throw new DMLRuntimeException("Sparse output is not supported for rotate180"); + execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params); - int outputOffset = outputN*params.K*params.P*params.Q; - if(inputArray != null) { - for (int k = 0; k < params.K; k++) { - for (int p = 0; p < params.P; p++) { - for (int q = 0; q < params.Q; q++) { - outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = inputArray[inputN*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q]; - } - } - } - } - else { - if(zeroOutSparseOutput) - Arrays.fill(outputArray, 0); - - if(!input.isEmptyBlock()) { - if( !input.sparseBlock.isEmpty(inputN) ) { - int [] tensorIndexes = new int[3]; - int apos = input.sparseBlock.pos(inputN); - int alen = input.sparseBlock.size(inputN); - int[] aix = input.sparseBlock.indexes(inputN); - double[] avals = input.sparseBlock.values(inputN); - for(int j = apos; j < apos+alen; j++) { - computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q); - int k = tensorIndexes[0]; - int p = tensorIndexes[1]; - int q = tensorIndexes[2]; - 
outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = avals[j]; - } - } - } - } + // post-processing: maintain nnz + outputBlock.recomputeNonZeros(); } - // ---------------------------------------------------------------------------------------------------------------- - private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params, - ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks, - ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) { - boolean isEligibleForConv2dSparse = (type == TaskType.LoopedIm2ColConv2d) && isEligibleForConv2dSparse(params); - boolean isEligibleForConv2dBackwardFilterSparseDense = (type == TaskType.LoopedIm2ColConv2dBwdFilter) && isEligibleForConv2dBackwardFilterSparseDense(params) ; - for(int i = 0; i < poolSize; i++) { - if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) { - if(!isEligibleForConv2dSparse && !isEligibleForConv2dBackwardFilterSparseDense) { - MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false); - im2ColOutBlock.allocateDenseBlock(); - im2ColOutBlocks.add(im2ColOutBlock); + /** + * Executes the tasks in parallel using java's ExecutorService. + * + * @param tasks deep learning related tasks + * @param params convolution parameters + * @throws DMLRuntimeException if the error occurs + */ + private static void execute(ArrayList<Callable<Long>> tasks, ConvolutionParameters params) throws DMLRuntimeException { + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + try { + if(k == 1) { + // Single-threaded execution when called in parfor + // this avoid unnecessary creation of threadpool. 
+ for(Callable<Long> task : tasks) { + task.call(); } } - - if(type == TaskType.LoopedIm2ColConv2dBwdFilter) { - MatrixBlock partialRetBlock = new MatrixBlock(params.C*params.R*params.S, params.K, false); - partialRetBlock.allocateDenseBlock(); - partialRetBlocks.add(partialRetBlock); - } - - if(type == TaskType.LoopedIm2ColConv2dBwdData || type == TaskType.LoopedIm2ColConv2dBwdFilter) { - MatrixBlock doutReshapedBlock = new MatrixBlock(params.P*params.Q, params.K, false); - doutReshapedBlock.allocateDenseBlock(); - doutReshapedBlocks.add(doutReshapedBlock); - } - } - } - // Methods to execute convolution-related tasks using multiple threads. - private static void runConvTask(TaskType type, ConvolutionParameters params) throws DMLRuntimeException { - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks = new ConcurrentLinkedQueue<MatrixBlock>(); - ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks = new ConcurrentLinkedQueue<MatrixBlock>(); - ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks = new ConcurrentLinkedQueue<MatrixBlock>(); - - if (ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && k > 1) { - int poolSize = Math.min(k, params.N); - addMatrixBlocks(poolSize, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks); - - ArrayList<ConvTask> tasks = new ArrayList<ConvTask>(); - int blklen = (int)(Math.ceil((double)params.N/poolSize/NUM_TASK_FACTOR)); - for( int i=0; i<poolSize*NUM_TASK_FACTOR && i*blklen<params.N; i++ ) - tasks.add(new ConvTask(i*blklen, Math.min((i+1)*blklen, params.N), - type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks)); - - try { - ExecutorService pool = Executors.newFixedThreadPool( poolSize ); + else { + ExecutorService pool = Executors.newFixedThreadPool( Math.min(k, params.N) ); List<Future<Long>> taskret = pool.invokeAll(tasks); pool.shutdown(); for( Future<Long> task : taskret ) - params.output.nonZeros += task.get(); - 
if(type == TaskType.LoopedIm2ColConv2dBwdFilter) { - elementWiseInPlaceTransposedAddition(params.output, partialRetBlocks.toArray(new MatrixBlock[0])); - } - } - catch (Exception e) { - throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e); - } - } - else { - addMatrixBlocks(1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks); - try { - //execute single task and maintain nnz if supported - params.output.setNonZeros(new ConvTask(0, params.N, type, params, im2ColOutBlocks, - doutReshapedBlocks, partialRetBlocks).call()); - - if(type == TaskType.LoopedIm2ColConv2dBwdFilter) { - elementWiseInPlaceTransposedAddition(params.output, partialRetBlocks.toArray(new MatrixBlock[0])); - } - } catch (Exception e) { - throw new DMLRuntimeException("Error while executing single-threaded " + type.name(), e); + task.get(); } + } + catch (Exception e) { + throw new DMLRuntimeException("Error while executing multi-threaded tasks", e); } } - // ---------------------------------------------------------------------------------------------------------------- - private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) { - // NativeHelper.conv2dBackwardFilterSparseDense only if filter is sparse. + static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) { + // NativeHelper.conv2dBackwardFilterSparseDense only if input is sparse. // dout converted to dense if sparse. 
return params.enableNative && params.input1.isInSparseFormat(); } - private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) { + static boolean isEligibleForConv2dSparse(ConvolutionParameters params) { // NativeHelper.conv2dSparse only if filter is dense and input is sparse return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat(); } - private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) { + static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) { // NativeHelper.conv2dBackwardDataDense only if filter is dense. // dout converted to dense if sparse. return params.enableNative && !params.input1.isInSparseFormat(); } - - /** - * The ConvTask allows the convolution operations (such s conv2d, conv2d_backward, maxpooling, etc) - * to be executed in multi-thread manner. - * - */ - private static class ConvTask implements Callable<Long> - { - public int _rl; - public int _ru; - private final ConvolutionParameters _params; - private final TaskType _type; - private final ConcurrentLinkedQueue<MatrixBlock> _im2ColOutBlocks; - private final ConcurrentLinkedQueue<MatrixBlock> _partialRetBlocks; - private final ConcurrentLinkedQueue<MatrixBlock> _doutReshapedBlocks; - - public ConvTask(int rl, int ru, TaskType type, ConvolutionParameters params, - ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, - ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks, - ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) { - _rl = rl; - _ru = ru; - _type = type; - _params = params; - _im2ColOutBlocks = im2ColOutBlocks; - _partialRetBlocks = partialRetBlocks; - _doutReshapedBlocks = doutReshapedBlocks; - } - - @Override - public Long call() throws DMLRuntimeException { - long lnnz = 0; //nnz per partition - - switch(_type) { - case MaxPooling_Forward: - for(int n = _rl; n < _ru; n++) - doPooling(n, _params); - break; - case MaxPooling_Backward: - for(int n = _rl; n < 
_ru; n++) - doPoolingBackward(n, _params, false); - break; - case MaxPooling_Relu_Backward: - for(int n = _rl; n < _ru; n++) - doPoolingBackward(n, _params, true); - break; - case ReluBackward: - lnnz = doReluBackward(_params, _rl, _ru); - break; - case LoopedIm2ColConv2d: - { - if(isEligibleForConv2dSparse(_params)) { - // NativeHelper.conv2dSparse only if filter is dense and input is sparse - int KPQ = _params.K*_params.P*_params.Q; - double[] temp = new double[KPQ]; - for(int n = _rl; n < _ru; n++) { - if( !_params.input1.getSparseBlock().isEmpty(n) ) { - int apos = _params.input1.getSparseBlock().pos(n); - int alen = _params.input1.getSparseBlock().size(n); - int[] aix = _params.input1.getSparseBlock().indexes(n); - double[] avals = _params.input1.getSparseBlock().values(n); - NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlock(), temp, - 1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, - _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); - System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ); - } - } - } - else { - // In all other cases, perform im2col in Java + matmult (either native or java). - MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove(); - double [] temp = (_params.input1.isInSparseFormat() || _params.input1.denseBlock == null) ? 
new double[_params.input1.getNumColumns()] : null; - for(int n = _rl; n < _ru; n++) - doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp); - _im2ColOutBlocks.add(im2ColOutBlock); - } - if(_params.bias != null) { - // bias is always converted to dense format - double [] biasArr = _params.bias.getDenseBlock(); - int PQ = _params.P*_params.Q; - int index = _rl*_params.K*PQ; - for(int n = _rl; n < _ru; n++) { - for(int k = 0; k < _params.K; k++) { - for(int pq = 0; pq < PQ; pq++, index++) { - _params.output.denseBlock[index] += biasArr[k]; - } - } - } - } - break; - } - case LoopedIm2ColConv2dBwdFilter: - { - MatrixBlock partialRetBlock = _partialRetBlocks.remove(); - MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove(); - if(isEligibleForConv2dBackwardFilterSparseDense(_params)) { - double [] dout_n = doutReshapedBlock.getDenseBlock(); - for(int n = _rl; n < _ru; n++) { - if( !_params.input1.getSparseBlock().isEmpty(n) ) { - doRotate180(n, 0, _params.input2, dout_n, _params, true); - int apos = _params.input1.getSparseBlock().pos(n); - int alen = _params.input1.getSparseBlock().size(n); - int[] aix = _params.input1.getSparseBlock().indexes(n); - double[] avals = _params.input1.getSparseBlock().values(n); - NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, - dout_n, partialRetBlock.getDenseBlock(), 1, _params.C, _params.H, _params.W, _params.K, - _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); - } - } - } - else { - MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove(); - double [] temp = _params.input1.isInSparseFormat() ? 
new double[_params.input1.getNumColumns()] : null; - for(int n = _rl; n < _ru; n++) - partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp); - _im2ColOutBlocks.add(im2ColOutBlock); - } - _doutReshapedBlocks.add(doutReshapedBlock); - _partialRetBlocks.add(partialRetBlock); - break; - } - case LoopedIm2ColConv2dBwdData: - { - MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove(); - if(isEligibleForConv2dBackwardDataDense(_params)) { - int CHW = _params.C*_params.H*_params.W; - double [] ret = new double[CHW]; - double [] filterArr = _params.input1.getDenseBlock(); - for(int n = _rl; n < _ru; n++) { - double [] dout_n = getRowInDenseFormat(_params.input2, n, doutReshapedBlock.getDenseBlock()); - if(n > _rl) - Arrays.fill(ret, 0); - NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, - _params.C, _params.H, _params.W, _params.K, - _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); - System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW); - } - } - else { - for(int n = _rl; n < _ru; n++) - doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params); - } - _doutReshapedBlocks.add(doutReshapedBlock); - break; - } - default: - throw new DMLRuntimeException("Unsupported ConvTask:" + _type.name()); - } - - return lnnz; - } - } - - // Converts input: PQ X CRS matrix and writes to 1 X CHW - private static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { - if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) { - throw new DMLRuntimeException("Incorrect input dimensions"); - } - - double [] outputArray = null; - if (!params.output.isInSparseFormat()) - outputArray = params.output.getDenseBlock(); - else { - throw new DMLRuntimeException("Only dense output is implemented"); - } - - if(!input.isInSparseFormat()) { - double [] 
inputArray = input.getDenseBlock(); - doCol2IMDenseInput(0, outputN, inputArray, outputArray, params); - } - else { - if(!input.isEmptyBlock()) { - int [] tensorIndexes = new int[3]; - for(int i = 0; i < input.getNumRows(); i++) { - if( !input.sparseBlock.isEmpty(i) ) { - computeTensorIndexes(i, tensorIndexes, params.P, params.Q); - int p = tensorIndexes[1]; - int q = tensorIndexes[2]; - if(tensorIndexes[0] != 0) - throw new DMLRuntimeException("Incorrect tensor indexes: " + tensorIndexes[0] + " != 0 <" + p + " " + q + " " + tensorIndexes[0] + params.P + " " + params.Q + ">"); - - int apos = input.sparseBlock.pos(i); - int alen = input.sparseBlock.size(i); - int[] aix = input.sparseBlock.indexes(i); - double[] avals = input.sparseBlock.values(i); - for(int j = apos; j < apos+alen; j++) { - computeTensorIndexes(aix[j], tensorIndexes, params.R, params.S); - int c = tensorIndexes[0]; - int r = tensorIndexes[1]; - int s = tensorIndexes[2]; - int h = p*params.stride_h + r - params.pad_h; - int w = q*params.stride_w + s - params.pad_w; - if(h >= 0 && h < params.H && w >= 0 && w < params.W) { - int outIndex = outputN*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w; - outputArray[outIndex] += avals[j]; - } - } - } - } - } - } - } - - // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0 - // Or converts input: NPQ X CRS matrix and writes to N X CHW - private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { - final int outputNOffset = outputN*params.C*params.H*params.W; - for (int p = 0; p < params.P; p++) { - // h = p*params.stride_h + r - params.pad_h - // = r + hOffset - // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get - // max(0, - hOffset) <= r < min(params.R, params.H - hOffset) - final int hOffset = p*params.stride_h - params.pad_h; - final int rStart = Math.max(0, - hOffset); - final 
int rEnd = Math.min(params.R, params.H - hOffset); - for (int q = 0; q < params.Q; q++) { - // Using the same logic as above on following: - // w = q*params.stride_w + s - params.pad_w - final int wOffset = q*params.stride_w - params.pad_w; - final int sStart = Math.max(0, - wOffset); - final int sEnd = Math.min(params.S, params.W - wOffset); - final int tempOffset = (inputN*params.P*params.Q + p*params.Q + q)*params.C*params.R*params.S; - for (int c = 0; c < params.C; c++) { - final int outOffset = outputNOffset + c*params.H*params.W; - final int inputOffset = tempOffset + c*params.R*params.S; - for (int r = rStart; r < rEnd; r++) { - for (int s = sStart; s < sEnd; s++) { - int inputIndex = inputOffset + r*params.S + s; - int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s; - outputArray[outIndex] += inputArray[inputIndex]; - } - } - } - } - } - } - - private static void doIm2colDense(int n, double [] inputArray, double [] outputArray, ConvolutionParameters params) { - int CRS = params.C * params.R * params.S; - final int nOffset = n * params.C*params.H*params.W; - if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) { - for (int c = 0; c < CRS; ++c) { - int wOffset = c % params.S; - int hOffset = (c / params.S) % params.R; - int cInput = c / params.R / params.S; - for (int h = 0; h < params.P; ++h) { - int hPadded = h + hOffset; - int outOffset = (c * params.P + h) * params.Q; - int inputOffset = nOffset + (cInput * params.H + hPadded) * params.W; - System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, params.Q); - int w = params.Q - 1; - int wPadded = w + wOffset; - if (hPadded < params.H && wPadded < params.W) - outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; - else - outputArray[outOffset + w] = 0; - } - } - } else { - for (int c = 0; c < CRS; ++c) { - int wOffset = c % params.S; - int hOffset = (c / params.S) % params.R; - int cInput = c / params.R / params.S; - for (int 
h = 0; h < params.P; ++h) { - int outOffset = (c * params.P + h) * params.Q; - int hPadded = h * params.stride_h - params.pad_h + hOffset; - int inputOffset = nOffset + (cInput * params.H + hPadded) * params.W; - if (hPadded < 0 || hPadded >= params.H) { - Arrays.fill(outputArray, outOffset, outOffset+params.Q, 0); - } else { - for (int w = 0; w < params.Q; ++w) { - int wPadded = w * params.stride_w - params.pad_w + wOffset; - if (wPadded >= 0 && wPadded < params.W) - outputArray[outOffset + w] = inputArray[inputOffset + wPadded]; - else - outputArray[outOffset + w] = 0; - } - } - } - } - } - } - - // Returns the row of matrix in dense format - private static double [] getRowInDenseFormat(MatrixBlock input, int n, double [] temp) throws DMLRuntimeException { - if(input.getNumColumns() != temp.length) { - throw new DMLRuntimeException("Invalid parameters"); - } - // Use temporary array to avoid binary search - if(input.isInSparseFormat()) { - Arrays.fill(temp, 0); - if( !input.sparseBlock.isEmpty(n) ) { - int apos = input.sparseBlock.pos(n); - int alen = input.sparseBlock.size(n); - int[] aix = input.sparseBlock.indexes(n); - double[] avals = input.sparseBlock.values(n); - for(int j=apos; j<apos+alen; j++) - temp[ aix[j] ] = avals[j]; - } - } - else { - System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns()); - } - return temp; - } - - // Keeping this as a separate sparse method to allow for further dense optimizations - private static void doIm2colSparse(int n, MatrixBlock input, double [] outputArray, ConvolutionParameters params, double [] temp) throws DMLRuntimeException { - int CRS = params.C * params.R * params.S; - - // Using a temporary array improves performance by not requiring binary search for getValue - // Since the access pattern depends on ConvolutionParameters, this serves as a temporary fix. 
- temp = getRowInDenseFormat(input, n, temp); - // final int nOffset = n * params.C*params.H*params.W; - for (int c = 0; c < CRS; ++c) { - int wOffset = c % params.S; - int hOffset = (c / params.S) % params.R; - int cInput = c / params.R / params.S; - for (int h = 0; h < params.P; ++h) { - int outOffset = (c * params.P + h) * params.Q; - int hPadded = h * params.stride_h - params.pad_h + hOffset; - int tempOffset = (cInput * params.H + hPadded) * params.W; - // int inputOffset = nOffset + tempOffset; - if (hPadded < 0 || hPadded >= params.H) { - Arrays.fill(outputArray, outOffset, outOffset+params.Q, 0); - } else { - for (int w = 0; w < params.Q; ++w) { - int wPadded = w * params.stride_w - params.pad_w + wOffset; - if (wPadded >= 0 && wPadded < params.W) - outputArray[outOffset + w] = temp[tempOffset + wPadded]; - else - outputArray[outOffset + w] = 0; - } - } - } - } - } - - private static void doIm2col(int n, MatrixBlock output, ConvolutionParameters params, double [] temp) throws DMLRuntimeException { - double [] inputArray = null; - if (!params.input1.isInSparseFormat()) - inputArray = params.input1.getDenseBlock(); - double [] outputArray = null; - if(!output.isInSparseFormat()) - outputArray = output.getDenseBlock(); - else - throw new DMLRuntimeException("Sparse output is not supported for im2col"); - - if(inputArray != null) - doIm2colDense(n, inputArray, outputArray, params); - else - doIm2colSparse(n, params.input1, outputArray, params, temp); - } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java new file mode 100644 index 0000000..609af11 --- /dev/null +++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.Arrays; +import java.util.concurrent.Callable; + +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.utils.NativeHelper; + +/** + * This class contains the set of operators used for performing conv2d backward data + */ +public class LibMatrixDNNConv2dBackwardDataHelper { + + /** + * This operator is used only if native is enabled and filter is dense. + * dout is converted into dense if sparse. 
+ */ + public static class SparseNativeConv2dBackwardDataDense implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + public SparseNativeConv2dBackwardDataDense(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int CHW = _params.C*_params.H*_params.W; + double [] ret = new double[CHW]; + double [] filterArr = _params.input1.getDenseBlock(); + double [] dout_n = new double[_params.P*_params.Q*_params.K]; + for(int n = _rl; n < _ru; n++) { + LibMatrixDNNHelper.getRowInDenseFormat(_params.input2, n, dout_n); + if(n > _rl) + Arrays.fill(ret, 0); + NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, + _params.C, _params.H, _params.W, _params.K, + _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW); + } + return 0L; + } + } + + /** + * General conv2d backward data operator + */ + public static class Conv2dBackwardData implements Callable<Long> { + + public int _rl; public int _ru; + private final ConvolutionParameters _params; + public Conv2dBackwardData(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S; + MatrixBlock filter = _params.input1; + MatrixBlock dout = _params.input2; + MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false); + dout_reshaped.allocateDenseBlock(); + LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = + LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true); + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // rotate180(dout[n,]) => dout_reshaped + rotate180Worker.execute(n, 0); 
+ + // dout_reshaped %*% filter => temp + MatrixBlock temp = new MatrixBlock(PQ, CRS, false); + long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + LibMatrixDNNHelper.singleThreadedMatMult(dout_reshaped, filter, temp, true, false, _params); + long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + // col2im(temp) => output[n,] + LibMatrixDNNHelper.doCol2imOverSingleImage(n, temp, _params); + long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + } + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1); + LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2); + } + return 0L; + } + + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java new file mode 100644 index 0000000..560f32c --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.concurrent.Callable; + +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.runtime.util.ConvolutionUtils; +import org.apache.sysml.utils.NativeHelper; + +public class LibMatrixDNNConv2dBackwardFilterHelper { + + /** + * This operator is used only if native is enabled and input is sparse. + * dout is converted into dense if sparse. + */ + public static class SparseNativeConv2dBackwardFilterDense implements Callable<Long> + { + + public int _rl; public int _ru; + private final ConvolutionParameters _params; + public SparseNativeConv2dBackwardFilterDense(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int CRS = _params.C*_params.R*_params.S; + double [] dout_n = new double[_params.P*_params.Q*_params.K]; + LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = + LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true); + // partialRetBlock is size: [params.C*params.R*params.S, params.K] + double [] partialRetBlock = new double[CRS*_params.K]; + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.getSparseBlock().isEmpty(n) ) { + // rotate180(dout[n,]) => dout_n + rotate180Worker.execute(n, 0); + + int apos = _params.input1.getSparseBlock().pos(n); + int alen = _params.input1.getSparseBlock().size(n); + int[] aix = _params.input1.getSparseBlock().indexes(n); + double[] avals = _params.input1.getSparseBlock().values(n); + 
NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, + dout_n, partialRetBlock, 1, _params.C, _params.H, _params.W, _params.K, + _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + } + } + inplaceTransposedAddition(partialRetBlock, _params); + return 0L; + } + } + + /** + * General conv2d backward filter operator + */ + public static class Conv2dBackwardFilter implements Callable<Long> { + + public int _rl; public int _ru; + private final ConvolutionParameters _params; + public Conv2dBackwardFilter(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S; + MatrixBlock dout = _params.input2; + MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false); + im2ColOutBlock.allocateDenseBlock(); + MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false); + dout_reshaped.allocateDenseBlock(); + LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true); + LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = + LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true); + double [] partialRetBlock = new double[CRS*_params.K]; + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // rotate180(dout[n,]) => dout_reshaped + rotate180Worker.execute(n, 0); + + // im2col(input) => _im2ColOutBlock + long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + im2ColWorker.execute(n); + long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + + MatrixBlock temp = new MatrixBlock(CRS, K, false); + LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, _params); + long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + + if(!temp.isEmptyBlock()) { + // partialRetBlock is size: [params.C*params.R*params.S, params.K] + ConvolutionUtils.binaryOperationInPlace(temp, partialRetBlock, 0, K, 0, CRS, + LibMatrixDNN._binaryElementWiseAddition); + } + + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + } + inplaceTransposedAddition(partialRetBlock, _params); + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); + LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); + } + return 0L; + } + } + private static synchronized void inplaceTransposedAddition(double [] partialRetBlock, ConvolutionParameters params) { + // Perform transposed addition: output of size [K, CRS] += partialRetBlock of size [CRS,K] + int iter = 0; int CRS = params.C*params.R*params.S; int K = params.K; + double [] outputArr = params.output.denseBlock; + for(int i = 0; i < CRS; i++) { + for(int j = 0; j < K; j++, iter++) { + int index = j*CRS+i; + outputArr[index] += partialRetBlock[iter]; + } + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java new file mode 100644 index 0000000..b2c4d67 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.utils.NativeHelper; + +/** + * This class contains the set of operators used for performing conv2d + */ +public class LibMatrixDNNConv2dHelper { + + /** + * Performs convolution one channel at a time: for each image n and channel c, + * add(filters.get(c) %*% im2col(input[n, c])) accumulates into output[n,]. + * This operator has less memory pressure than LoopedIm2ColConv2dAllChannels + * (its im2col buffer is [RS, PQ] rather than [CRS, PQ]). 
+ */ + public static class LoopedIm2ColConv2dOneChannel implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; ArrayList<MatrixBlock> _filters; + public LoopedIm2ColConv2dOneChannel(int rl, int ru, ConvolutionParameters params, ArrayList<MatrixBlock> filters) { + _rl = rl; _ru = ru; + _params = params; + _filters = filters; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q; int K = _params.K; + int RS = _params.R*_params.S; + MatrixBlock im2ColOutBlock = new MatrixBlock(RS, PQ, false); + im2ColOutBlock.allocateDenseBlock(); + LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, false); + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + for(int c = 0; c < _params.C; c++) { + // im2col(input) => _im2ColOutBlock + long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + im2ColWorker.execute(n, c); + long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + + // filter %*% _im2ColOutBlock => matMultOutBlock + MatrixBlock matMultOutBlock = new MatrixBlock(K, PQ, false); + LibMatrixDNNHelper.singleThreadedMatMult(_filters.get(c), im2ColOutBlock, matMultOutBlock, false, true, _params); + long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + + // Add the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos + add(matMultOutBlock, _params.output.getDenseBlock(), n*K*PQ, K, PQ); + } + } + if(_params.bias != null) { + // bias is always converted to dense format + LibMatrixDNNHelper.addBias(_rl, _ru, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); + } + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); + LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); + } + return 0L; + } + + // Add (element-wise +=) the matrix src of shape [K X PQ] to params.output.denseBlock + destPos + private void add(MatrixBlock src, double [] dest, int destPos, int K, int PQ) { + // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware. + // This is not required in Native implementation + if(!src.isEmptyBlock()) { + if(src.isInSparseFormat()) { + // Add the non-zeros of the sparse matrix matMultOutBlock of shape [K X PQ] to + // params.output.denseBlock + destPos + for(int k = 0; k < src.getNumRows(); k++) { + if( !src.sparseBlock.isEmpty(k) ) { + int apos = src.sparseBlock.pos(k); + int alen = src.sparseBlock.size(k); + int[] aix = src.sparseBlock.indexes(k); + double[] avals = src.sparseBlock.values(k); + for(int j = apos; j < apos+alen; j++) { + int pqIndex = aix[j]; + dest[destPos + k*PQ + pqIndex ] += avals[j]; + } + } + } + } + else { + for(int i = 0; i < K * PQ; i++) { + dest[destPos+i] += src.denseBlock[i]; + } + } + } + } + } + + /** + * Performs convolution via: partialCopy1(filter %*% im2col(input)) = output + */ + public static class LoopedIm2ColConv2dAllChannels implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + public LoopedIm2ColConv2dAllChannels(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = 
ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S; + MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false); + im2ColOutBlock.allocateDenseBlock(); + LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true); + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // im2col(input) => _im2ColOutBlock + long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + im2ColWorker.execute(n); + long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0; + + // filter %*% _im2ColOutBlock => matMultOutBlock + MatrixBlock matMultOutBlock = new MatrixBlock(K, PQ, false); + LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, im2ColOutBlock, matMultOutBlock, false, true, _params); + long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + + // Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos + partialCopy1(matMultOutBlock, _params.output.getDenseBlock(), n*K*PQ, K, PQ); + } + if(_params.bias != null) { + // bias is always converted to dense format + LibMatrixDNNHelper.addBias(_rl, _ru, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); + } + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { + LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); + LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); + } + return 0L; + } + + // Copy the matrix src of shape [K X PQ] to params.output.denseBlock + destPos + private void partialCopy1(MatrixBlock src, double [] dest, int destPos, int K, int PQ) { + // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware. + // This is not required in Native implementation + if(!src.isEmptyBlock()) { + if(src.isInSparseFormat()) { + // Copy the sparse matrix matMultOutBlock of shape [K X PQ] to + // params.output.denseBlock + destPos + // NOTE(review): only non-zero entries are assigned and an empty src writes nothing, + // so this presumably relies on the destination range already being zero -- confirm. + for(int k = 0; k < src.getNumRows(); k++) { + if( !src.sparseBlock.isEmpty(k) ) { + int apos = src.sparseBlock.pos(k); + int alen = src.sparseBlock.size(k); + int[] aix = src.sparseBlock.indexes(k); + double[] avals = src.sparseBlock.values(k); + for(int j = apos; j < apos+alen; j++) { + int pqIndex = aix[j]; + dest[destPos + k*PQ + pqIndex ] = avals[j]; + } + } + } + } + else + System.arraycopy(src.denseBlock, 0, dest, destPos, K * PQ); + } + } + } + + + /** + * This operator is used only if native is enabled, filter is dense and input is sparse. + * Rows n in [_rl, _ru) whose sparse input row is empty are skipped, leaving that output range untouched. + */ + public static class SparseNativeConv2d implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + public SparseNativeConv2d(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = 
params; + } + + @Override + public Long call() throws Exception { + int KPQ = _params.K*_params.P*_params.Q; + double[] temp = new double[KPQ]; + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.getSparseBlock().isEmpty(n) ) { + int apos = _params.input1.getSparseBlock().pos(n); + int alen = _params.input1.getSparseBlock().size(n); + int[] aix = _params.input1.getSparseBlock().indexes(n); + double[] avals = _params.input1.getSparseBlock().values(n); + NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlock(), temp, + 1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, + _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ); + } + } + return 0L; + } + } +}