[1/2] incubator-systemml git commit: [SYSTEMML-540] Refactored LibMatrixDNN to reduce instruction cache misses

niketanpansare Mon, 29 May 2017 16:22:37 -0700

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 28c92b93f -> 19eed8f38



http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
new file mode 100644
index 0000000..40a39f0
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -0,0 +1,541 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.concurrent.Callable;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
+
+
+public class LibMatrixDNNHelper {
+       
+       // *********************************** low-level runtime operator 
selection ***********************************************
+       // *********************************** based on runtime properties 
(sparsity, native, etc) ********************************
+       // These methods help reduce branch mispredictions and instruction-cache misses.
+       // Also, they simplify the design of LibMatrixDNN and help in 
code-maintenance.
+       
+       /**
+        * Creates the callable tasks for the maxpooling operation. The N input rows are
+        * split into consecutive ranges of ceil(N / k) rows, where k is the value returned
+        * by OptimizerUtils.getConstrainedNumThreads, and one task is created per range.
+        * A sparse worker is used when the first input is in sparse format, a dense worker otherwise.
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing maxpooling operation
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> getMaxPoolingWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> tasks = new ArrayList<Callable<Long>>();
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               final int rowsPerTask = (int) Math.ceil((double) params.N / numThreads);
+               for(int t = 0; t*rowsPerTask < params.N; t++) {
+                       final Callable<Long> worker;
+                       if(params.input1.isInSparseFormat())
+                               worker = new LibMatrixDNNPoolingHelper.SparseMaxPooling(t*rowsPerTask, Math.min((t+1)*rowsPerTask, params.N), params);
+                       else
+                               worker = new LibMatrixDNNPoolingHelper.DenseMaxPooling(t*rowsPerTask, Math.min((t+1)*rowsPerTask, params.N), params);
+                       tasks.add(worker);
+               }
+               return tasks;
+       }
+       
+       /**
+        * Creates the callable tasks for the maxpooling backward operation. One task is
+        * created per range of ceil(N / k) rows; the worker implementation is chosen from the
+        * sparse/dense format of the first and the second input.
+        * 
+        * @param params convolution parameters
+        * @param performReluBackward flag that is handed unchanged to every created worker
+        * @return list of callable tasks for performing maxpooling backward operation
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> getMaxPoolingBackwardWorkers(ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> tasks = new ArrayList<Callable<Long>>();
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               final int rowsPerTask = (int) Math.ceil((double) params.N / numThreads);
+               for(int t = 0; t*rowsPerTask < params.N; t++) {
+                       final boolean sparse1 = params.input1.isInSparseFormat();
+                       final boolean sparse2 = params.input2.isInSparseFormat();
+                       final int rl = t*rowsPerTask;
+                       final int ru = Math.min((t+1)*rowsPerTask, params.N);
+                       // Worker names follow the pattern PoolingBackward<format of input1><format of input2>
+                       if(sparse1 && sparse2)
+                               tasks.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardSparseSparse(rl, ru, params, performReluBackward));
+                       else if(sparse1)
+                               tasks.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardSparseDense(rl, ru, params, performReluBackward));
+                       else if(sparse2)
+                               tasks.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardDenseSparse(rl, ru, params, performReluBackward));
+                       else
+                               tasks.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardDenseDense(rl, ru, params, performReluBackward));
+               }
+               return tasks;
+       }
+       
+       /**
+        * Creates the callable tasks for the relu backward operation, one ReluBackward
+        * task per range of ceil(N / k) rows.
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing relu backward operation
+        * @throws DMLRuntimeException if error occurs
+        */
+       public static ArrayList<Callable<Long>> getReluBackwardWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> tasks = new ArrayList<Callable<Long>>();
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               final int rowsPerTask = (int) Math.ceil((double) params.N / numThreads);
+               for(int t = 0; t*rowsPerTask < params.N; t++) {
+                       final int rl = t*rowsPerTask;
+                       final int ru = Math.min((t+1)*rowsPerTask, params.N);
+                       tasks.add(new ReluBackward(rl, ru, params));
+               }
+               return tasks;
+       }
+       
+       /**
+        * Creates the callable tasks for conv2d, one task per range of ceil(N / k) rows.
+        * The sparse native worker is used whenever LibMatrixDNN.isEligibleForConv2dSparse
+        * accepts the parameters; otherwise a looped im2col worker is used, unless the first
+        * input is dense with an unallocated dense block, which is rejected.
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing conv2d
+        * @throws DMLRuntimeException if no worker supports the given input
+        */
+       public static ArrayList<Callable<Long>> getConv2dWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> tasks = new ArrayList<Callable<Long>>();
+               
+               // Aim for one task per thread: more tasks would shorten the tail, but each task
+               // maintains its own intermediate data structures such as im2col blocks.
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               final int rowsPerTask = (int) Math.ceil((double) params.N / numThreads);
+               
+               // TODO: Decide here based on params whether to use LoopedIm2ColConv2dAllChannels or LoopedIm2ColConv2dOneChannel
+               // For now, the existing approach of converting [1, CHW] to [CRS, PQ] is kept, as it
+               // yields matrices large enough for matrix multiplication.
+               boolean allChannels = true;
+               ArrayList<MatrixBlock> filters = null;
+               if(!allChannels) {
+                       filters = splitFilter(params);
+               }
+               
+               final boolean isEmptyDenseInput = !params.input1.isInSparseFormat() && params.input1.denseBlock == null;
+               
+               for(int t = 0; t*rowsPerTask < params.N; t++) {
+                       final int rl = t*rowsPerTask;
+                       final int ru = Math.min((t+1)*rowsPerTask, params.N);
+                       if(LibMatrixDNN.isEligibleForConv2dSparse(params)) {
+                               tasks.add(new LibMatrixDNNConv2dHelper.SparseNativeConv2d(rl, ru, params));
+                               continue;
+                       }
+                       if(isEmptyDenseInput)
+                               throw new DMLRuntimeException("Unsupported operator");
+                       if(allChannels)
+                               tasks.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dAllChannels(rl, ru, params));
+                       else
+                               tasks.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dOneChannel(rl, ru, params, filters));
+               }
+               return tasks;
+       }
+       
+       /**
+        * Creates the callable tasks for conv2d backward filter, one task per range of
+        * ceil(N / k) rows. The sparse native worker is used whenever
+        * LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense accepts the parameters;
+        * otherwise the generic worker is used, unless one of the two inputs is dense with an
+        * unallocated dense block, which is rejected.
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing conv2d backward filter
+        * @throws DMLRuntimeException if no worker supports the given inputs
+        */
+       public static ArrayList<Callable<Long>> getConv2dBackwardFilterWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> tasks = new ArrayList<Callable<Long>>();
+               // Aim for one task per thread: more tasks would shorten the tail, but each task
+               // maintains its own intermediate data structures such as im2col blocks.
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               final int rowsPerTask = (int) Math.ceil((double) params.N / numThreads);
+               
+               final boolean emptyDenseInput1 = !params.input1.isInSparseFormat() && params.input1.denseBlock == null;
+               final boolean isEmptyDenseInput = emptyDenseInput1 ||
+                               (!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+               
+               for(int t = 0; t*rowsPerTask < params.N; t++) {
+                       final int rl = t*rowsPerTask;
+                       final int ru = Math.min((t+1)*rowsPerTask, params.N);
+                       if(LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params))
+                               tasks.add(new LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense(rl, ru, params));
+                       else if(isEmptyDenseInput)
+                               throw new DMLRuntimeException("Unsupported operator");
+                       else
+                               tasks.add(new LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter(rl, ru, params));
+               }
+               return tasks;
+       }
+       
+       /**
+        * Creates the callable tasks for conv2d backward data, one task per range of
+        * ceil(N / k) rows. The sparse native worker is used whenever
+        * LibMatrixDNN.isEligibleForConv2dBackwardDataDense accepts the parameters;
+        * otherwise the generic worker is used, unless one of the two inputs is dense with an
+        * unallocated dense block, which is rejected.
+        * 
+        * @param params convolution parameters
+        * @return list of callable tasks for performing conv2d backward data
+        * @throws DMLRuntimeException if no worker supports the given inputs
+        */
+       public static ArrayList<Callable<Long>> getConv2dBackwardDataWorkers(ConvolutionParameters params) throws DMLRuntimeException {
+               ArrayList<Callable<Long>> tasks = new ArrayList<Callable<Long>>();
+               
+               // Aim for one task per thread: more tasks would shorten the tail, but each task
+               // maintains its own intermediate data structures such as im2col blocks.
+               final int numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               final int rowsPerTask = (int) Math.ceil((double) params.N / numThreads);
+               
+               final boolean emptyDenseInput1 = !params.input1.isInSparseFormat() && params.input1.denseBlock == null;
+               final boolean isEmptyDenseInput = emptyDenseInput1 ||
+                               (!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+               
+               for(int t = 0; t*rowsPerTask < params.N; t++) {
+                       final int rl = t*rowsPerTask;
+                       final int ru = Math.min((t+1)*rowsPerTask, params.N);
+                       if(LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params))
+                               tasks.add(new LibMatrixDNNConv2dBackwardDataHelper.SparseNativeConv2dBackwardDataDense(rl, ru, params));
+                       else if(isEmptyDenseInput)
+                               throw new DMLRuntimeException("Unsupported operator");
+                       else
+                               tasks.add(new LibMatrixDNNConv2dBackwardDataHelper.Conv2dBackwardData(rl, ru, params));
+               }
+               
+               return tasks;
+       }
+       
+       // *********************************** relu backward operator 
******************************************************
+       
+       /**
+        * Task that computes relu backward, i.e. (X gt 0) * dout, for the rows [_rl, _ru),
+        * where X is params.input1 and dout is params.input2. The result is written into the
+        * dense block of params.output.
+        */
+       public static class ReluBackward implements Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               double [] outputArray; int numOutCols;
+               public ReluBackward(int rl, int ru, ConvolutionParameters params) {
+                       _rl = rl;
+                       _ru = ru;
+                       _params = params;
+                       outputArray = params.output.getDenseBlock();
+                       numOutCols = params.input1.getNumColumns();
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       final boolean bothDense = !_params.input1.isInSparseFormat() && !_params.input2.isInSparseFormat();
+                       if(!bothDense) {
+                               // First materialize (X > 0) into the output rows ...
+                               ConvolutionUtils.scalarOperations(_params.input1, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, 
+                                               InstructionUtils.parseScalarBinaryOperator(">", false, 0));
+                               // ... then multiply the output in place with dout
+                               ConvolutionUtils.binaryOperationInPlace(_params.input2, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, 
+                                               LibMatrixDNN._binaryElementWiseMultiplication);
+                               return 0L;
+                       }
+                       
+                       // Dense-dense case: a single pass over the cells of the assigned rows
+                       double [] xArr = _params.input1.getDenseBlock();
+                       double [] doutArr = _params.input2.getDenseBlock();
+                       final int end = _ru*numOutCols;
+                       for(int pos = _rl*numOutCols; pos < end; pos++) {
+                               outputArray[pos] = xArr[pos] > 0 ? doutArr[pos] : 0;
+                       }
+                       return 0L;
+               }
+       }
+       
+       // *********************************** utility methods 
******************************************************
+       
+       /**
+        * Computes tensor indexes from column index such that column index  is 
equal to ret[0]*HW + ret[1]*W + ret[2]
+        * 
+        * @param j column index
+        * @param ret tensor indexes
+        * @param H second last dimension
+        * @param W last dimension
+        */
+       static void computeTensorIndexes(int j, int [] ret, int H, int W) {
+               ret[0] = j / (H*W);
+               ret[1] = (j - ret[0]*(H*W))/W;
+               ret[2] = j % W;
+       }
+       
+       // Splits a filter of size [K, CRS] (read from _params.input2) into C filters of size [K, RS].
+       // Returns one dense MatrixBlock per channel c in [0, C), each with its non-zero count set.
+       // NOTE(review): when input2 has no dense block the sparse block is dereferenced without a
+       // null check; presumably the caller guarantees a non-empty filter - confirm.
+       private static ArrayList<MatrixBlock> splitFilter(ConvolutionParameters 
_params) {
+               ArrayList<MatrixBlock> ret = new ArrayList<MatrixBlock>();
+               int RS = _params.R*_params.S; int CRS = 
_params.C*_params.R*_params.S;
+               double [] filter = _params.input2.getDenseBlock(); int S = 
_params.S;
+               for(int c = 0; c < _params.C; c++) {
+                       MatrixBlock mb = new MatrixBlock(_params.K, RS, false);
+                       mb.allocateDenseBlock(); long nnz = 0;
+                       double [] outputArr = mb.getDenseBlock();
+                       if(filter != null) {
+                               // Dense filter: copy the RS cells of channel c out of every row k
+                               for(int k = 0; k < _params.K; k++) {
+                                       for(int rs = 0; rs < RS; rs++) {
+                                               outputArr[k*RS + rs] = 
filter[k*CRS + c*RS + rs];
+                                               nnz += outputArr[k*RS + rs] != 
0 ? 1 : 0;
+                                       }
+                               }
+                       }
+                       else {
+                               // No dense block: read the stored entries of every row k of the sparse block
+                               // and keep only those whose channel index equals c
+                               for(int k = 0; k < _params.K; k++) {
+                                       if( 
!_params.input2.sparseBlock.isEmpty(k) ) {
+                                               int [] tensorIndexes = new 
int[3];
+                                               // Find maxIndex
+                                               int apos = 
_params.input2.sparseBlock.pos(k);
+                                               int alen = 
_params.input2.sparseBlock.size(k);
+                                               int[] aix = 
_params.input2.sparseBlock.indexes(k);
+                                               double[] avals = 
_params.input2.sparseBlock.values(k);
+                                               for(int j=apos; j<apos+alen; 
j++) {
+                                                       // Decompose the column index into (channel, r, s)
+                                                       
computeTensorIndexes(aix[j], tensorIndexes, _params.R, _params.S);
+                                                       if(c != 
tensorIndexes[0])
+                                                               continue;
+                                                       int r = 
tensorIndexes[1];
+                                                       int s = 
tensorIndexes[2];
+                                                       outputArr[k*RS + r*S + 
s] = avals[j];
+                                                       nnz += outputArr[k*RS + 
r*S + s] != 0 ? 1 : 0;
+                                               }
+                                       }
+                               }
+                       }
+                       mb.setNonZeros(nnz);
+                       ret.add(mb);
+               }
+               return ret;
+       }
+       
+       // Single-threaded matrix multiplication
+       static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, 
+                       boolean recomputeNNZM1, boolean recomputeNNZM2, 
ConvolutionParameters params) throws DMLRuntimeException {
+               if(!params.enableNative || m1.isInSparseFormat() || 
m2.isInSparseFormat()) {
+                       if(recomputeNNZM1)
+                               m1.recomputeNonZeros();
+                       if(recomputeNNZM2)
+                               m2.recomputeNonZeros();
+                       LibMatrixMult.matrixMult(m1, m2, ret, false);
+               }
+               else {
+                       ret.sparse = false;
+                       if(ret.getDenseBlock() == null)
+                               ret.allocateDenseBlock();
+                       NativeHelper.matrixMultDenseDense(m1.denseBlock, 
m2.denseBlock, 
+                                       ret.denseBlock, m1.getNumRows(), 
m1.getNumColumns(), m2.getNumColumns(), 1);
+                       ret.recomputeNonZeros();
+               }
+       }
+       
+       static void addBias(int _rl, int _ru, double [] outputArr, double [] 
biasArr, int K, int PQ) {
+               // double [] biasArr = _params.bias.getDenseBlock();
+               
+               int index = _rl*K*PQ;
+               for(int n = _rl; n < _ru; n++) {
+                       for(int k = 0; k < K; k++) {
+                               for(int pq = 0; pq < PQ; pq++, index++) {
+                                       outputArr[index] += biasArr[k];
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * Returns the index of cell with maximum value. This method is 
optimized for dense input
+        * 
+        * @param p output feature map height
+        * @param q output feature map width
+        * @param inputOffset offset to be used for input index
+        * @param inputArray input array
+        * @param params convolution parameters
+        * @param performReluBackward perform ReLU backward
+        * @return index of cell with maximum value
+        */
+       static int getMaxIndex(int p, int q, int inputOffset, double [] 
inputArray, ConvolutionParameters params, boolean performReluBackward) {
+               int start_index_h = params.start_indexes_h[p];
+               int end_index_h = params.end_indexes_h[p];
+               int start_index_w = params.start_indexes_w[q];
+               int end_index_w = params.end_indexes_w[q];
+               
+               int maxIndex = -1; 
+               double maxVal = -Double.MAX_VALUE;
+               
+               // Note: We do not treat pad as zero and hence we don't do:  
+               // maxVal = 0 
+               // if start_index_h < 0 || start_index_w < 0 || end_index_h >= 
params.H || end_index_w >= params.W
+               
+               // Find maxIndex
+               double currDoutVal = -1;
+               for (int h = start_index_h; h < end_index_h; h++) {
+                       for (int w = start_index_w; w < end_index_w; w++) {
+                               currDoutVal = inputArray[inputOffset +  
h*params.W + w];
+                               currDoutVal = performReluBackward && 
currDoutVal < 0 ? 0 : currDoutVal;
+                               if(maxVal < currDoutVal) {
+                                       maxIndex = inputOffset +  h*params.W + 
w;
+                                       maxVal = currDoutVal;
+                               }
+                       }
+               }
+               return maxIndex;
+       }
+       
+       /**
+        * Returns the index of cell with maximum value. This method is 
optimized for sparse input:
+        * only the values stored in row n of the sparse block are inspected.
+        * 
+        * NOTE(review): zeros that are not stored in the sparse row never compete for the maximum.
+        * With performReluBackward false, a window whose stored values are all negative therefore
+        * yields the index of a negative cell, and a non-empty row without any stored value inside
+        * the window yields -1, whereas a completely empty row yields inputOffset. Confirm that
+        * callers expect this.
+        * 
+        * @param p index along the output feature map height
+        * @param q index along the output feature map width
+        * @param inputOffset offset to be used for input index
+        * @param n row of the sparse block to scan (presumably the image index)
+        * @param c channel index; stored values belonging to other channels are skipped
+        * @param input input matrix
+        * @param params convolution parameters
+        * @param performReluBackward perform ReLU on input
+        * @return index of the cell with maximum value
+        * @throws DMLRuntimeException if error occurs
+        */
+       static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int 
c, MatrixBlock input, ConvolutionParameters params, boolean 
performReluBackward) throws DMLRuntimeException {
+               if(!input.isInSparseFormat())
+                       throw new DMLRuntimeException("Incorrect usage: Only 
sparse format supported");
+               
+               int [] tensorIndexes = new int[3];
+               
+               int start_index_h = params.start_indexes_h[p];
+               int end_index_h = params.end_indexes_h[p];
+               int start_index_w = params.start_indexes_w[q];
+               int end_index_w = params.end_indexes_w[q];
+               
+               int maxIndex = -1; 
+               double maxVal = -Double.MAX_VALUE;
+               
+               // Note: We do not treat pad as zero and hence we don't do:  
+               // maxVal = 0 
+               // if start_index_h < 0 || start_index_w < 0 || end_index_h >= 
params.H || end_index_w >= params.W
+
+               // input.isEmptyBlock() check is done by the caller
+               if( !input.sparseBlock.isEmpty(n) ) {
+                       // Find maxIndex
+                       int apos = input.sparseBlock.pos(n);
+                       int alen = input.sparseBlock.size(n);
+                       int[] aix = input.sparseBlock.indexes(n);
+                       double[] avals = input.sparseBlock.values(n);
+                       for(int j=apos; j<apos+alen; j++) {
+                               // Decompose the column index into (channel, h, w)
+                               computeTensorIndexes(aix[j], tensorIndexes, 
params.H, params.W);
+                               if(c != tensorIndexes[0])
+                                       continue;
+                               int h = tensorIndexes[1];
+                               int w = tensorIndexes[2];
+                               // Only stored values that fall inside the pooling window are candidates
+                               if(h >= start_index_h && h < end_index_h && w 
>= start_index_w && w < end_index_w) {
+                                       double val = performReluBackward && 
avals[j] < 0 ? 0 : avals[j]; 
+                                       if(maxVal < val) {
+                                               maxIndex = inputOffset +  
h*params.W + w;
+                                               maxVal = val;
+                                       }
+                               }
+                       }
+               }
+               else {
+                       // Row n stores no values at all: fall back to inputOffset
+                       maxIndex = inputOffset;
+               }
+               return maxIndex;
+       }
+       
+       // Returns the row of matrix in dense format
+       static void getRowInDenseFormat(MatrixBlock input, int n, double []  
ret) throws DMLRuntimeException {
+               if(input.getNumColumns() != ret.length) {
+                       throw new DMLRuntimeException("Invalid parameters");
+               }
+               // Use temporary array to avoid binary search
+               if(input.isInSparseFormat()) {
+                       Arrays.fill(ret, 0);
+                       if( !input.sparseBlock.isEmpty(n) ) {
+                               int apos = input.sparseBlock.pos(n);
+                               int alen = input.sparseBlock.size(n);
+                               int[] aix = input.sparseBlock.indexes(n);
+                               double[] avals = input.sparseBlock.values(n);
+                               for(int j=apos; j<apos+alen; j++)
+                                       ret[ aix[j] ] = avals[j];
+                       }
+               }
+               else {
+                       System.arraycopy(input.getDenseBlock(), 
n*input.getNumColumns(), ret, 0, input.getNumColumns());
+               }
+       }
+       
+       // 
------------------------------------------------------------------------------------------------------
+       // Since col2im always operates on the intermediate generated as part of matmult, it is not clear which operator to select a priori.
+       // Therefore, it is provided as a utility function rather than an operator (like im2col or rotate180).
+       
+       /**
+        * Performs col2im for a single image: converts input (a PQ X CRS matrix) and
+        * accumulates it into the 1 X CHW slice of image outputN in the dense block of
+        * params.output. Values are added to whatever the output already holds.
+        * 
+        * @param outputN index of the image in params.output to accumulate into
+        * @param input matrix of dimension PQ X CRS, in dense or sparse format
+        * @param params convolution parameters
+        * @throws DMLRuntimeException if the input dimensions are incorrect or the output is in sparse format
+        */
+       static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+               if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) {
+                       throw new DMLRuntimeException("Incorrect input dimensions");
+               }
+               if(params.output.isInSparseFormat()) {
+                       throw new DMLRuntimeException("Only dense output is implemented");
+               }
+               double [] outputArray = params.output.getDenseBlock();
+               
+               if(!input.isInSparseFormat()) {
+                       double [] inputArray = input.getDenseBlock();
+                       // An unallocated dense block is treated as an all-zero input: since col2im only
+                       // accumulates, there is nothing to add (previously this ended in a NullPointerException).
+                       if(inputArray != null)
+                               doCol2IMDenseInput(0, outputN, inputArray, outputArray, params);
+                       return;
+               }
+               if(input.isEmptyBlock())
+                       return;
+               
+               int [] tensorIndexes = new int[3];
+               for(int i = 0; i < input.getNumRows(); i++) {
+                       if(input.sparseBlock.isEmpty(i))
+                               continue;
+                       // Row i of the input corresponds to the output position (p, q)
+                       computeTensorIndexes(i, tensorIndexes, params.P, params.Q);
+                       int p = tensorIndexes[1];
+                       int q = tensorIndexes[2];
+                       if(tensorIndexes[0] != 0) {
+                               // Fixed: the message used to print tensorIndexes[0] and params.P without a separator
+                               throw new DMLRuntimeException("Incorrect tensor indexes: " + tensorIndexes[0] + " != 0 <" 
+                                               + p + " " + q + " " + tensorIndexes[0] + " " + params.P + " " + params.Q + ">");
+                       }
+                       
+                       int apos = input.sparseBlock.pos(i);
+                       int alen = input.sparseBlock.size(i);
+                       int[] aix = input.sparseBlock.indexes(i);
+                       double[] avals = input.sparseBlock.values(i);
+                       for(int j = apos; j < apos+alen; j++) {
+                               // Column aix[j] of the input corresponds to the filter position (c, r, s)
+                               computeTensorIndexes(aix[j], tensorIndexes, params.R, params.S);
+                               int c = tensorIndexes[0];
+                               int r = tensorIndexes[1];
+                               int s = tensorIndexes[2];
+                               int h = p*params.stride_h + r - params.pad_h;
+                               int w = q*params.stride_w + s - params.pad_w;
+                               // Contributions that fall outside the H x W image (i.e. into the padding) are dropped
+                               if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
+                                       int outIndex = outputN*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w;
+                                       outputArray[outIndex] += avals[j];
+                               }
+                       }
+               }
+       }
+       
+       // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0
+       // Or converts input: NPQ X CRS matrix and writes to N X CHW 
+       // inputN selects the block of P*Q rows that is read from inputArray, outputN selects the
+       // C*H*W slice of outputArray that is written. Values are accumulated (+=) into outputArray.
+       // Nothing in this body throws DMLRuntimeException; the throws clause is only declared.
+       private static void doCol2IMDenseInput(int inputN, int outputN, double 
[] inputArray, double [] outputArray, ConvolutionParameters params) throws 
DMLRuntimeException {
+               final int outputNOffset = outputN*params.C*params.H*params.W;
+               for (int p = 0; p < params.P; p++) {
+                       // h = p*params.stride_h + r - params.pad_h
+                       //   = r + hOffset
+                       // Based on restrictions: h >= 0 and r >= 0 and h < 
params.H and r < params.R, we get
+                       // max(0, - hOffset) <= r < min(params.R, params.H - 
hOffset)
+                       final int hOffset = p*params.stride_h - params.pad_h;
+                       final int rStart = Math.max(0, - hOffset);
+                       final int rEnd = Math.min(params.R, params.H - hOffset);
+                       for (int q = 0; q < params.Q; q++) {
+                               // Using the same logic as above on following:
+                               // w = q*params.stride_w + s - params.pad_w
+                               final int wOffset = q*params.stride_w - 
params.pad_w;
+                               final int sStart = Math.max(0, - wOffset);
+                               final int sEnd = Math.min(params.S, params.W - 
wOffset);
+                               // Start of the input row that belongs to output position (p, q)
+                               final int tempOffset = 
(inputN*params.P*params.Q + p*params.Q + q)*params.C*params.R*params.S;
+                               for (int c = 0; c < params.C; c++) {
+                                       final int outOffset = outputNOffset + 
c*params.H*params.W;
+                                       final int inputOffset = tempOffset + 
c*params.R*params.S;
+                                       // The r and s ranges are already clipped to the image, so the inner
+                                       // loops need no bounds test
+                                       for (int r = rStart; r < rEnd; r++) {
+                                               for (int s = sStart; s < sEnd; 
s++) {
+                                                       int inputIndex = 
inputOffset + r*params.S + s;
+                                                       int outIndex = 
outOffset + (hOffset + r)*params.W + wOffset + s;
+                                                       outputArray[outIndex] 
+= inputArray[inputIndex];
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
new file mode 100644
index 0000000..9ae39bf
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.Arrays;
+
+/**
+ * This class contains the different implementation of im2col operation
+ */
+public class LibMatrixDNNIm2ColHelper {
+       
+       static interface Im2colWorker {
+               public void execute(int n);
+               public void execute(int n, int c);
+               public static Im2colWorker getWorker(MatrixBlock input, 
MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean allChannels) {
+                       if(im2ColOutBlock.isInSparseFormat() || 
im2ColOutBlock.getDenseBlock() == null)
+                               throw new RuntimeException("im2col output is 
always in dense format");
+                       if(allChannels) {
+                               if(!input.isInSparseFormat()) {
+                                       if (params.stride_h == 1 && 
params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) 
+                                               return new 
DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), 
im2ColOutBlock.getDenseBlock(), params);
+                                       else
+                                               return new 
DenseIm2colWorkerAllChannels(input.getDenseBlock(), 
im2ColOutBlock.getDenseBlock(), params);
+                               }
+                               else 
+                                       return new 
SparseIm2colWorkerAllChannels(input, im2ColOutBlock, params);
+                       }
+                       else {
+                               if(!input.isInSparseFormat()) {
+                                       if (params.stride_h == 1 && 
params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) 
+                                               return new 
DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), 
im2ColOutBlock.getDenseBlock(), params);
+                                       else
+                                               return new 
DenseIm2colWorker(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), 
params);
+                               }
+                               else 
+                                       return new SparseIm2colWorker(input, 
im2ColOutBlock, params);
+                       }
+               }
+       }
+       
+       /**
+        * Dense im2col for a single input channel, specialised for stride = [1, 1] and
+        * pad = [0, 0]. Each output row of length Q is copied from one input row with
+        * System.arraycopy.
+        */
+       static class DenseIm2colWorkerStride1Pad0 implements Im2colWorker {
+               double [] inputArray; double [] outputArray; 
+               int CRS; int S; int R; int P; int Q; int CHW; int H; int W;
+
+               public DenseIm2colWorkerStride1Pad0(double [] inputArray, double [] outputArray,
+                               ConvolutionParameters params) {
+                       this.inputArray = inputArray;
+                       this.outputArray = outputArray;
+                       this.H = params.H; this.W = params.W;
+                       this.R = params.R; this.S = params.S;
+                       this.P = params.P; this.Q = params.Q;
+                       this.CRS = params.C * params.R * params.S;
+                       this.CHW = params.C*params.H*params.W;
+               }
+
+               /** Unsupported: this worker handles one channel per call. */
+               @Override
+               public void execute(int n) {
+                       throw new RuntimeException("Not supported");
+               }
+
+               /**
+                * Fills the output, indexed as [R*S][P][Q], from channel cInput of input row n.
+                */
+               @Override
+               public void execute(int n, int cInput) {
+                       final int imageStart = n * CHW;
+                       final int kernelSize = R*S;
+                       for (int k = 0; k < kernelSize; k++) {
+                               final int kernelCol = k % S;
+                               final int kernelRow = k / S;
+                               for (int p = 0; p < P; p++) {
+                                       final int srcRow = p + kernelRow;
+                                       final int dst = (k * P + p) * Q;
+                                       final int src = imageStart + (cInput * H + srcRow) * W;
+                                       System.arraycopy(inputArray, src + kernelCol, outputArray, dst, Q);
+                                       // The last column is written once more with an explicit bounds
+                                       // check; a position outside the input becomes zero.
+                                       final int lastCol = Q - 1;
+                                       final int lastSrcCol = lastCol + kernelCol;
+                                       outputArray[dst + lastCol] =
+                                                       (srcRow < H && lastSrcCol < W) ? inputArray[src + lastSrcCol] : 0;
+                               }
+                       }
+               }
+       }
+
+       
+               
+       /**
+        * Dense im2col over all input channels, specialised for stride = [1, 1] and
+        * pad = [0, 0]. Each output row of length Q is copied from one input row with
+        * System.arraycopy.
+        */
+       static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker {
+               double [] inputArray; double [] outputArray; 
+               int CRS; int S; int R; int P; int Q; int CHW; int H; int W;
+
+               public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray,
+                               ConvolutionParameters params) {
+                       this.inputArray = inputArray;
+                       this.outputArray = outputArray;
+                       this.H = params.H; this.W = params.W;
+                       this.R = params.R; this.S = params.S;
+                       this.P = params.P; this.Q = params.Q;
+                       this.CRS = params.C * params.R * params.S;
+                       this.CHW = params.C*params.H*params.W;
+               }
+
+               /** Unsupported: this worker always processes every channel. */
+               @Override
+               public void execute(int n, int c) {
+                       throw new RuntimeException("Not supported");
+               }
+
+               /**
+                * Fills the output, indexed as [C*R*S][P][Q], from input row n.
+                */
+               @Override
+               public void execute(int n) {
+                       final int imageStart = n * CHW;
+                       for (int crs = 0; crs < CRS; crs++) {
+                               // Decompose the flat index into (channel, kernel row, kernel column).
+                               final int kernelCol = crs % S;
+                               final int kernelRow = (crs / S) % R;
+                               final int channel = crs / R / S;
+                               for (int p = 0; p < P; p++) {
+                                       final int srcRow = p + kernelRow;
+                                       final int dst = (crs * P + p) * Q;
+                                       final int src = imageStart + (channel * H + srcRow) * W;
+                                       System.arraycopy(inputArray, src + kernelCol, outputArray, dst, Q);
+                                       // The last column is written once more with an explicit bounds
+                                       // check; a position outside the input becomes zero.
+                                       final int lastCol = Q - 1;
+                                       final int lastSrcCol = lastCol + kernelCol;
+                                       outputArray[dst + lastCol] =
+                                                       (srcRow < H && lastSrcCol < W) ? inputArray[src + lastSrcCol] : 0;
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * Dense im2col for a single input channel with arbitrary stride and padding.
+        * Positions that fall outside the input are written as zero.
+        */
+       static class DenseIm2colWorker implements Im2colWorker {
+               double [] inputArray; double [] outputArray; 
+               int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
+               int stride_h; int stride_w; int pad_h; int pad_w;
+
+               public DenseIm2colWorker(double [] inputArray, double [] outputArray,
+                               ConvolutionParameters params) {
+                       this.inputArray = inputArray;
+                       this.outputArray = outputArray;
+                       this.H = params.H; this.W = params.W;
+                       this.R = params.R; this.S = params.S;
+                       this.P = params.P; this.Q = params.Q;
+                       this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+                       this.CRS = params.C * params.R * params.S;
+                       this.CHW = params.C*params.H*params.W;
+               }
+
+               /** Unsupported: this worker handles one channel per call. */
+               @Override
+               public void execute(int n) {
+                       throw new RuntimeException("Not supported");
+               }
+
+               /**
+                * Fills the output, indexed as [R*S][P][Q], from channel cInput of input row n.
+                */
+               @Override
+               public void execute(int n, int cInput) {
+                       final int imageStart = n * CHW;
+                       final int kernelSize = R*S;
+                       for (int k = 0; k < kernelSize; k++) {
+                               final int kernelCol = k % S;
+                               final int kernelRow = k / S;
+                               for (int p = 0; p < P; p++) {
+                                       final int dst = (k * P + p) * Q;
+                                       final int srcRow = p * stride_h - pad_h + kernelRow;
+                                       if (srcRow < 0 || srcRow >= H) {
+                                               // The whole row lies in the vertical padding.
+                                               Arrays.fill(outputArray, dst, dst + Q, 0);
+                                               continue;
+                                       }
+                                       final int src = imageStart + (cInput * H + srcRow) * W;
+                                       for (int q = 0; q < Q; q++) {
+                                               final int srcCol = q * stride_w - pad_w + kernelCol;
+                                               final boolean inside = srcCol >= 0 && srcCol < W;
+                                               outputArray[dst + q] = inside ? inputArray[src + srcCol] : 0;
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * Dense im2col over all input channels with arbitrary stride and padding.
+        * Positions that fall outside the input are written as zero.
+        */
+       static class DenseIm2colWorkerAllChannels implements Im2colWorker {
+               double [] inputArray; double [] outputArray; 
+               int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
+               int stride_h; int stride_w; int pad_h; int pad_w;
+
+               public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray,
+                               ConvolutionParameters params) {
+                       this.inputArray = inputArray;
+                       this.outputArray = outputArray;
+                       this.H = params.H; this.W = params.W;
+                       this.R = params.R; this.S = params.S;
+                       this.P = params.P; this.Q = params.Q;
+                       this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+                       this.CRS = params.C * params.R * params.S;
+                       this.CHW = params.C*params.H*params.W;
+               }
+
+               /** Unsupported: this worker always processes every channel. */
+               @Override
+               public void execute(int n, int c) {
+                       throw new RuntimeException("Not supported");
+               }
+
+               /**
+                * Fills the output, indexed as [C*R*S][P][Q], from input row n.
+                */
+               @Override
+               public void execute(int n) {
+                       final int imageStart = n * CHW;
+                       for (int crs = 0; crs < CRS; crs++) {
+                               // Decompose the flat index into (channel, kernel row, kernel column).
+                               final int kernelCol = crs % S;
+                               final int kernelRow = (crs / S) % R;
+                               final int channel = crs / R / S;
+                               for (int p = 0; p < P; p++) {
+                                       final int dst = (crs * P + p) * Q;
+                                       final int srcRow = p * stride_h - pad_h + kernelRow;
+                                       if (srcRow < 0 || srcRow >= H) {
+                                               // The whole row lies in the vertical padding.
+                                               Arrays.fill(outputArray, dst, dst + Q, 0);
+                                               continue;
+                                       }
+                                       final int src = imageStart + (channel * H + srcRow) * W;
+                                       for (int q = 0; q < Q; q++) {
+                                               final int srcCol = q * stride_w - pad_w + kernelCol;
+                                               final boolean inside = srcCol >= 0 && srcCol < W;
+                                               outputArray[dst + q] = inside ? inputArray[src + srcCol] : 0;
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * im2col over all input channels for a sparse input block (general stride and
+        * padding). The requested input row is first expanded into a dense scratch array,
+        * and the dense output is then filled from that array.
+        */
+       static class SparseIm2colWorkerAllChannels implements Im2colWorker {
+               MatrixBlock input; double [] outputArray; 
+               int CRS; int S; int R; int P; int Q; int H; int W; 
+               int stride_h; int stride_w; int pad_h; int pad_w; double [] temp;
+
+               public SparseIm2colWorkerAllChannels(MatrixBlock input, MatrixBlock im2ColOutBlock,
+                               ConvolutionParameters params) {
+                       this.input = input;
+                       this.outputArray = im2ColOutBlock.getDenseBlock();
+                       this.CRS = params.C * params.R * params.S;
+                       this.H = params.H; this.W = params.W;
+                       this.R = params.R; this.S = params.S;
+                       this.P = params.P; this.Q = params.Q;
+                       this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+                       // Scratch buffer holding one input row in dense form.
+                       temp = new double[input.getNumColumns()];
+               }
+
+               /** Unsupported: this worker always processes every channel. */
+               @Override
+               public void execute(int n, int c) {
+                       throw new RuntimeException("Not supported");
+               }
+
+               /**
+                * Fills the output, indexed as [C*R*S][P][Q], from input row n.
+                */
+               @Override
+               public void execute(int n) {
+                       // Reading from a dense copy of the row avoids a binary search per
+                       // getValue call. Since the access pattern depends on
+                       // ConvolutionParameters, this serves as a temporary fix.
+                       fillTemp(input, n);
+                       for (int crs = 0; crs < CRS; crs++) {
+                               // Decompose the flat index into (channel, kernel row, kernel column).
+                               final int kernelCol = crs % S;
+                               final int kernelRow = (crs / S) % R;
+                               final int channel = crs / R / S;
+                               for (int p = 0; p < P; p++) {
+                                       final int dst = (crs * P + p) * Q;
+                                       final int srcRow = p * stride_h - pad_h + kernelRow;
+                                       if (srcRow < 0 || srcRow >= H) {
+                                               // The whole row lies in the vertical padding.
+                                               Arrays.fill(outputArray, dst, dst + Q, 0);
+                                               continue;
+                                       }
+                                       final int src = (channel * H + srcRow) * W;
+                                       for (int q = 0; q < Q; q++) {
+                                               final int srcCol = q * stride_w - pad_w + kernelCol;
+                                               final boolean inside = srcCol >= 0 && srcCol < W;
+                                               outputArray[dst + q] = inside ? temp[src + srcCol] : 0;
+                                       }
+                               }
+                       }
+               }
+
+               /**
+                * Copies row n of the given block into the scratch array in dense form.
+                *
+                * @throws RuntimeException if the block's column count differs from the scratch length
+                */
+               private void fillTemp(MatrixBlock input, int n) {
+                       if(input.getNumColumns() != temp.length) {
+                               throw new RuntimeException("Invalid parameters");
+                       }
+                       if(!input.isInSparseFormat()) {
+                               System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns());
+                               return;
+                       }
+                       // Sparse row: clear the buffer, then scatter the non-zero values.
+                       Arrays.fill(temp, 0);
+                       if( input.sparseBlock.isEmpty(n) )
+                               return;
+                       int start = input.sparseBlock.pos(n);
+                       int len = input.sparseBlock.size(n);
+                       int[] colIdx = input.sparseBlock.indexes(n);
+                       double[] vals = input.sparseBlock.values(n);
+                       final int end = start + len;
+                       for(int k = start; k < end; k++)
+                               temp[ colIdx[k] ] = vals[k];
+               }
+       }
+       
+       /**
+        * im2col for a single input channel of a sparse input block (general stride and
+        * padding). The requested input row is first expanded into a dense scratch array,
+        * and the dense output is then filled from that array.
+        */
+       static class SparseIm2colWorker implements Im2colWorker {
+               MatrixBlock input; double [] outputArray; 
+               int CRS; int S; int R; int P; int Q; int H; int W; 
+               int stride_h; int stride_w; int pad_h; int pad_w; double [] temp;
+
+               public SparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock,
+                               ConvolutionParameters params) {
+                       this.input = input;
+                       this.outputArray = im2ColOutBlock.getDenseBlock();
+                       this.CRS = params.C * params.R * params.S;
+                       this.H = params.H; this.W = params.W;
+                       this.R = params.R; this.S = params.S;
+                       this.P = params.P; this.Q = params.Q;
+                       this.stride_h = params.stride_h; this.stride_w = params.stride_w;
+                       this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+                       // Scratch buffer holding one input row in dense form.
+                       temp = new double[input.getNumColumns()];
+               }
+
+               /** Unsupported: this worker handles one channel per call. */
+               @Override
+               public void execute(int n) {
+                       throw new RuntimeException("Not supported");
+               }
+
+               /**
+                * Fills the output, indexed as [R*S][P][Q], from channel cInput of input row n.
+                */
+               @Override
+               public void execute(int n, int cInput) {
+                       // Reading from a dense copy of the row avoids a binary search per
+                       // getValue call. Since the access pattern depends on
+                       // ConvolutionParameters, this serves as a temporary fix.
+                       fillTemp(input, n);
+                       final int kernelSize = R*S;
+                       for (int k = 0; k < kernelSize; k++) {
+                               final int kernelCol = k % S;
+                               final int kernelRow = k / S;
+                               for (int p = 0; p < P; p++) {
+                                       final int dst = (k * P + p) * Q;
+                                       final int srcRow = p * stride_h - pad_h + kernelRow;
+                                       if (srcRow < 0 || srcRow >= H) {
+                                               // The whole row lies in the vertical padding.
+                                               Arrays.fill(outputArray, dst, dst + Q, 0);
+                                               continue;
+                                       }
+                                       final int src = (cInput * H + srcRow) * W;
+                                       for (int q = 0; q < Q; q++) {
+                                               final int srcCol = q * stride_w - pad_w + kernelCol;
+                                               final boolean inside = srcCol >= 0 && srcCol < W;
+                                               outputArray[dst + q] = inside ? temp[src + srcCol] : 0;
+                                       }
+                               }
+                       }
+               }
+
+               /**
+                * Copies row n of the given block into the scratch array in dense form.
+                *
+                * @throws RuntimeException if the block's column count differs from the scratch length
+                */
+               private void fillTemp(MatrixBlock input, int n) {
+                       if(input.getNumColumns() != temp.length) {
+                               throw new RuntimeException("Invalid parameters");
+                       }
+                       if(!input.isInSparseFormat()) {
+                               System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns());
+                               return;
+                       }
+                       // Sparse row: clear the buffer, then scatter the non-zero values.
+                       Arrays.fill(temp, 0);
+                       if( input.sparseBlock.isEmpty(n) )
+                               return;
+                       int start = input.sparseBlock.pos(n);
+                       int len = input.sparseBlock.size(n);
+                       int[] colIdx = input.sparseBlock.indexes(n);
+                       double[] vals = input.sparseBlock.values(n);
+                       final int end = start + len;
+                       for(int k = start; k < end; k++)
+                               temp[ colIdx[k] ] = vals[k];
+               }
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
new file mode 100644
index 0000000..b400105
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingBackwardHelper.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.concurrent.Callable;
+
+/**
+ * This class contains the set of operators used for performing pooling 
backward
+ */
+public class LibMatrixDNNPoolingBackwardHelper {
+       /**
+        * Performs the maxpooling backward operation for dense input and dense 
error (dout)
+        */
+       public static class PoolingBackwardDenseDense implements Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               double [] outputArray; boolean performReluBackward;
+               double [] inputArray; double [] doutArray;
+               int C; int CHW; int P; int Q; int HW; int CPQ; int PQ;
+               public PoolingBackwardDenseDense(int rl, int ru, 
ConvolutionParameters params, boolean performReluBackward) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       this.performReluBackward = performReluBackward;
+                       inputArray = params.input1.getDenseBlock();
+                       doutArray = params.input2.getDenseBlock();
+                       outputArray = params.output.getDenseBlock();
+                       C = params.C; CHW = params.C*params.H*params.W; HW = 
params.H*params.W;
+                       P = params.P; Q = params.Q; CPQ = 
params.C*params.P*params.Q;
+                       PQ = params.P*params.Q;
+                       if (inputArray == null || doutArray == null || 
outputArray == null )
+                               throw new RuntimeException("Incorrect usage: 
empty inputs");
+               }
+               
+               @Override
+               public Long call() throws Exception {
+                       for(int n = _rl; n < _ru; n++)  {
+                               for (int c = 0; c < C; c++) {
+                                       final int inputOffset = n*CHW + c*HW;
+                                       final int outputOffset = n*CPQ + c*PQ;
+                                       for (int p = 0; p < P; p++) {
+                                               for (int q = 0; q < Q; q++) {
+                                                       int maxIndex = 
LibMatrixDNNHelper.getMaxIndex(p, q, inputOffset, inputArray, _params, 
performReluBackward);
+                                                       if(maxIndex != -1)
+                                                               
outputArray[maxIndex] += doutArray[outputOffset +  p * Q + q];
+                                               }
+                                       }
+                               }
+                       }
+                       return 0L;
+               }
+       }
+       
+       /**
+        * Maxpooling backward task for a dense input and a sparse error (dout), covering
+        * the input rows in [rl, ru). Only the non-zero dout entries are visited.
+        */
+       public static class PoolingBackwardDenseSparse implements Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               double [] outputArray; boolean performReluBackward;
+               double [] inputArray;  MatrixBlock dout;
+               int C; int CHW; int P; int Q; int HW;
+
+               /**
+                * @throws RuntimeException if input1 or output has no dense array, or if
+                *         input2 (dout) is not in sparse format
+                */
+               public PoolingBackwardDenseSparse(int rl, int ru, ConvolutionParameters params,
+                               boolean performReluBackward) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       this.performReluBackward = performReluBackward;
+                       inputArray = params.input1.getDenseBlock();
+                       dout = params.input2;
+                       outputArray = params.output.getDenseBlock();
+                       C = params.C; P = params.P; Q = params.Q;
+                       HW = params.H*params.W;
+                       CHW = params.C*params.H*params.W;
+                       if (inputArray == null || outputArray == null )
+                               throw new RuntimeException("Incorrect usage: empty inputs");
+                       if (!params.input2.isInSparseFormat())
+                               throw new RuntimeException("Incorrect usage: Call optimized versions");
+               }
+
+               /**
+                * For every non-zero dout entry, converts its column index into (c, p, q) via
+                * LibMatrixDNNHelper.computeTensorIndexes and adds the value to the output cell
+                * at the index reported by LibMatrixDNNHelper.getMaxIndex; -1 is skipped.
+                *
+                * @return always 0
+                */
+               @Override
+               public Long call() throws Exception {
+                       for(int n = _rl; n < _ru; n++)  {
+                               if( dout.sparseBlock.isEmpty(n) )
+                                       continue;
+                               // Receives {c, p, q} for each visited entry of this row.
+                               int [] cpq = new int[3];
+                               int start = dout.sparseBlock.pos(n);
+                               int len = dout.sparseBlock.size(n);
+                               int[] colIdx = dout.sparseBlock.indexes(n);
+                               double[] vals = dout.sparseBlock.values(n);
+                               final int imageStart = n*CHW;
+                               for(int k = start; k < start+len; k++) {
+                                       LibMatrixDNNHelper.computeTensorIndexes(colIdx[k], cpq, P, Q);
+                                       final int inputOffset = imageStart + cpq[0]*HW;
+                                       final int target = LibMatrixDNNHelper.getMaxIndex(cpq[1], cpq[2], inputOffset,
+                                                       inputArray, _params, performReluBackward);
+                                       if(target != -1)
+                                               outputArray[target] += vals[k];
+                               }
+                       }
+                       return 0L;
+               }
+       }
+       
+       /**
+        * Maxpooling backward task for a sparse input and a dense error (dout), covering
+        * the input rows in [rl, ru). Zero dout entries are skipped.
+        */
+       public static class PoolingBackwardSparseDense implements Callable<Long> 
+       {
+               public int _rl; public int _ru; 
+               private final ConvolutionParameters _params; 
+               double [] outputArray; boolean performReluBackward;
+               double [] doutArray;
+               int C; int CHW; int P; int Q; int HW; int CPQ; int PQ;
+
+               /**
+                * @throws RuntimeException if input2 (dout) or output has no dense array, or
+                *         if input1 is not in sparse format
+                */
+               public PoolingBackwardSparseDense(int rl, int ru, ConvolutionParameters params,
+                               boolean performReluBackward) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       this.performReluBackward = performReluBackward;
+                       doutArray = params.input2.getDenseBlock();
+                       outputArray = params.output.getDenseBlock();
+                       C = params.C; P = params.P; Q = params.Q;
+                       HW = params.H*params.W;
+                       CHW = params.C*params.H*params.W;
+                       PQ = params.P*params.Q;
+                       CPQ = params.C*params.P*params.Q;
+                       if (doutArray == null || outputArray == null )
+                               throw new RuntimeException("Incorrect usage: empty inputs");
+                       if (!params.input1.isInSparseFormat())
+                               throw new RuntimeException("Incorrect usage: Call optimized versions");
+               }
+
+               /**
+                * For every (n, c, p, q) whose dout value is non-zero, adds that value to the
+                * output cell at the index reported by LibMatrixDNNHelper.getMaxIndexSparse;
+                * an index of -1 is skipped.
+                *
+                * @return always 0
+                */
+               @Override
+               public Long call() throws Exception {
+                       for(int n = _rl; n < _ru; n++)  {
+                               for (int c = 0; c < C; c++) {
+                                       final int inputOffset = n*CHW + c*HW;
+                                       for (int p = 0; p < P; p++) {
+                                               for (int q = 0; q < Q; q++) {
+                                                       final double grad = doutArray[n*CPQ + c*PQ +  p * Q + q];
+                                                       if(grad == 0)
+                                                               continue;
+                                                       final int target = LibMatrixDNNHelper.getMaxIndexSparse(p, q, inputOffset,
+                                                                       n, c, _params.input1, _params, performReluBackward);
+                                                       if(target != -1)
+                                                               outputArray[target] += grad;
+                                               }
+                                       }
+                               }
+                       }
+                       return 0L;
+               }
+       }
+       
+       /**
+        * Maxpooling backward operation for the case where both the input
+        * and the error (dout) are in sparse format.
+        */
+       public static class PoolingBackwardSparseSparse implements Callable<Long>
+       {
+               // Range [_rl, _ru) of rows that call() iterates over.
+               public int _rl; public int _ru;
+               private final ConvolutionParameters _params;
+               // Dense array of params.output; call() accumulates into it.
+               double [] outputArray; boolean performReluBackward;
+               int C; int CHW; int P; int Q; int HW;
+
+               /**
+                * Captures the output array and dimension products needed by call().
+                *
+                * @param rl first row to process (inclusive)
+                * @param ru last row to process (exclusive)
+                * @param params convolution parameters; input1 and input2 must be in sparse
+                *               format and output must have an allocated dense block
+                * @param performReluBackward flag forwarded unchanged to getMaxIndexSparse
+                * @throws RuntimeException if the output dense block is missing or an input is not sparse
+                */
+               public PoolingBackwardSparseSparse(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       this.performReluBackward = performReluBackward;
+                       outputArray = params.output.getDenseBlock();
+                       C = params.C; P = params.P; Q = params.Q;
+                       HW = params.H*params.W;
+                       CHW = params.C*params.H*params.W;
+                       if (outputArray == null )
+                               throw new RuntimeException("Incorrect usage: empty outputs");
+                       if (!(params.input1.isInSparseFormat() && params.input2.isInSparseFormat()))
+                               throw new RuntimeException("Incorrect usage: Call optimized versions");
+               }
+
+               /**
+                * Walks the stored entries of each dout row, resolves each one to (c, p, q)
+                * via computeTensorIndexes, and adds its value to the output array at the
+                * index returned by getMaxIndexSparse (skipped when that index is -1).
+                *
+                * @return always 0
+                */
+               @Override
+               public Long call() throws Exception {
+                       for (int n = _rl; n < _ru; n++) {
+                               if (_params.input2.sparseBlock.isEmpty(n))
+                                       continue; // no stored dout entries in this row
+                               // Filled by computeTensorIndexes; slots are read as (c, p, q).
+                               final int[] cpq = new int[3];
+                               final int begin = _params.input2.sparseBlock.pos(n);
+                               final int count = _params.input2.sparseBlock.size(n);
+                               final int[] colIndexes = _params.input2.sparseBlock.indexes(n);
+                               final double[] gradients = _params.input2.sparseBlock.values(n);
+                               for (int j = begin; j < begin+count; j++) {
+                                       LibMatrixDNNHelper.computeTensorIndexes(colIndexes[j], cpq, P, Q);
+                                       final int c = cpq[0];
+                                       final int p = cpq[1];
+                                       final int q = cpq[2];
+                                       final int target = LibMatrixDNNHelper.getMaxIndexSparse(p, q, n*CHW + c*HW, n, c, _params.input1, _params, performReluBackward);
+                                       if (target != -1)
+                                               outputArray[target] += gradients[j];
+                               }
+                       }
+                       return 0L;
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
new file mode 100644
index 0000000..c6aaee2
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.Arrays;
+import java.util.concurrent.Callable;
+
+/**
+ * This class contains the set of operators used for performing pooling
+ */
+public class LibMatrixDNNPoolingHelper {
+
+       /**
+        * Performs the dense maxpooling
+        */
+       public static class DenseMaxPooling implements Callable<Long>
+       {
+               // Range [_rl, _ru) of values of n that call() iterates over.
+               public int _rl; public int _ru;
+               private final ConvolutionParameters _params;
+               // Dense arrays of params.input1 and params.output.
+               double [] inputArray; double [] outputArray;
+               int C; int P; int Q; int W;
+
+               /**
+                * @param rl first value of n to process (inclusive)
+                * @param ru last value of n to process (exclusive)
+                * @param params convolution parameters; the dense blocks of input1 and output are used
+                */
+               public DenseMaxPooling(int rl, int ru, ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       inputArray = params.input1.getDenseBlock();
+                       outputArray = params.output.getDenseBlock();
+                       C = params.C; P = params.P; Q = params.Q; W = params.W;
+               }
+
+               /**
+                * For every output cell (n, c, p, q), folds the input values of the window
+                * [start_indexes_h[p], end_indexes_h[p]) x [start_indexes_w[q], end_indexes_w[q])
+                * into the cell with Math.max. The cell's existing content takes part in the
+                * maximum, so the result depends on how the output array was initialized
+                * (presumably by the caller; confirm).
+                *
+                * @return always 0
+                */
+               @Override
+               public Long call() throws Exception {
+                       final int HW = _params.H*_params.W;
+                       final int CHW = _params.C*_params.H*_params.W;
+                       final int CPQ = C*P*Q;
+                       for(int n = _rl; n < _ru; n++)  {
+                               final int inOffset = n*CHW;
+                               // Output cells are visited in (c, p, q) order, so a running index suffices.
+                               int out_index = n*CPQ;
+                               for (int c = 0; c < C; c++) {
+                                       final int inOffset1 = inOffset + c*HW;
+                                       for (int p = 0; p < P; p++) {
+                                               for (int q = 0; q < Q; q++, out_index++) {
+                                                       for (int h = _params.start_indexes_h[p]; h < _params.end_indexes_h[p]; h++) {
+                                                               for (int w = _params.start_indexes_w[q]; w < _params.end_indexes_w[q]; w++) {
+                                                                       outputArray[out_index] = Math.max(outputArray[out_index], inputArray[inOffset1 +  h*W + w]);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       return 0L;
+               }
+       }
+
+       /**
+        * Performs the sparse maxpooling
+        */
+       public static class SparseMaxPooling implements Callable<Long>
+       {
+               // Range [_rl, _ru) of rows that call() iterates over.
+               public int _rl; public int _ru;
+               private final ConvolutionParameters _params;
+               int HW;
+               // Dense array of params.output.
+               double [] outputArray;
+               int C; int P; int Q; int W;
+
+               /**
+                * @param rl first row to process (inclusive)
+                * @param ru last row to process (exclusive)
+                * @param params convolution parameters; input1 is read through its sparse block,
+                *               output through its dense block
+                */
+               public SparseMaxPooling(int rl, int ru, ConvolutionParameters params) {
+                       _rl = rl; _ru = ru;
+                       _params = params;
+                       outputArray = params.output.getDenseBlock();
+                       C = params.C; P = params.P; Q = params.Q; W = params.W;
+                       HW = _params.H*_params.W;
+               }
+
+               // Cursor over the sparse row currently being processed; set by getNthSparseRow.
+               boolean isNthRowEmpty = false;
+               int apos; int alen; int[] aix; double[] avals;
+
+               /**
+                * Loads position, length, column indexes and values of row n, or marks the
+                * row as empty. A missing sparse block is treated as "all rows empty"
+                * instead of failing with a NullPointerException.
+                */
+               private void getNthSparseRow(int n) {
+                       if( _params.input1.sparseBlock != null && !_params.input1.sparseBlock.isEmpty(n) ) {
+                               apos = _params.input1.sparseBlock.pos(n);
+                               alen = _params.input1.sparseBlock.size(n);
+                               aix = _params.input1.sparseBlock.indexes(n);
+                               avals = _params.input1.sparseBlock.values(n);
+                               isNthRowEmpty = false;
+                       }
+                       else
+                               isNthRowEmpty = true;
+               }
+
+               // Positions in aix bounding the entries of the current channel:
+               // fromIndex is inclusive, toIndex is exclusive.
+               int fromIndex = -1; // as per C
+               int toIndex = -1; // as per C
+
+               /**
+                * Returns the first position j in [from, apos+alen) whose column index is
+                * greater than or equal to searchVal, or apos+alen if there is none.
+                * The previous version returned the position before the first index
+                * strictly greater than searchVal; used as an exclusive upper bound, that
+                * could cut the last entry of a channel out of the search range.
+                */
+               private int setSearchIndex(int from, int searchVal) {
+                       final int end = apos+alen;
+                       for(int j = from; j < end; j++) {
+                               if(aix[j] >= searchVal)
+                                       return j;
+                       }
+                       return end;
+               }
+
+               /**
+                * Returns the value stored at column col of the current row, or 0 if the
+                * row is empty or col is not stored within [fromIndex, toIndex).
+                */
+               private double getValue(int col) {
+                       if( isNthRowEmpty )
+                               return 0;
+                       int index = Arrays.binarySearch(aix, fromIndex, toIndex, col);
+                       // binarySearch returns a non-negative position on a hit, and position 0 is a valid hit.
+                       return index >= 0 ? avals[index] : 0;
+               }
+
+               /**
+                * For every output cell (n, c, p, q), folds the input values of the window
+                * [start_indexes_h[p], end_indexes_h[p]) x [start_indexes_w[q], end_indexes_w[q])
+                * into the cell with Math.max; columns not stored in the sparse row count as 0.
+                * The cell's existing content takes part in the maximum, so the result depends
+                * on how the output array was initialized (presumably by the caller; confirm).
+                *
+                * @return always 0
+                */
+               @Override
+               public Long call() throws Exception {
+                       final int CPQ = C*P*Q;
+                       for(int n = _rl; n < _ru; n++)  {
+                               getNthSparseRow(n);
+                               int out_index = n*CPQ;
+                               // Before channel 0 the range is empty and starts at the row start.
+                               if( !isNthRowEmpty )
+                                       toIndex = apos;
+                               for (int c = 0; c < C; c++) {
+                                       if( !isNthRowEmpty ) {
+                                               // This allows for binary search in getValue to be more efficient.
+                                               // Column indexes are sorted, so channel c starts where channel c-1
+                                               // ended and ends at the first index >= (c+1)*HW.
+                                               fromIndex = toIndex;
+                                               toIndex = setSearchIndex(fromIndex, (c+1)*HW);
+                                       }
+                                       for (int p = 0; p < P; p++) {
+                                               for (int q = 0; q < Q; q++, out_index++) {
+                                                       for (int h = _params.start_indexes_h[p]; h < _params.end_indexes_h[p]; h++) {
+                                                               for (int w = _params.start_indexes_w[q]; w < _params.end_indexes_w[q]; w++) {
+                                                                       outputArray[out_index] = Math.max(outputArray[out_index], getValue(c*HW +  h*W + w));
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       return 0L;
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/19eed8f3/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
new file mode 100644
index 0000000..c003756
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.Arrays;
+
+/**
+ * Implementations of the rotate180 operation for dense and sparse inputs.
+ */
+public class LibMatrixDNNRotate180Helper {
+
+       /**
+        * Copies one input row into one row of a dense output array in rotated layout.
+        */
+       static interface Rotate180Worker {
+               /**
+                * @param inputN row of the input to read
+                * @param outputN row of the output array to write
+                */
+               public void execute(int inputN, int outputN);
+
+               /**
+                * Picks the worker matching the storage format of the input: the sparse
+                * worker if the input is in sparse format, the dense worker otherwise.
+                */
+               public static Rotate180Worker getWorker(MatrixBlock input, double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) {
+                       return input.isInSparseFormat()
+                               ? new SparseRotate180Worker(input, outputArray, params, zeroOutSparseOutput)
+                               : new DenseRotate180Worker(input, outputArray, params);
+               }
+       }
+
+       /**
+        * Rotate180 for an input with a dense block (general case).
+        */
+       static class DenseRotate180Worker implements Rotate180Worker {
+
+               double [] inputArray; double [] outputArray;
+               ConvolutionParameters params;
+
+               /**
+                * @throws RuntimeException if the input has no dense block or outputArray is null
+                */
+               public DenseRotate180Worker(MatrixBlock input, double [] outputArray,  ConvolutionParameters params) {
+                       this.params = params;
+                       this.outputArray = outputArray;
+                       this.inputArray = input.getDenseBlock();
+                       if(this.inputArray == null || outputArray == null)
+                               throw new RuntimeException("Incorrect usage: empty inputs");
+               }
+
+               /**
+                * Writes element (k, p, q) of input row inputN, read at k*P*Q + p*Q + q,
+                * to position p*Q*K + q*K + k of output row outputN.
+                */
+               @Override
+               public void execute(int inputN, int outputN) {
+                       final int K = params.K;
+                       final int P = params.P;
+                       final int Q = params.Q;
+                       final int inBase = inputN*K*P*Q;
+                       final int outBase = outputN*K*P*Q;
+                       for (int k = 0; k < K; k++) {
+                               final int inChannel = inBase + k*P*Q;
+                               for (int p = 0; p < P; p++) {
+                                       final int inRow = inChannel + p*Q;
+                                       final int outRow = outBase + p*Q*K;
+                                       for (int q = 0; q < Q; q++)
+                                               outputArray[outRow + q*K + k] = inputArray[inRow + q];
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Rotate180 for an input in sparse format (general case).
+        */
+       static class SparseRotate180Worker implements Rotate180Worker {
+
+               double [] outputArray;  MatrixBlock input;
+               ConvolutionParameters params; boolean zeroOutSparseOutput;
+
+               /**
+                * @param zeroOutSparseOutput if true, execute() zero-fills the output array before writing
+                * @throws RuntimeException if outputArray is null
+                */
+               public SparseRotate180Worker(MatrixBlock input, double [] outputArray,  ConvolutionParameters params, boolean zeroOutSparseOutput) {
+                       this.input = input;
+                       this.params = params;
+                       this.outputArray = outputArray;
+                       this.zeroOutSparseOutput = zeroOutSparseOutput;
+                       if(outputArray == null)
+                               throw new RuntimeException("Incorrect usage: empty inputs");
+               }
+
+               /**
+                * Writes each stored entry of input row inputN to position
+                * p*Q*K + q*K + k of output row outputN, where (k, p, q) are the values
+                * computeTensorIndexes produces for the entry's column index.
+                * Positions without a stored entry are left as they are.
+                */
+               @Override
+               public void execute(int inputN, int outputN) {
+                       // Note: this clears the whole output array, not only row outputN.
+                       if(zeroOutSparseOutput)
+                               Arrays.fill(outputArray, 0);
+
+                       if(input.isEmptyBlock() || input.sparseBlock.isEmpty(inputN))
+                               return; // nothing stored for this row
+
+                       final int K = params.K;
+                       final int Q = params.Q;
+                       final int outBase = outputN*K*params.P*Q;
+                       // Filled by computeTensorIndexes; slots are read as (k, p, q).
+                       final int[] kpq = new int[3];
+                       final int begin = input.sparseBlock.pos(inputN);
+                       final int count = input.sparseBlock.size(inputN);
+                       final int[] colIndexes = input.sparseBlock.indexes(inputN);
+                       final double[] values = input.sparseBlock.values(inputN);
+                       for(int j = begin; j < begin+count; j++) {
+                               LibMatrixDNNHelper.computeTensorIndexes(colIndexes[j], kpq, params.P, params.Q);
+                               final int k = kpq[0];
+                               final int p = kpq[1];
+                               final int q = kpq[2];
+                               outputArray[outBase + p*Q*K + q*K + k] = values[j];
+                       }
+               }
+       }
+}

[1/2] incubator-systemml git commit: [SYSTEMML-540] Refactored LibMatrixDNN to reduce instruction cache misses

Reply via email to