systemml git commit: [SYSTEMML-540] Support sparse output for CP im2col operator

niketanpansare Wed, 30 Aug 2017 10:24:45 -0700

Repository: systemml
Updated Branches:
  refs/heads/master bd1946a3d -> 4744da167



[SYSTEMML-540] Support sparse output for CP im2col operator

- We support sparse output for im2col, which improved the performance by
  4x on CPU for 1-layer CNN with input sparsity of 0.04.
- This commit also fixes the NeuralNetworkOpTests.

Closes #645.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4744da16
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4744da16
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4744da16

Branch: refs/heads/master
Commit: 4744da1670eb6e8c922394f563f360c0fa233482
Parents: bd1946a
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Wed Aug 30 10:17:04 2017 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Wed Aug 30 10:23:39 2017 -0700

----------------------------------------------------------------------
 .../instructions/gpu/context/CSRPointer.java    |   8 +-
 .../LibMatrixDNNConv2dBackwardFilterHelper.java |   1 -
 .../matrix/data/LibMatrixDNNConv2dHelper.java   |   2 -
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   | 278 ++++---
 .../org/apache/sysml/test/gpu/GPUTests.java     |  58 +-
 .../sysml/test/gpu/NeuralNetworkOpTests.java    | 733 +++++++++----------
 6 files changed, 592 insertions(+), 488 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4744da16/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index a4147a3..9379534 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -185,13 +185,19 @@ public class CSRPointer {
         * @param rowPtr integer array of row pointers
         * @param colInd integer array of column indices
         * @param values double array of non zero values
+        * @throws DMLRuntimeException if error occurs
         */
-       public static void copyToDevice(CSRPointer dest, int rows, long nnz, 
int[] rowPtr, int[] colInd, double[] values) {
+       public static void copyToDevice(CSRPointer dest, int rows, long nnz, 
int[] rowPtr, int[] colInd, double[] values) throws DMLRuntimeException {
                CSRPointer r = dest;
                long t0 = 0;
                if (DMLScript.STATISTICS)
                        t0 = System.nanoTime();
                r.nnz = nnz;
+               if(rows < 0) throw new DMLRuntimeException("Incorrect input 
parameter: rows=" + rows);
+               if(nnz < 0) throw new DMLRuntimeException("Incorrect input 
parameter: nnz=" + nnz);
+               if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The 
length of rowPtr needs to be greater than or equal to " + (rows + 1));
+               if(colInd.length < nnz) throw new DMLRuntimeException("The 
length of colInd needs to be greater than or equal to " + nnz);
+               if(values.length < nnz) throw new DMLRuntimeException("The 
length of values needs to be greater than or equal to " + nnz);
                cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 
1), cudaMemcpyHostToDevice);
                cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), 
cudaMemcpyHostToDevice);
                cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), 
cudaMemcpyHostToDevice);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4744da16/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index 560f32c..a135f62 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -84,7 +84,6 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
                        int PQ = _params.P*_params.Q; int K = _params.K; int 
CRS = _params.C*_params.R*_params.S;
                        MatrixBlock dout = _params.input2;
                        MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, 
false);
-                       im2ColOutBlock.allocateDenseBlock();
                        MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, 
false);
                        dout_reshaped.allocateDenseBlock();
                        LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = 
LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, 
im2ColOutBlock, _params, true);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4744da16/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index b2c4d67..4c3a3c3 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -48,7 +48,6 @@ public class LibMatrixDNNConv2dHelper {
                        int PQ = _params.P*_params.Q; int K = _params.K;
                        int RS = _params.R*_params.S;
                        MatrixBlock im2ColOutBlock = new MatrixBlock(RS, PQ, 
false);
-                       im2ColOutBlock.allocateDenseBlock();
                        LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = 
LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, 
im2ColOutBlock, _params, false);
                        long time1 = 0; long time2 = 0;
                        for(int n = _rl; n < _ru; n++)  {
@@ -129,7 +128,6 @@ public class LibMatrixDNNConv2dHelper {
                public Long call() throws Exception {
                        int PQ = _params.P*_params.Q; int K = _params.K; int 
CRS = _params.C*_params.R*_params.S;
                        MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, 
false);
-                       im2ColOutBlock.allocateDenseBlock();
                        LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = 
LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, 
im2ColOutBlock, _params, true);
                        long time1 = 0; long time2 = 0;
                        for(int n = _rl; n < _ru; n++)  {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4744da16/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index 9ae39bf..d427a26 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -20,38 +20,73 @@ package org.apache.sysml.runtime.matrix.data;
 
 import java.util.Arrays;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
 /**
  * This class contains the different implementation of im2col operation
  */
 public class LibMatrixDNNIm2ColHelper {
-       
+       private static final Log LOG = 
LogFactory.getLog(LibMatrixDNNIm2ColHelper.class.getName());
        static interface Im2colWorker {
                public void execute(int n);
                public void execute(int n, int c);
                public static Im2colWorker getWorker(MatrixBlock input, 
MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean allChannels) {
-                       if(im2ColOutBlock.isInSparseFormat() || 
im2ColOutBlock.getDenseBlock() == null)
-                               throw new RuntimeException("im2col output is 
always in dense format");
                        if(allChannels) {
                                if(!input.isInSparseFormat()) {
-                                       if (params.stride_h == 1 && 
params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) 
+                                       // Note: Only dense im2col operators 
require the im2ColOutBlock to be allocated in the dense format.
+                                       im2ColOutBlock.allocateDenseBlock();
+                                       if (params.stride_h == 1 && 
params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0)  {
+                                               if(LOG.isTraceEnabled()) 
LOG.trace("Using DenseIm2colWorkerStride1Pad0AllChannels operator to perform 
im2col.");
                                                return new 
DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), 
im2ColOutBlock.getDenseBlock(), params);
-                                       else
+                                       }
+                                       else {
+                                               if(LOG.isTraceEnabled()) 
LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform im2col.");
                                                return new 
DenseIm2colWorkerAllChannels(input.getDenseBlock(), 
im2ColOutBlock.getDenseBlock(), params);
+                                       }
+                               }
+                               else {
+                                       if(LOG.isTraceEnabled()) 
LOG.trace("Using SparseIm2colWorkerAllChannels operator to perform im2col.");
+                                       double sparsity = 
Math.min(MatrixBlock.SPARSITY_TURN_POINT, (input.getNonZeros()*2.0) / 
(input.getNumRows()*input.getNumColumns()));
+                                       
initializeSparseIm2ColBlock(im2ColOutBlock, 
(long)Math.ceil(params.P*params.Q*sparsity));
+                                       return new 
SparseSparseIm2colWorkerAllChannels(input, im2ColOutBlock, params);
                                }
-                               else 
-                                       return new 
SparseIm2colWorkerAllChannels(input, im2ColOutBlock, params);
                        }
                        else {
                                if(!input.isInSparseFormat()) {
-                                       if (params.stride_h == 1 && 
params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) 
+                                       // Note: Only dense im2col operators 
require the im2ColOutBlock to be allocated in the dense format.
+                                       im2ColOutBlock.allocateDenseBlock();
+                                       if (params.stride_h == 1 && 
params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) {
+                                               if(LOG.isTraceEnabled()) 
LOG.trace("Using DenseIm2colWorkerStride1Pad0 operator to perform im2col.");
                                                return new 
DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), 
im2ColOutBlock.getDenseBlock(), params);
-                                       else
+                                       }
+                                       else {
+                                               if(LOG.isTraceEnabled()) 
LOG.trace("Using DenseIm2colWorker operator to perform im2col.");
                                                return new 
DenseIm2colWorker(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), 
params);
+                                       }
+                               }
+                               else {
+                                       if(LOG.isTraceEnabled()) 
LOG.trace("Using SparseIm2colWorker operator to perform im2col.");
+                                       double sparsity = 
Math.min(MatrixBlock.SPARSITY_TURN_POINT, (input.getNonZeros()*2.0) / 
(input.getNumRows()*input.getNumColumns()));
+                                       
initializeSparseIm2ColBlock(im2ColOutBlock, 
(long)Math.ceil(params.P*params.Q*sparsity));
+                                       return new 
SparseSparseIm2colWorker(input, im2ColOutBlock, params);
                                }
-                               else 
-                                       return new SparseIm2colWorker(input, 
im2ColOutBlock, params);
                        }
                }
+               
+               static void initializeSparseIm2ColBlock(MatrixBlock 
im2ColOutBlock, long worstCaseNNPerRow) {
+                       if(worstCaseNNPerRow >= Integer.MAX_VALUE)
+                               throw new RuntimeException("The dimension of 
intermediate im2col matrix exceeded:" + worstCaseNNPerRow);
+                       // Set to sparse
+                       im2ColOutBlock.sparse = true;
+                       im2ColOutBlock.denseBlock = null;
+                       im2ColOutBlock.allocateSparseRowsBlock();
+                       
+                       for(int r = 0; r < im2ColOutBlock.getNumRows(); r++) {
+                               im2ColOutBlock.getSparseBlock().allocate(r, 
(int) worstCaseNNPerRow);
+                       }
+                       im2ColOutBlock.setNonZeros(0);
+               }
        }
        
        /**
@@ -238,20 +273,23 @@ public class LibMatrixDNNIm2ColHelper {
        }
        
        /**
-        * Performing dense im2col (general case)
+        * Performing sparse im2col for all channels for a given row n of the 
input matrix.
         */
-       static class SparseIm2colWorkerAllChannels implements Im2colWorker {
-               MatrixBlock input; double [] outputArray; 
-               int CRS; int S; int R; int P; int Q; int H; int W; 
-               int stride_h; int stride_w; int pad_h; int pad_w; double [] 
temp;
-               public SparseIm2colWorkerAllChannels(MatrixBlock input, 
MatrixBlock im2ColOutBlock, ConvolutionParameters params) {
+       static class SparseSparseIm2colWorkerAllChannels implements 
Im2colWorker {
+               MatrixBlock input;  MatrixBlock output;
+               int CRS; int S; int R; int P; int Q; int H; int W; int RS; int 
HW;
+               int stride_h; int stride_w; int pad_h; int pad_w;
+               public SparseSparseIm2colWorkerAllChannels(MatrixBlock input, 
MatrixBlock im2ColOutBlock, ConvolutionParameters params) {
                        this.input = input;
-                       this.outputArray = im2ColOutBlock.getDenseBlock();
+                       this.output = im2ColOutBlock;
                        this.CRS = params.C * params.R * params.S;
+                       this.RS = params.R * params.S;
+                       this.HW = params.H * params.W;
                        this.H = params.H; this.W = params.W; this.R = 
params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
                        this.stride_h = params.stride_h; this.stride_w = 
params.stride_w;
                        this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-                       temp = new double[input.getNumColumns()];
+                       if(!input.isInSparseFormat()) 
+                               throw new RuntimeException("Incorrect operator 
selection. Expected dense input for SparseIm2colWorkerAllChannels");
                }
                
                @Override
@@ -261,71 +299,107 @@ public class LibMatrixDNNIm2ColHelper {
 
                @Override
                public void execute(int n) {
-                       // Using a temporary array improves performance by not 
requiring binary search for getValue
-                       // Since the access pattern depends on 
ConvolutionParameters, this serves as a temporary fix.
-                       fillTemp(input, n);
-                       // final int nOffset = n * params.C*params.H*params.W;
-                       for (int c = 0; c < CRS; ++c) {
-                               int wOffset = c % S;
-                               int hOffset = (c / S) % R;
-                               int cInput = c / R / S;
-                               for (int h = 0; h < P; ++h) {
-                                       int outOffset = (c * P + h) * Q;
-                                       int hPadded = h * stride_h - pad_h + 
hOffset;
-                                       int tempOffset = (cInput * H + hPadded) 
* W;
-                                       // int inputOffset = nOffset + 
tempOffset;
-                                       if (hPadded < 0 || hPadded >= H) {
-                                               Arrays.fill(outputArray, 
outOffset, outOffset+Q, 0);
-                                       } else {
-                                               for (int w = 0; w < Q; ++w) {
-                                                       int wPadded = w * 
stride_w - pad_w + wOffset;
-                                                       if (wPadded >= 0 && 
wPadded < W) 
-                                                               
outputArray[outOffset + w] = temp[tempOffset + wPadded];
-                                                       else
-                                                               
outputArray[outOffset + w] = 0;
-                                               }
-                                       }
-                               }
-                       }
-               }
-               // Returns the row of matrix in dense format
-               private void fillTemp(MatrixBlock input, int n) {
-                       if(input.getNumColumns() != temp.length) {
-                               throw new RuntimeException("Invalid 
parameters");
-                       }
-                       // Use temporary array to avoid binary search
-                       if(input.isInSparseFormat()) {
-                               Arrays.fill(temp, 0);
-                               if( !input.sparseBlock.isEmpty(n) ) {
-                                       int apos = input.sparseBlock.pos(n);
-                                       int alen = input.sparseBlock.size(n);
-                                       int[] aix = 
input.sparseBlock.indexes(n);
-                                       double[] avals = 
input.sparseBlock.values(n);
-                                       for(int j=apos; j<apos+alen; j++)
-                                               temp[ aix[j] ] = avals[j];
+                       if( !input.sparseBlock.isEmpty(n) ) {
+                               output.sparseBlock.reset();
+                               output.setNonZeros(0);
+                               int apos = input.sparseBlock.pos(n);
+                               int alen = input.sparseBlock.size(n);
+                               int[] aix = input.sparseBlock.indexes(n);
+                               double[] avals = input.sparseBlock.values(n);
+                               
+                               // Iterate over the sparse block
+                               for(int j=apos; j<apos+alen; j++) {
+                                       // Note: the input is of shape [N, CHW]
+                                       int chw = aix[j];
+                                       
+                                       // Get individual zero-based c,h,w 
indexes from zero-based 'chw'
+                                       int cInput = chw / HW;
+                                       int hInput = (chw - cInput*HW)/W;
+                                       int wInput = chw % W; 
+                                       
+                                       appendInputValueToIm2colOutput(output, 
cInput, hInput, wInput, avals[j], 
+                                                       R, S, P, Q, stride_h, 
stride_w, pad_h, pad_w);
                                }
+                               // Since the chw are appended in sorted order, 
no need to sort the output rows
+                               // if(meta.sortRows) output.sortSparseRows();
                        }
                        else {
-                               System.arraycopy(input.getDenseBlock(), 
n*input.getNumColumns(), temp, 0, input.getNumColumns());
+                               output.setNonZeros(0);
                        }
                }
        }
        
        /**
-        * Performing dense im2col (general case)
+        * Appends the value corresponding to the given [, cInput, hInput, 
wInput] to the appropriate im2col location of the output
+        * 
+        * @param output output matrix block
+        * @param cInput input channel index (zero-based)
+        * @param hInput input height index (zero-based)
+        * @param wInput input width index (zero-based)
+        * @param value input value
+        * @param R filter height
+        * @param S filter width
+        * @param P output height
+        * @param Q output width
+        * @param stride_h stride height
+        * @param stride_w stride width
+        * @param pad_h pad height
+        * @param pad_w pad width
+        */
+       private static void appendInputValueToIm2colOutput(MatrixBlock output, 
int cInput, int hInput, int wInput, double value, 
+                       int R, int S, int P, int Q, int stride_h, int stride_w, 
int pad_h, int pad_w) {
+               if(value == 0) 
+                       return;
+               int RS = R*S;
+               // For the given h,w index, insert avals[j] into respective 
r,s,p,q locations
+               
+               // Constraints: for(int r = 0; r < R; r++) { if(0 <= p && p < P 
&& (hInput - r + pad_h) % stride_h == 0) { ... } }
+               // Constraint 1: p >= 0 and p = (hInput - r + pad_h)  / stride_h
+               // Therefore,  r <= hInput + pad_h 
+               // Constraint 2: p < P and p = (hInput - r + pad_h)  / stride_h
+               // Therefore,  hInput + pad_h - P*stride_h < r
+               // Math.max(0, hInput + pad_h - P*stride_h + 1) <= r <= 
Math.min(R-1, hInput + pad_h)
+               int rMin = Math.max(0, hInput + pad_h - P*stride_h + 1);
+               int rMax = Math.min(R-1, hInput + pad_h);
+               int sMin = Math.max(0, wInput + pad_w - Q*stride_w + 1);
+               int sMax = Math.min(S-1, wInput + pad_w);
+               // Constraint 3: (hInput - r + pad_h) % stride_h == 0
+               while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) 
rMin++;
+               while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) 
sMin++;  
+               
+               for(int r = rMin; r <= rMax; r += stride_h) {
+                       // Only append value if h == hInput, where h = (r - 
pad_h) + p*stride_h and 0 <= p < P
+                       // Therefore, p = (hInput - r + pad_h)  / stride_h. Use 
the same logic for q.
+                       final int p = (hInput - r + pad_h)  / stride_h;
+                       final int pQ = p*Q;
+                       final int outRowIndex = cInput*RS + r*S;
+                       for(int s = sMin; s <= sMax; s += stride_w) {
+                               int q = (wInput - s + pad_w)  / stride_w;
+                               // chw -> [crs, pq]
+                               output.appendValue(outRowIndex + s, pQ + q, 
value);
+                               // Since the chw are appended in sorted order, 
no need to sort the output rows
+                               // if(meta.lastIndexPerRow[outRowIndex + s] > 
p*Q + q) meta.sortRows = true;
+                               // meta.lastIndexPerRow[outRowIndex + s] = p*Q 
+ q;
+                       }
+               }
+       }
+       
+       /**
+        * Performing sparse im2col for a given channel c and for a given row n 
of the input matrix.
         */
-       static class SparseIm2colWorker implements Im2colWorker {
-               MatrixBlock input; double [] outputArray; 
-               int CRS; int S; int R; int P; int Q; int H; int W; 
-               int stride_h; int stride_w; int pad_h; int pad_w; double [] 
temp;
-               public SparseIm2colWorker(MatrixBlock input, MatrixBlock 
im2ColOutBlock, ConvolutionParameters params) {
+       static class SparseSparseIm2colWorker implements Im2colWorker {
+               MatrixBlock input; MatrixBlock output;
+               int CRS; int S; int R; int P; int Q; int H; int W; int HW; int 
RS;
+               int stride_h; int stride_w; int pad_h; int pad_w; 
+               public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock 
im2ColOutBlock, ConvolutionParameters params) {
                        this.input = input;
-                       this.outputArray = im2ColOutBlock.getDenseBlock();
+                       this.output = im2ColOutBlock;
                        this.CRS = params.C * params.R * params.S;
+                       this.HW = params.H*params.W;
+                       this.RS = params.R*params.S;
                        this.H = params.H; this.W = params.W; this.R = 
params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
                        this.stride_h = params.stride_h; this.stride_w = 
params.stride_w;
                        this.pad_h = params.pad_h; this.pad_w = params.pad_w;
-                       temp = new double[input.getNumColumns()];
                }
                
                @Override
@@ -335,52 +409,36 @@ public class LibMatrixDNNIm2ColHelper {
 
                @Override
                public void execute(int n, int cInput) {
-                       // Using a temporary array improves performance by not 
requiring binary search for getValue
-                       // Since the access pattern depends on 
ConvolutionParameters, this serves as a temporary fix.
-                       fillTemp(input, n); int RS = R*S;
-                       for (int rs = 0; rs < RS; ++rs) {
-                               int wOffset = rs % S;
-                               int hOffset = rs / S;
-                               for (int h = 0; h < P; ++h) {
-                                       int outOffset = (rs * P + h) * Q;
-                                       int hPadded = h * stride_h - pad_h + 
hOffset;
-                                       int tempOffset = (cInput * H + hPadded) 
* W;
-                                       // int inputOffset = nOffset + 
tempOffset;
-                                       if (hPadded < 0 || hPadded >= H) {
-                                               Arrays.fill(outputArray, 
outOffset, outOffset+Q, 0);
-                                       } else {
-                                               for (int w = 0; w < Q; ++w) {
-                                                       int wPadded = w * 
stride_w - pad_w + wOffset;
-                                                       if (wPadded >= 0 && 
wPadded < W) 
-                                                               
outputArray[outOffset + w] = temp[tempOffset + wPadded];
-                                                       else
-                                                               
outputArray[outOffset + w] = 0;
-                                               }
+                       if( !input.sparseBlock.isEmpty(n) ) {
+                               output.sparseBlock.reset();
+                               output.setNonZeros(0);
+                               int apos = input.sparseBlock.pos(n);
+                               int alen = input.sparseBlock.size(n);
+                               int[] aix = input.sparseBlock.indexes(n);
+                               double[] avals = input.sparseBlock.values(n);
+                               
+                               // Iterate over the sparse block
+                               for(int j=apos; j<apos+alen; j++) {
+                                       // Note: the input is of shape [N, CHW]
+                                       int chw = aix[j];
+                                       
+                                       if(cInput == (chw / HW)) {
+                                               // Get individual zero-based 
c,h,w indexes from zero-based 'chw'
+                                               int hInput = (chw - 
cInput*HW)/W;
+                                               int wInput = chw % W; 
+                                               
+                                               
appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
+                                                               R, S, P, Q, 
stride_h, stride_w, pad_h, pad_w);
                                        }
                                }
-                       }
-               }
-               // Returns the row of matrix in dense format
-               private void fillTemp(MatrixBlock input, int n) {
-                       if(input.getNumColumns() != temp.length) {
-                               throw new RuntimeException("Invalid 
parameters");
-                       }
-                       // Use temporary array to avoid binary search
-                       if(input.isInSparseFormat()) {
-                               Arrays.fill(temp, 0);
-                               if( !input.sparseBlock.isEmpty(n) ) {
-                                       int apos = input.sparseBlock.pos(n);
-                                       int alen = input.sparseBlock.size(n);
-                                       int[] aix = 
input.sparseBlock.indexes(n);
-                                       double[] avals = 
input.sparseBlock.values(n);
-                                       for(int j=apos; j<apos+alen; j++)
-                                               temp[ aix[j] ] = avals[j];
-                               }
+                               // Since the chw are appended in sorted order, 
no need to sort the output rows
+                               // if(meta.sortRows) output.sortSparseRows();
                        }
                        else {
-                               System.arraycopy(input.getDenseBlock(), 
n*input.getNumColumns(), temp, 0, input.getNumColumns());
+                               output.setNonZeros(0);
                        }
                }
+               
        }
 
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/4744da16/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java 
b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index d40b7a1..56e0e92 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -49,7 +49,8 @@ public abstract class GPUTests extends AutomatedTestBase {
        protected final static String TEST_DIR = 
"org/apache/sysml/api/mlcontext";
        protected static SparkSession spark;
        protected final double THRESHOLD = 1e-9;    // for relative error
-
+       private static final boolean PRINT_MAT_ERROR = false;
+       
        @BeforeClass
        public static void beforeClass() {
                spark = createSystemMLSparkSession("GPUTests", "local");
@@ -139,7 +140,7 @@ public abstract class GPUTests extends AutomatedTestBase {
                genMLC.close();
                return in1;
        }
-
+       
        /**
         * Generates a random input matrix with a given size and sparsity
         *
@@ -152,6 +153,22 @@ public abstract class GPUTests extends AutomatedTestBase {
         * @return a random matrix with given size and sparsity
         */
        protected Matrix generateInputMatrix(SparkSession spark, int m, int n, 
double min, double max, double sparsity, int seed) {
+               return generateInputMatrix(spark, m, n, min, max, sparsity, 
seed, false);
+       }
+
+       /**
+        * Generates a random input matrix with a given size and sparsity
+        *
+        * @param spark    valid instance of {@link SparkSession}
+        * @param m        number of rows
+        * @param n        number of columns
+        * @param min      min for RNG
+        * @param max      max for RNG
+        * @param sparsity sparsity (1 = completely dense, 0 = completely 
sparse)
+        * @param performRounding performs rounding after generation of random 
matrix
+        * @return a random matrix with given size and sparsity
+        */
+       protected Matrix generateInputMatrix(SparkSession spark, int m, int n, 
double min, double max, double sparsity, int seed, boolean performRounding) {
                // Generate a random matrix of size m * n
                MLContext genMLC = new MLContext(spark);
                String scriptStr;
@@ -160,12 +177,47 @@ public abstract class GPUTests extends AutomatedTestBase {
                } else {
                        scriptStr = "in1 = rand(rows=" + m + ", cols=" + n + ", 
sparsity = " + sparsity + ", seed= " + seed
                                        + ", min=" + min + ", max=" + max + ")";
+                       if(performRounding)
+                               scriptStr += "; in1 = round(in1)";
                }
                Script generateScript = 
ScriptFactory.dmlFromString(scriptStr).out("in1");
                Matrix in1 = genMLC.execute(generateScript).getMatrix("in1");
                genMLC.close();
                return in1;
        }
+       
+       private void printMatrixIfNotEqual(MatrixBlock expectedMB, MatrixBlock 
actualMB) {
+               long rows = expectedMB.getNumRows();
+               long cols = expectedMB.getNumColumns();
+               boolean matrixNotEqual = false;
+               for (int i = 0; i < rows && !matrixNotEqual; i++) {
+                       for (int j = 0; j < cols; j++) {
+                               double expectedDouble = 
expectedMB.quickGetValue(i, j);
+                               double actualDouble = actualMB.quickGetValue(i, 
j);
+                               if (expectedDouble != 0.0 && 
!Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+                                       double relativeError = 
Math.abs((expectedDouble - actualDouble) / expectedDouble);
+                                       if(relativeError >= getTHRESHOLD()) {
+                                               matrixNotEqual = true;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               if(matrixNotEqual) {
+                       System.out.println("Expected mb != Actual mb. 
Mismatches are as follows:");
+                       for (int i = 0; i < rows; i++) {
+                               for (int j = 0; j < cols; j++) {
+                                       double expectedDouble = 
expectedMB.quickGetValue(i, j);
+                                       double actualDouble = 
actualMB.quickGetValue(i, j);
+                                       if (expectedDouble != 0.0 && 
!Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) 
+                                               System.out.print("(" + i + "," 
+ j  + " : " + expectedDouble + " != " + actualDouble + ") ");
+                               }
+                       }
+                       System.out.println();
+               }
+               else
+                       System.out.println("Expected mb = Actual mb");
+       }
 
        /**
         * Asserts that the values in two matrices are in {@link 
UnaryOpTests#THRESHOLD} of each other
@@ -183,6 +235,8 @@ public abstract class GPUTests extends AutomatedTestBase {
                        Assert.assertEquals(rows, actualMB.getNumRows());
                        Assert.assertEquals(cols, actualMB.getNumColumns());
 
+                       if(PRINT_MAT_ERROR) printMatrixIfNotEqual(expectedMB, 
actualMB);
+                       
                        for (int i = 0; i < rows; i++) {
                                for (int j = 0; j < cols; j++) {
                                        double expectedDouble = 
expectedMB.quickGetValue(i, j);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4744da16/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java 
b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
index c53e803..aba0cae 100644
--- a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
@@ -24,9 +24,6 @@ import java.util.HashMap;
 import java.util.List;
 
 import org.apache.sysml.api.mlcontext.Matrix;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.test.utils.TestUtils;
 import org.junit.Assert;
@@ -44,28 +41,32 @@ import org.junit.Test;
  * <code>
  * mvn -Dit.test=org.apache.sysml.test.gpu.NeuralNetworkOpTests verify 
-PgpuTests
  * </code>
+ * 
+ * Note: generateInputMatrix(...) method in this test performs rounding of 
input matrix. This helps
+ * to test the correctness of our operators at logical level, but not the 
precision.
+ * 
  */
 public class NeuralNetworkOpTests extends GPUTests {
 
        private final static String TEST_NAME = "NeuralNetworkOpTests";
        // The MAX_OP_SIZE is to take into consideration the memory available 
on the GPU as well as
        // limits set by cudnn (operands need to be less than 2GB)
-       private static final double MAX_OP_SIZE;
-
-       static {
-               double MAX = 0.5 * 1024 * 1024 * 1024; // 0.5 GB (this HAS to 
be less than 2GB)
-               try {
-                       // Cap the maximum allowed operand size to 1/3rd of the 
usable GPU memory or MAX, whichever is lesser
-                       List<GPUContext> gCtxs = 
GPUContextPool.reserveAllGPUContexts();
-                       long availableMemory = 
gCtxs.get(0).getAvailableMemory();
-                       double averageMemoryPerOperand = availableMemory / 3.0;
-                       MAX_OP_SIZE = Math.min(averageMemoryPerOperand, MAX);
-                       GPUContextPool.freeAllGPUContexts();
-               } catch (DMLRuntimeException e) {
-                       throw new RuntimeException(e);
-               }
-
-       }
+       private static final double MAX_OP_SIZE = 0.5 * 1024 * 1024 * 1024; // 
0.5 GB (this HAS to be less than 2GB)
+
+//     static {
+//             double MAX = 0.5 * 1024 * 1024 * 1024; // 0.5 GB (this HAS to 
be less than 2GB)
+//             try {
+//                     // Cap the maximum allowed operand size to 1/3rd of the 
usable GPU memory or MAX, whichever is lesser
+//                     List<GPUContext> gCtxs = 
GPUContextPool.reserveAllGPUContexts();
+//                     long availableMemory = 
gCtxs.get(0).getAvailableMemory();
+//                     double averageMemoryPerOperand = availableMemory / 3.0;
+//                     MAX_OP_SIZE = Math.min(averageMemoryPerOperand, MAX);
+//                     GPUContextPool.freeAllGPUContexts();
+//             } catch (DMLRuntimeException e) {
+//                     throw new RuntimeException(e);
+//             }
+//
+//     }
 
        private final int seed = 42;
 
@@ -74,28 +75,20 @@ public class NeuralNetworkOpTests extends GPUTests {
        private final List<Integer> Nlst = Arrays.asList(128, 64, 32);
     private final List<Integer> Clst = Arrays.asList(30, 20, 3);
     private final List<Integer> Hlst = Arrays.asList(400, 128, 32);
-    private final List<Integer> Wlst = Arrays.asList(400, 128, 32);
     private final List<Integer> Klst = Arrays.asList(30, 20, 10);
     private final List<Integer> Rlst = Arrays.asList(128, 63, 4);
-    private final List<Integer> Slst = Arrays.asList(128, 63, 4);
-    private final List<Integer> strideHeightLst = Arrays.asList(9, 3);
-    private final List<Integer> strideWidthLst = Arrays.asList(9, 3);
-    private final List<Integer> padHeightLst = Arrays.asList(3, 1);
-    private final List<Integer> padWidthLst = Arrays.asList(3, 1);
-    private final List<Double> sparsitylst = Arrays.asList(1.0);    // Only 
test for dense
-    */
-       private final List<Integer> Nlst = Arrays.asList(128, 64);
-       private final List<Integer> Clst = Arrays.asList(30, 3);
-       private final List<Integer> Hlst = Arrays.asList(256, 64);
-       private final List<Integer> Wlst = Arrays.asList(256, 64);
-       private final List<Integer> Klst = Arrays.asList(30, 20);
-       private final List<Integer> Rlst = Arrays.asList(128, 3);
-       private final List<Integer> Slst = Arrays.asList(128, 3);
-       private final List<Integer> strideHeightLst = Arrays.asList(9, 1);
-       private final List<Integer> strideWidthLst = Arrays.asList(9, 1);
-       private final List<Integer> padHeightLst = Arrays.asList(3, 1);
-       private final List<Integer> padWidthLst = Arrays.asList(3, 1);
-       private final List<Double> sparsitylst = Arrays.asList(1.0);    // Only 
test for dense
+    private final List<Integer> strideLst = Arrays.asList(9, 3);
+    private final List<Integer> padLst = Arrays.asList(3, 1);
+    private final List<Double> sparsitylst = Arrays.asList(1.0, 0.1, 0.3);
+        */
+       private final List<Integer> Nlst = Arrays.asList(32);
+       private final List<Integer> Clst = Arrays.asList(3);
+       private final List<Integer> Hlst = Arrays.asList(64);
+       private final List<Integer> Klst = Arrays.asList(3);
+       private final List<Integer> Rlst = Arrays.asList(3,5);
+       private final List<Integer> strideLst = Arrays.asList(1, 2);
+       private final List<Integer> padLst = Arrays.asList(0,1);
+       private final List<Double> sparsitylst = Arrays.asList(1.0, 0.1, 0.3);
 
        @Override
        public void setUp() {
@@ -109,7 +102,6 @@ public class NeuralNetworkOpTests extends GPUTests {
                return 1e-5;
        }
 
-       @Ignore
        @Test
        public void testConv2d() {
                String scriptStr = "O = conv2d(image, filter, padding=[padH, 
padW], stride=[strideH, strideW], input_shape=[N,C,H,W], 
filter_shape=[K,C,R,S])";
@@ -117,75 +109,74 @@ public class NeuralNetworkOpTests extends GPUTests {
                for (long N : Nlst) {
                        for (long C : Clst) {
                                for (long H : Hlst) {
-                                       for (long W : Wlst) {
-                                               for (long K : Klst) {
-                                                       for (long R : Rlst) {
-                                                               for (long S : 
Slst) {
-                                                                       for 
(long strideH : strideHeightLst) {
-                                                                               
for (long strideW : strideWidthLst) {
-                                                                               
        for (long padH : padHeightLst) {
-                                                                               
                for (long padW : padWidthLst) {
-                                                                               
                        for (double sparsity : sparsitylst) {
-
-                                                                               
                                // Make sure ops fit in GPU memory and within 
constraints of cudnn
-                                                                               
                                long imageSize = N * C * H * W * 8l;
-                                                                               
                                if (imageSize > MAX_OP_SIZE)  // image size
-                                                                               
                                        continue;
-                                                                               
                                long filterSize = K * C * R * S * 8l;
-                                                                               
                                if (filterSize > MAX_OP_SIZE)  // filter size
-                                                                               
                                        continue;
-                                                                               
                                // filter is smaller than image + padding
-                                                                               
                                if (R > (H + padH) || S > (W + padW))
-                                                                               
                                        continue;
-
-                                                                               
                                int P = (int) ConvolutionUtils.getP(H, R, 
strideH, padH);
-                                                                               
                                int Q = (int) ConvolutionUtils.getQ(W, S, 
strideW, padW);
-
-                                                                               
                                long doutSize = N * K * P * Q * 8l;
-                                                                               
                                if (doutSize > MAX_OP_SIZE) // dout/output size
-                                                                               
                                        continue;
-
-                                                                               
                                double imageSizeInMB = imageSize / (1024.0 * 
1024.0);
-                                                                               
                                double filterSizeInMB = filterSize / (1024.0 * 
1024.0);
-                                                                               
                                double doutSizeInMB = doutSize / (1024.0 * 
1024.0);
-                                                                               
                                System.out
-                                                                               
                                                .format("conv2d, 
image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), 
dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
-                                                                               
                                                                N, C, H, W, 
imageSizeInMB, N, C, R, S,
-                                                                               
                                                                filterSizeInMB, 
N, K, P, Q, doutSizeInMB,
-                                                                               
                                                                strideH, 
strideW, padH, padW);
-                                                                               
                                Matrix image = generateInputMatrix(spark, (int) 
N,
-                                                                               
                                                (int) (C * H * W), -127, 127, 
sparsity, seed);
-                                                                               
                                Matrix filter = generateInputMatrix(spark, 
(int) K,
-                                                                               
                                                (int) (C * R * S), -127, 127, 
sparsity, seed);
-                                                                               
                                HashMap<String, Object> inputs = new 
HashMap<>();
-                                                                               
                                inputs.put("N", N);
-                                                                               
                                inputs.put("C", C);
-                                                                               
                                inputs.put("H", H);
-                                                                               
                                inputs.put("W", W);
-                                                                               
                                inputs.put("K", K);
-                                                                               
                                inputs.put("R", R);
-                                                                               
                                inputs.put("S", S);
-                                                                               
                                inputs.put("strideH", strideH);
-                                                                               
                                inputs.put("strideW", strideW);
-                                                                               
                                inputs.put("padH", padH);
-                                                                               
                                inputs.put("padW", padW);
-                                                                               
                                inputs.put("image", image);
-                                                                               
                                inputs.put("filter", filter);
-                                                                               
                                List<Object> outCPU = runOnCPU(spark, 
scriptStr, inputs,
-                                                                               
                                                Arrays.asList("O"));
-                                                                               
                                List<Object> outGPU = runOnGPU(spark, 
scriptStr, inputs,
-                                                                               
                                                Arrays.asList("O"));
-                                                                               
                                assertHeavyHitterPresent("gpu_conv2d");
-                                                                               
                                assertEqualObjects(outCPU.get(0), 
outGPU.get(0));
-                                                                               
                                clearGPUMemory();
-                                                                               
                        }
-                                                                               
                }
-                                                                               
        }
-                                                                               
}
+                                       long W = H;
+                                       for (long K : Klst) {
+                                               for (long R : Rlst) {
+                                                       long S = R;
+                                                       for (long strideH : 
strideLst) {
+                                                               long strideW = 
strideH;
+                                                               for (long padH 
: padLst) {
+                                                                       long 
padW = padH;
+                                                                       for 
(double sparsity : sparsitylst) {
+
+                                                                               
// Make sure ops fit in GPU memory and within constraints of cudnn
+                                                                               
long imageSize = N * C * H * W * 8l;
+                                                                               
if (imageSize > MAX_OP_SIZE)  // image size
+                                                                               
        continue;
+                                                                               
long filterSize = K * C * R * S * 8l;
+                                                                               
if (filterSize > MAX_OP_SIZE)  // filter size
+                                                                               
        continue;
+                                                                               
// filter is smaller than image + padding
+                                                                               
if (R > (H + padH) || S > (W + padW))
+                                                                               
        continue;
+
+                                                                               
int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+                                                                               
int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+                                                                               
long doutSize = N * K * P * Q * 8l;
+                                                                               
if (doutSize > MAX_OP_SIZE) // dout/output size
+                                                                               
        continue;
+
+                                                                               
double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+                                                                               
double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+                                                                               
double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+                                                                               
System.out
+                                                                               
.format("conv2d, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), 
dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+                                                                               
                N, C, H, W, imageSizeInMB, N, C, R, S,
+                                                                               
                filterSizeInMB, N, K, P, Q, doutSizeInMB,
+                                                                               
                strideH, strideW, padH, padW);
+                                                                               
Matrix image = generateInputMatrix(spark, (int) N,
+                                                                               
                (int) (C * H * W), -127, 127, sparsity, seed, true);
+                                                                               
Matrix filter = generateInputMatrix(spark, (int) K,
+                                                                               
                (int) (C * R * S), -127, 127, sparsity, seed, true);
+                                                                               
HashMap<String, Object> inputs = new HashMap<>();
+                                                                               
inputs.put("N", N);
+                                                                               
inputs.put("C", C);
+                                                                               
inputs.put("H", H);
+                                                                               
inputs.put("W", W);
+                                                                               
inputs.put("K", K);
+                                                                               
inputs.put("R", R);
+                                                                               
inputs.put("S", S);
+                                                                               
inputs.put("strideH", strideH);
+                                                                               
inputs.put("strideW", strideW);
+                                                                               
inputs.put("padH", padH);
+                                                                               
inputs.put("padW", padW);
+                                                                               
inputs.put("image", image);
+                                                                               
inputs.put("filter", filter);
+                                                                               
List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+                                                                               
                Arrays.asList("O"));
+                                                                               
List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+                                                                               
                Arrays.asList("O"));
+                                                                               
assertHeavyHitterPresent("gpu_conv2d");
+                                                                               
assertEqualObjects(outCPU.get(0), outGPU.get(0));
+                                                                               
clearGPUMemory();
                                                                        }
                                                                }
                                                        }
                                                }
+
+
+
                                        }
                                }
                        }
@@ -237,11 +228,11 @@ public class NeuralNetworkOpTests extends GPUTests {
                double filterSizeInMB = filterSize / (1024.0 * 1024.0);
                double doutSizeInMB = doutSize / (1024.0 * 1024.0);
                System.out
-                               .format("conv2d, image[%d,%d,%d,%d](%.1fMB), 
filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], 
padding[%d,%d]",
-                                               N, C, H, W, imageSizeInMB, N, 
C, R, S, filterSizeInMB, N, K, P, Q, doutSizeInMB, strideH,
-                                               strideW, padH, padW);
-               Matrix image = generateInputMatrix(spark, (int) N, (int) (C * H 
* W), -1, 1, sparsity, seed);
-               Matrix filter = generateInputMatrix(spark, (int) K, (int) (C * 
R * S), -1, 1.0, sparsity, seed);
+               .format("conv2d, image[%d,%d,%d,%d](%.1fMB), 
filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], 
padding[%d,%d]",
+                               N, C, H, W, imageSizeInMB, N, C, R, S, 
filterSizeInMB, N, K, P, Q, doutSizeInMB, strideH,
+                               strideW, padH, padW);
+               Matrix image = generateInputMatrix(spark, (int) N, (int) (C * H 
* W), -1, 1, sparsity, seed, true);
+               Matrix filter = generateInputMatrix(spark, (int) K, (int) (C * 
R * S), -1, 1.0, sparsity, seed, true);
                HashMap<String, Object> inputs = new HashMap<>();
                inputs.put("N", N);
                inputs.put("C", C);
@@ -263,7 +254,6 @@ public class NeuralNetworkOpTests extends GPUTests {
                clearGPUMemory();
        }
 
-       @Ignore
        @Test
        public void testConv2dBackwardFilter() {
                String scriptStr = "O = conv2d_backward_filter(image, dout, 
padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], 
filter_shape=[K,C,R,S])";
@@ -271,83 +261,82 @@ public class NeuralNetworkOpTests extends GPUTests {
                for (long N : Nlst) {
                        for (long C : Clst) {
                                for (long H : Hlst) {
-                                       for (long W : Wlst) {
-                                               for (long K : Klst) {
-                                                       for (long R : Rlst) {
-                                                               for (long S : 
Slst) {
-                                                                       for 
(long strideH : strideHeightLst) {
-                                                                               
for (long strideW : strideWidthLst) {
-                                                                               
        for (long padH : padHeightLst) {
-                                                                               
                for (long padW : padWidthLst) {
-                                                                               
                        for (double sparsity : sparsitylst) {
-
-                                                                               
                                // filter is smaller than image + padding
-                                                                               
                                if (R > (H + padH) || S > (W + padW))
-                                                                               
                                        continue;
-
-                                                                               
                                // Make sure ops fit in GPU memory and within 
constraints of cudnn
-                                                                               
                                long imageSize = N * C * H * W * 8l;
-                                                                               
                                if (imageSize > MAX_OP_SIZE)  // image size
-                                                                               
                                        continue;
-                                                                               
                                long filterSize = K * C * R * S * 8l;
-                                                                               
                                if (filterSize > MAX_OP_SIZE)  // filter size
-                                                                               
                                        continue;
-
-                                                                               
                                int P = (int) ConvolutionUtils.getP(H, R, 
strideH, padH);
-                                                                               
                                int Q = (int) ConvolutionUtils.getQ(W, S, 
strideW, padW);
-
-                                                                               
                                long doutSize = N * K * P * Q * 8l;
-                                                                               
                                if (doutSize > MAX_OP_SIZE) // dout/output size
-                                                                               
                                        continue;
-
-                                                                               
                                double imageSizeInMB = imageSize / (1024.0 * 
1024.0);
-                                                                               
                                double filterSizeInMB = filterSize / (1024.0 * 
1024.0);
-                                                                               
                                double doutSizeInMB = doutSize / (1024.0 * 
1024.0);
-                                                                               
                                System.out
-                                                                               
                                                
.format("conv2d_backward_filter, image[%d,%d,%d,%d](%.1fMB), 
filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], 
padding[%d,%d]",
-                                                                               
                                                                N, C, H, W, 
imageSizeInMB, N, C, R, S,
-                                                                               
                                                                filterSizeInMB, 
N, K, P, Q, doutSizeInMB,
-                                                                               
                                                                strideH, 
strideW, padH, padW);
-                                                                               
                                Matrix image = generateInputMatrix(spark, (int) 
N,
-                                                                               
                                                (int) (C * H * W), -127.0, 127, 
sparsity, seed);
-                                                                               
                                Matrix dout = generateInputMatrix(spark, (int) 
N,
-                                                                               
                                                (int) (K * P * Q), -127.0, 127, 
sparsity, seed);
-                                                                               
                                HashMap<String, Object> inputs = new 
HashMap<>();
-                                                                               
                                inputs.put("N", N);
-                                                                               
                                inputs.put("C", C);
-                                                                               
                                inputs.put("H", H);
-                                                                               
                                inputs.put("W", W);
-                                                                               
                                inputs.put("K", K);
-                                                                               
                                inputs.put("R", R);
-                                                                               
                                inputs.put("S", S);
-                                                                               
                                inputs.put("strideH", strideH);
-                                                                               
                                inputs.put("strideW", strideW);
-                                                                               
                                inputs.put("padH", padH);
-                                                                               
                                inputs.put("padW", padW);
-                                                                               
                                inputs.put("image", image);
-                                                                               
                                inputs.put("dout", dout);
-                                                                               
                                List<Object> outCPU = runOnCPU(spark, 
scriptStr, inputs,
-                                                                               
                                                Arrays.asList("O"));
-                                                                               
                                List<Object> outGPU = runOnGPU(spark, 
scriptStr, inputs,
-                                                                               
                                                Arrays.asList("O"));
-                                                                               
                                
assertHeavyHitterPresent("gpu_conv2d_backward_filter");
-                                                                               
                                assertEqualObjects(outCPU.get(0), 
outGPU.get(0));
-                                                                               
                                clearGPUMemory();
-                                                                               
                        }
-                                                                               
                }
-                                                                               
        }
-                                                                               
}
+                                       long W = H;
+                                       for (long K : Klst) {
+                                               for (long R : Rlst) {
+                                                       long S = R;
+                                                       for (long strideH : 
strideLst) {
+                                                               long strideW = 
strideH;
+                                                               for (long padH 
: padLst) {
+                                                                       long 
padW = padH;
+                                                                       for 
(double sparsity : sparsitylst) {
+
+                                                                               
// filter is smaller than image + padding
+                                                                               
if (R > (H + padH) || S > (W + padW))
+                                                                               
        continue;
+
+                                                                               
// Make sure ops fit in GPU memory and within constraints of cudnn
+                                                                               
long imageSize = N * C * H * W * 8l;
+                                                                               
if (imageSize > MAX_OP_SIZE)  // image size
+                                                                               
        continue;
+                                                                               
long filterSize = K * C * R * S * 8l;
+                                                                               
if (filterSize > MAX_OP_SIZE)  // filter size
+                                                                               
        continue;
+
+                                                                               
int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+                                                                               
int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+                                                                               
long doutSize = N * K * P * Q * 8l;
+                                                                               
if (doutSize > MAX_OP_SIZE) // dout/output size
+                                                                               
        continue;
+
+                                                                               
double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+                                                                               
double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+                                                                               
double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+                                                                               
System.out
+                                                                               
.format("conv2d_backward_filter, image[%d,%d,%d,%d](%.1fMB), 
filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], 
padding[%d,%d]",
+                                                                               
                N, C, H, W, imageSizeInMB, N, C, R, S,
+                                                                               
                filterSizeInMB, N, K, P, Q, doutSizeInMB,
+                                                                               
                strideH, strideW, padH, padW);
+                                                                               
Matrix image = generateInputMatrix(spark, (int) N,
+                                                                               
                (int) (C * H * W), -127.0, 127, sparsity, seed, true);
+                                                                               
Matrix dout = generateInputMatrix(spark, (int) N,
+                                                                               
                (int) (K * P * Q), -127.0, 127, sparsity, seed, true);
+                                                                               
HashMap<String, Object> inputs = new HashMap<>();
+                                                                               
inputs.put("N", N);
+                                                                               
inputs.put("C", C);
+                                                                               
inputs.put("H", H);
+                                                                               
inputs.put("W", W);
+                                                                               
inputs.put("K", K);
+                                                                               
inputs.put("R", R);
+                                                                               
inputs.put("S", S);
+                                                                               
inputs.put("strideH", strideH);
+                                                                               
inputs.put("strideW", strideW);
+                                                                               
inputs.put("padH", padH);
+                                                                               
inputs.put("padW", padW);
+                                                                               
inputs.put("image", image);
+                                                                               
inputs.put("dout", dout);
+                                                                               
List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+                                                                               
                Arrays.asList("O"));
+                                                                               
List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+                                                                               
                Arrays.asList("O"));
+                                                                               
assertHeavyHitterPresent("gpu_conv2d_backward_filter");
+                                                                               
assertEqualObjects(outCPU.get(0), outGPU.get(0));
+                                                                               
clearGPUMemory();
                                                                        }
                                                                }
                                                        }
                                                }
+
+
+
+
                                        }
                                }
                        }
                }
        }
 
-       @Ignore
        @Test
        public void testConv2dBackwardData() {
                String scriptStr = "O = conv2d_backward_data(filter, dout, 
padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], 
filter_shape=[K,C,R,S])";
@@ -355,237 +344,237 @@ public class NeuralNetworkOpTests extends GPUTests {
                for (long N : Nlst) {
                        for (long C : Clst) {
                                for (long H : Hlst) {
-                                       for (long W : Wlst) {
-                                               for (long K : Klst) {
-                                                       for (long R : Rlst) {
-                                                               for (long S : 
Slst) {
-                                                                       for 
(long strideH : strideHeightLst) {
-                                                                               
for (long strideW : strideWidthLst) {
-                                                                               
        for (long padH : padHeightLst) {
-                                                                               
                for (long padW : padWidthLst) {
-                                                                               
                        for (double sparsity : sparsitylst) {
-
-                                                                               
                                // filter is smaller than image + padding
-                                                                               
                                if (R > (H + padH) || S > (W + padW))
-                                                                               
                                        continue;
-
-                                                                               
                                // Make sure ops fit in GPU memory and within 
constraints of cudnn
-                                                                               
                                long imageSize = N * C * H * W * 8l;
-                                                                               
                                if (imageSize > MAX_OP_SIZE)  // image size
-                                                                               
                                        continue;
-                                                                               
                                long filterSize = K * C * R * S * 8l;
-                                                                               
                                if (filterSize > MAX_OP_SIZE)  // filter size
-                                                                               
                                        continue;
-
-                                                                               
                                int P = (int) ConvolutionUtils.getP(H, R, 
strideH, padH);
-                                                                               
                                int Q = (int) ConvolutionUtils.getQ(W, S, 
strideW, padW);
-
-                                                                               
                                long doutSize = N * K * P * Q * 8l;
-                                                                               
                                if (doutSize > MAX_OP_SIZE) // dout/output size
-                                                                               
                                        continue;
-
-                                                                               
                                double imageSizeInMB = imageSize / (1024.0 * 
1024.0);
-                                                                               
                                double filterSizeInMB = filterSize / (1024.0 * 
1024.0);
-                                                                               
                                double doutSizeInMB = doutSize / (1024.0 * 
1024.0);
-                                                                               
                                System.out
-                                                                               
                                                .format("conv2d_backward_data, 
image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), 
dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
-                                                                               
                                                                N, C, H, W, 
imageSizeInMB, N, C, R, S,
-                                                                               
                                                                filterSizeInMB, 
N, K, P, Q, doutSizeInMB,
-                                                                               
                                                                strideH, 
strideW, padH, padW);
-
-                                                                               
                                Matrix filter = generateInputMatrix(spark, 
(int) K,
-                                                                               
                                                (int) (C * R * S), -127.0, 127, 
sparsity, seed);
-                                                                               
                                Matrix dout = generateInputMatrix(spark, (int) 
N,
-                                                                               
                                                (int) (K * P * Q), -127.0, 127, 
sparsity, seed);
-                                                                               
                                HashMap<String, Object> inputs = new 
HashMap<>();
-                                                                               
                                inputs.put("N", N);
-                                                                               
                                inputs.put("C", C);
-                                                                               
                                inputs.put("H", H);
-                                                                               
                                inputs.put("W", W);
-                                                                               
                                inputs.put("K", K);
-                                                                               
                                inputs.put("R", R);
-                                                                               
                                inputs.put("S", S);
-                                                                               
                                inputs.put("strideH", strideH);
-                                                                               
                                inputs.put("strideW", strideW);
-                                                                               
                                inputs.put("padH", padH);
-                                                                               
                                inputs.put("padW", padW);
-                                                                               
                                inputs.put("filter", filter);
-                                                                               
                                inputs.put("dout", dout);
-                                                                               
                                List<Object> outCPU = runOnCPU(spark, 
scriptStr, inputs,
-                                                                               
                                                Arrays.asList("O"));
-                                                                               
                                List<Object> outGPU = runOnGPU(spark, 
scriptStr, inputs,
-                                                                               
                                                Arrays.asList("O"));
-                                                                               
                                
assertHeavyHitterPresent("gpu_conv2d_backward_data");
-                                                                               
                                assertEqualObjects(outCPU.get(0), 
outGPU.get(0));
-                                                                               
                                clearGPUMemory();
-                                                                               
                        }
-                                                                               
                }
-                                                                               
        }
-                                                                               
}
+                                       long W = H;
+                                       for (long K : Klst) {
+                                               for (long R : Rlst) {
+                                                       long S = R;
+                                                       for (long strideH : 
strideLst) {
+                                                               long strideW = 
strideH;
+                                                               for (long padH 
: padLst) {
+                                                                       long 
padW = padH;
+                                                                       for 
(double sparsity : sparsitylst) {
+
+                                                                               
// filter is smaller than image + padding
+                                                                               
if (R > (H + padH) || S > (W + padW))
+                                                                               
        continue;
+
+                                                                               
// Make sure ops fit in GPU memory and within constraints of cudnn
+                                                                               
long imageSize = N * C * H * W * 8l;
+                                                                               
if (imageSize > MAX_OP_SIZE)  // image size
+                                                                               
        continue;
+                                                                               
long filterSize = K * C * R * S * 8l;
+                                                                               
if (filterSize > MAX_OP_SIZE)  // filter size
+                                                                               
        continue;
+
+                                                                               
int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+                                                                               
int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+                                                                               
long doutSize = N * K * P * Q * 8l;
+                                                                               
if (doutSize > MAX_OP_SIZE) // dout/output size
+                                                                               
        continue;
+
+                                                                               
double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+                                                                               
double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+                                                                               
double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+                                                                               
System.out
+                                                                               
.format("conv2d_backward_data, image[%d,%d,%d,%d](%.1fMB), 
filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], 
padding[%d,%d]",
+                                                                               
                N, C, H, W, imageSizeInMB, N, C, R, S,
+                                                                               
                filterSizeInMB, N, K, P, Q, doutSizeInMB,
+                                                                               
                strideH, strideW, padH, padW);
+
+                                                                               
Matrix filter = generateInputMatrix(spark, (int) K,
+                                                                               
                (int) (C * R * S), -127.0, 127, sparsity, seed, true);
+                                                                               
Matrix dout = generateInputMatrix(spark, (int) N,
+                                                                               
                (int) (K * P * Q), -127.0, 127, sparsity, seed, true);
+                                                                               
HashMap<String, Object> inputs = new HashMap<>();
+                                                                               
inputs.put("N", N);
+                                                                               
inputs.put("C", C);
+                                                                               
inputs.put("H", H);
+                                                                               
inputs.put("W", W);
+                                                                               
inputs.put("K", K);
+                                                                               
inputs.put("R", R);
+                                                                               
inputs.put("S", S);
+                                                                               
inputs.put("strideH", strideH);
+                                                                               
inputs.put("strideW", strideW);
+                                                                               
inputs.put("padH", padH);
+                                                                               
inputs.put("padW", padW);
+                                                                               
inputs.put("filter", filter);
+                                                                               
inputs.put("dout", dout);
+                                                                               
List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+                                                                               
                Arrays.asList("O"));
+                                                                               
List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+                                                                               
                Arrays.asList("O"));
+                                                                               
assertHeavyHitterPresent("gpu_conv2d_backward_data");
+                                                                               
assertEqualObjects(outCPU.get(0), outGPU.get(0));
+                                                                               
clearGPUMemory();
                                                                        }
                                                                }
                                                        }
                                                }
                                        }
+
+
+
+
                                }
                        }
                }
        }
 
-       @Ignore
        @Test
+       @Ignore
        public void testMaxPool() {
                String scriptStr = "O = max_pool(image, padding=[padH, padW], 
stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
 
                for (long N : Nlst) {
                        for (long C : Clst) {
                                for (long H : Hlst) {
-                                       for (long W : Wlst) {
-                                               for (long R : Rlst) {
-                                                       for (long S : Slst) {
-                                                               for (long 
strideH : strideHeightLst) {
-                                                                       for 
(long strideW : strideWidthLst) {
-                                                                               
for (long padH : padHeightLst) {
-                                                                               
        for (long padW : padWidthLst) {
-                                                                               
                for (double sparsity : sparsitylst) {
-
-                                                                               
                        // pool is smaller than image + padding
-                                                                               
                        if (R > (H + padH) || S > (W + padW))
-                                                                               
                                continue;
-
-                                                                               
                        // Make sure ops fit in GPU memory and within 
constraints of cudnn
-                                                                               
                        long imageSize = N * C * H * W * 8l;
-                                                                               
                        if (imageSize > MAX_OP_SIZE)  // image size
-                                                                               
                                continue;
-                                                                               
                        long poolSize = R * S * 8l;
-                                                                               
                        if (poolSize > MAX_OP_SIZE)  // filter size
-                                                                               
                                continue;
-
-                                                                               
                        int P = (int) ConvolutionUtils.getP(H, R, strideH, 
padH);
-                                                                               
                        int Q = (int) ConvolutionUtils.getQ(W, S, strideW, 
padW);
-
-                                                                               
                        long doutSize = N * C * P * Q * 8l;
-                                                                               
                        if (doutSize > MAX_OP_SIZE) // dout/output size
-                                                                               
                                continue;
-
-                                                                               
                        double imageSizeInMB = imageSize / (1024.0 * 1024.0);
-                                                                               
                        double poolSizeInMB = poolSize / (1024.0 * 1024.0);
-                                                                               
                        double doutSizeInMB = doutSize / (1024.0 * 1024.0);
-                                                                               
                        System.out
-                                                                               
                                        .format("max_pool, 
image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), 
stride[%d,%d], padding[%d,%d]",
-                                                                               
                                                        N, C, H, W, 
imageSizeInMB, R, S, poolSizeInMB, N, C,
-                                                                               
                                                        P, Q, doutSizeInMB, 
strideH, strideW, padH, padW);
-
-                                                                               
                        Matrix image = generateInputMatrix(spark, (int) N,
-                                                                               
                                        (int) (C * H * W), -127.0, 127, 
sparsity, seed);
-                                                                               
                        HashMap<String, Object> inputs = new HashMap<>();
-                                                                               
                        inputs.put("N", N);
-                                                                               
                        inputs.put("C", C);
-                                                                               
                        inputs.put("H", H);
-                                                                               
                        inputs.put("W", W);
-                                                                               
                        inputs.put("R", R);
-                                                                               
                        inputs.put("S", S);
-                                                                               
                        inputs.put("strideH", strideH);
-                                                                               
                        inputs.put("strideW", strideW);
-                                                                               
                        inputs.put("padH", padH);
-                                                                               
                        inputs.put("padW", padW);
-                                                                               
                        inputs.put("image", image);
-                                                                               
                        List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
-                                                                               
                                        Arrays.asList("O"));
-                                                                               
                        List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
-                                                                               
                                        Arrays.asList("O"));
-                                                                               
                        assertHeavyHitterPresent("gpu_maxpooling");
-                                                                               
                        assertEqualObjects(outCPU.get(0), outGPU.get(0));
-                                                                               
                        clearGPUMemory();
-                                                                               
                }
-                                                                               
        }
-                                                                               
}
-                                                                       }
+                                       long W = H;
+                                       for (long R : Rlst) {
+                                               long S = R;
+                                               for (long strideH : strideLst) {
+                                                       long strideW = strideH;
+                                                       for (long padH : 
padLst) {
+                                                               long padW = 
padH;
+                                                               for (double 
sparsity : sparsitylst) {
+
+                                                                       // pool 
is smaller than image + padding
+                                                                       if (R > 
(H + padH) || S > (W + padW))
+                                                                               
continue;
+
+                                                                       // Make 
sure ops fit in GPU memory and within constraints of cudnn
+                                                                       long 
imageSize = N * C * H * W * 8l;
+                                                                       if 
(imageSize > MAX_OP_SIZE)  // image size
+                                                                               
continue;
+                                                                       long 
poolSize = R * S * 8l;
+                                                                       if 
(poolSize > MAX_OP_SIZE)  // filter size
+                                                                               
continue;
+
+                                                                       int P = 
(int) ConvolutionUtils.getP(H, R, strideH, padH);
+                                                                       int Q = 
(int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+                                                                       long 
doutSize = N * C * P * Q * 8l;
+                                                                       if 
(doutSize > MAX_OP_SIZE) // dout/output size
+                                                                               
continue;
+
+                                                                       double 
imageSizeInMB = imageSize / (1024.0 * 1024.0);
+                                                                       double 
poolSizeInMB = poolSize / (1024.0 * 1024.0);
+                                                                       double 
doutSizeInMB = doutSize / (1024.0 * 1024.0);
+                                                                       
System.out
+                                                                       
.format("max_pool, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), 
dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+                                                                               
        N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+                                                                               
        P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+                                                                       Matrix 
image = generateInputMatrix(spark, (int) N,
+                                                                               
        (int) (C * H * W), -127.0, 127, sparsity, seed, true);
+                                                                       
HashMap<String, Object> inputs = new HashMap<>();
+                                                                       
inputs.put("N", N);
+                                                                       
inputs.put("C", C);
+                                                                       
inputs.put("H", H);
+                                                                       
inputs.put("W", W);
+                                                                       
inputs.put("R", R);
+                                                                       
inputs.put("S", S);
+                                                                       
inputs.put("strideH", strideH);
+                                                                       
inputs.put("strideW", strideW);
+                                                                       
inputs.put("padH", padH);
+                                                                       
inputs.put("padW", padW);
+                                                                       
inputs.put("image", image);
+                                                                       
List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+                                                                               
        Arrays.asList("O"));
+                                                                       
List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+                                                                               
        Arrays.asList("O"));
+                                                                       
assertHeavyHitterPresent("gpu_maxpooling");
+                                                                       
assertEqualObjects(outCPU.get(0), outGPU.get(0));
+                                                                       
clearGPUMemory();
                                                                }
                                                        }
                                                }
                                        }
+
+
+
+
                                }
                        }
                }
        }
 
-       @Ignore
        @Test
+       @Ignore
        public void testMaxPoolBackward() {
                String scriptStr = "O = max_pool_backward(image, dout, 
padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], 
pool_size=[R,S])";
 
                for (long N : Nlst) {
                        for (long C : Clst) {
                                for (long H : Hlst) {
-                                       for (long W : Wlst) {
-                                               for (long R : Rlst) {
-                                                       for (long S : Slst) {
-                                                               for (long 
strideH : strideHeightLst) {
-                                                                       for 
(long strideW : strideWidthLst) {
-                                                                               
for (long padH : padHeightLst) {
-                                                                               
        for (long padW : padWidthLst) {
-                                                                               
                for (double sparsity : sparsitylst) {
-
-                                                                               
                        // pool is smaller than image + padding
-                                                                               
                        if (R > (H + padH) || S > (W + padW))
-                                                                               
                                continue;
-
-                                                                               
                        // Make sure ops fit in GPU memory and within 
constraints of cudnn
-                                                                               
                        long imageSize = N * C * H * W * 8l;
-                                                                               
                        if (imageSize > MAX_OP_SIZE)  // image size
-                                                                               
                                continue;
-                                                                               
                        long poolSize = R * S * 8l;
-                                                                               
                        if (poolSize > MAX_OP_SIZE)  // filter size
-                                                                               
                                continue;
-
-                                                                               
                        int P = (int) ConvolutionUtils.getP(H, R, strideH, 
padH);
-                                                                               
                        int Q = (int) ConvolutionUtils.getQ(W, S, strideW, 
padW);
-
-                                                                               
                        long doutSize = N * C * P * Q * 8l;
-                                                                               
                        if (doutSize > MAX_OP_SIZE) // dout/output size
-                                                                               
                                continue;
-
-                                                                               
                        double imageSizeInMB = imageSize / (1024.0 * 1024.0);
-                                                                               
                        double poolSizeInMB = poolSize / (1024.0 * 1024.0);
-                                                                               
                        double doutSizeInMB = doutSize / (1024.0 * 1024.0);
-                                                                               
                        System.out
-                                                                               
                                        .format("max_pool_backward, 
image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), 
stride[%d,%d], padding[%d,%d]",
-                                                                               
                                                        N, C, H, W, 
imageSizeInMB, R, S, poolSizeInMB, N, C,
-                                                                               
                                                        P, Q, doutSizeInMB, 
strideH, strideW, padH, padW);
-
-                                                                               
                        Matrix image = generateInputMatrix(spark, (int) N,
-                                                                               
                                        (int) (C * H * W), -127.0, 127, 
sparsity, seed);
-                                                                               
                        Matrix dout = generateInputMatrix(spark, (int) N, (int) 
(C * P * Q),
-                                                                               
                                        -127.0, 127, sparsity, seed);
-                                                                               
                        HashMap<String, Object> inputs = new HashMap<>();
-                                                                               
                        inputs.put("N", N);
-                                                                               
                        inputs.put("C", C);
-                                                                               
                        inputs.put("H", H);
-                                                                               
                        inputs.put("W", W);
-                                                                               
                        inputs.put("R", R);
-                                                                               
                        inputs.put("S", S);
-                                                                               
                        inputs.put("strideH", strideH);
-                                                                               
                        inputs.put("strideW", strideW);
-                                                                               
                        inputs.put("padH", padH);
-                                                                               
                        inputs.put("padW", padW);
-                                                                               
                        inputs.put("image", image);
-                                                                               
                        inputs.put("dout", dout);
-                                                                               
                        List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
-                                                                               
                                        Arrays.asList("O"));
-                                                                               
                        List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
-                                                                               
                                        Arrays.asList("O"));
-                                                                               
                        assertHeavyHitterPresent("gpu_maxpooling_backward");
-                                                                               
                        assertEqualObjects(outCPU.get(0), outGPU.get(0));
-                                                                               
                        clearGPUMemory();
-                                                                               
                }
-                                                                               
        }
-                                                                               
}
-                                                                       }
+                                       long W = H;
+                                       for (long R : Rlst) {
+                                               long S = R;
+                                               for (long strideH : strideLst) {
+                                                       long strideW = strideH;
+                                                       for (long padH : 
padLst) {
+                                                               long padW = 
padH;
+                                                               for (double 
sparsity : sparsitylst) {
+
+                                                                       // pool 
is smaller than image + padding
+                                                                       if (R > 
(H + padH) || S > (W + padW))
+                                                                               
continue;
+
+                                                                       // Make 
sure ops fit in GPU memory and within constraints of cudnn
+                                                                       long 
imageSize = N * C * H * W * 8l;
+                                                                       if 
(imageSize > MAX_OP_SIZE)  // image size
+                                                                               
continue;
+                                                                       long 
poolSize = R * S * 8l;
+                                                                       if 
(poolSize > MAX_OP_SIZE)  // filter size
+                                                                               
continue;
+
+                                                                       int P = 
(int) ConvolutionUtils.getP(H, R, strideH, padH);
+                                                                       int Q = 
(int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+                                                                       long 
doutSize = N * C * P * Q * 8l;
+                                                                       if 
(doutSize > MAX_OP_SIZE) // dout/output size
+                                                                               
continue;
+
+                                                                       double 
imageSizeInMB = imageSize / (1024.0 * 1024.0);
+                                                                       double 
poolSizeInMB = poolSize / (1024.0 * 1024.0);
+                                                                       double 
doutSizeInMB = doutSize / (1024.0 * 1024.0);
+                                                                       
System.out
+                                                                       
.format("max_pool_backward, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), 
dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+                                                                               
        N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+                                                                               
        P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+                                                                       Matrix 
image = generateInputMatrix(spark, (int) N,
+                                                                               
        (int) (C * H * W), -127.0, 127, sparsity, seed, true);
+                                                                       Matrix 
dout = generateInputMatrix(spark, (int) N, (int) (C * P * Q),
+                                                                               
        -127.0, 127, sparsity, seed, true);
+                                                                       
HashMap<String, Object> inputs = new HashMap<>();
+                                                                       
inputs.put("N", N);
+                                                                       
inputs.put("C", C);
+                                                                       
inputs.put("H", H);
+                                                                       
inputs.put("W", W);
+                                                                       
inputs.put("R", R);
+                                                                       
inputs.put("S", S);
+                                                                       
inputs.put("strideH", strideH);
+                                                                       
inputs.put("strideW", strideW);
+                                                                       
inputs.put("padH", padH);
+                                                                       
inputs.put("padW", padW);
+                                                                       
inputs.put("image", image);
+                                                                       
inputs.put("dout", dout);
+                                                                       
List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+                                                                               
        Arrays.asList("O"));
+                                                                       
List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+                                                                               
        Arrays.asList("O"));
+                                                                       
assertHeavyHitterPresent("gpu_maxpooling_backward");
+                                                                       
assertEqualObjects(outCPU.get(0), outGPU.get(0));
+                                                                       
clearGPUMemory();
                                                                }
                                                        }
                                                }
                                        }
+
+
+
+
                                }
                        }
                }

systemml git commit: [SYSTEMML-540] Support sparse output for CP im2col operator

Reply via email to