Repository: incubator-systemml
Updated Branches:
  refs/heads/master 23ccab85c -> 8c069ed8e


[SYSTEMML-769] Improved performance of im2col and some bug fixes

1. Improved the performance of im2col.
2. Added flags to disable SystemML's caching and to disable sparse
operations. By default, both caching and sparse operations are enabled.
However, these flags allow an experienced user to disable them (for
example, for deep learning workloads).
3. Bug fix for single-threaded execution for conv2d_backward_filter.
4. Cleaned up the code for reusing non-zeroed arrays and for the
invocation of ConvTasks.
5. Updated developer documentation for LibMatrixDNN.
6. Added sparse test cases for conv2d. Will add similar test cases for
other LibMatrixDNN operations soon.

Closes #306.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/8c069ed8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/8c069ed8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/8c069ed8

Branch: refs/heads/master
Commit: 8c069ed8e54c6941b25ccf8ede41389875e25d0b
Parents: 23ccab8
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Mon Dec 5 14:34:42 2016 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Mon Dec 5 14:34:42 2016 -0800

----------------------------------------------------------------------
 .../java/org/apache/sysml/api/DMLScript.java    |  26 +-
 .../org/apache/sysml/hops/ConvolutionOp.java    |   2 +-
 .../controlprogram/caching/MatrixObject.java    |  11 -
 .../cp/ConvolutionCPInstruction.java            |  45 +-
 .../matrix/data/ConvolutionParameters.java      | 110 ++++
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 660 ++++++++-----------
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  19 +-
 .../java/org/apache/sysml/utils/Statistics.java |  21 +-
 .../functions/tensor/Conv2DTest.java            | 158 +++--
 9 files changed, 527 insertions(+), 525 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index 7da66b5..3a09005 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -109,6 +109,9 @@ public class DMLScript
        public static boolean USE_ACCELERATOR = false;
        public static boolean FORCE_ACCELERATOR = false;
        
+       public static boolean DISABLE_SPARSE = false;
+       public static boolean DISABLE_CACHING = false;
+       
        // flag that indicates whether or not to suppress any prints to stdout
        public static boolean _suppressPrint2Stdout = false;
        
@@ -133,6 +136,8 @@ public class DMLScript
                        //+ "   -debug: <flags> (optional) run in debug mode\n"
                        //+ "                   Optional <flags> that is 
supported for this mode is optimize=(on|off)\n"
                        + "   -exec: <mode> (optional) execution mode (hadoop, 
singlenode, [hybrid], hybrid_spark)\n"
+                       + "   -disable-sparse: disable sparse operations\n"
+                       + "   -disable-caching: disable SystemML's multi-level 
cache\n"
                        + "   -explain: <type> (optional) explain plan (hops, 
[runtime], recompile_hops, recompile_runtime)\n"
                        + "   -stats: <count> (optional) monitor and report 
caching/recompilation statistics, default heavy hitter count is 10\n"
                        + "   -clean: (optional) cleanup all SystemML working 
directories (FS, DFS).\n"
@@ -267,7 +272,9 @@ public class DMLScript
                
                //parse arguments and set execution properties
                RUNTIME_PLATFORM oldrtplatform = rtplatform; //keep old 
rtplatform
-               ExplainType oldexplain = EXPLAIN; //keep old explain    
+               ExplainType oldexplain = EXPLAIN; //keep old explain
+               boolean oldDisableSparse = DISABLE_SPARSE;
+               boolean oldDisableCaching = DISABLE_CACHING;
                
                // Reset global flags to avoid errors in test suite
                ENABLE_DEBUG_MODE = false;
@@ -286,6 +293,12 @@ public class DMLScript
                                        if( args.length > (i+1) && 
!args[i+1].startsWith("-") )
                                                EXPLAIN = 
Explain.parseExplainType(args[++i]);
                                }
+                               else if( 
args[i].equalsIgnoreCase("-disable-caching") ) { 
+                                       DISABLE_CACHING = true;
+                               }
+                               else if( 
args[i].equalsIgnoreCase("-disable-sparse") ) { 
+                                       DISABLE_SPARSE = true;
+                               }
                                else if( args[i].equalsIgnoreCase("-stats") ) {
                                        STATISTICS = true;
                                        if (args.length > (i + 1) && !args[i + 
1].startsWith("-"))
@@ -376,6 +389,8 @@ public class DMLScript
                        //reset runtime platform and visualize flag
                        rtplatform = oldrtplatform;
                        EXPLAIN = oldexplain;
+                       DISABLE_SPARSE = oldDisableSparse;
+                       DISABLE_CACHING = oldDisableCaching;
                }
                
                return true;
@@ -670,6 +685,11 @@ public class DMLScript
                {  
                        initHadoopExecution( dmlconf );
                        
+                       if(DISABLE_CACHING) {
+                               //disable caching globally 
+                               CacheableData.disableCaching();
+                       }
+                       
                        //run execute (w/ exception handling to ensure proper 
shutdown)
                        ec = ExecutionContextFactory.createContext(rtprog);
                        rtprog.execute( ec );  
@@ -935,6 +955,4 @@ public class DMLScript
                        throw new DMLException("Failed to run SystemML 
workspace cleanup.", ex);
                }
        }
-
-       public static final boolean REUSE_NONZEROED_OUTPUT = false;
-}  
+}  
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 8446b11..d9ff962 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -32,7 +32,7 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.ConvolutionParameters;
+import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 
 public class ConvolutionOp extends Hop  implements MultiThreadedHop
 {      

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
index 40cae0c..b73adb0 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java
@@ -40,7 +40,6 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.MetaData;
 import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
 import org.apache.sysml.runtime.util.DataConverter;
@@ -189,16 +188,6 @@ public class MatrixObject extends 
CacheableData<MatrixBlock>
                return ((double)mc.getNonZeros())/mc.getRows()/mc.getCols();
        }
        
-       @Override
-       protected void clearReusableData() {
-               if(DMLScript.REUSE_NONZEROED_OUTPUT) {
-                       if(_data == null)
-                               getCache();
-                       if( _data != null && !_data.isVector() )
-                               
LibMatrixDNN.cacheReuseableData(_data.getDenseBlock());
-               }
-       }
-       
        // *********************************************
        // ***                                       ***
        // ***       HIGH-LEVEL PUBLIC METHODS       ***

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 5997760..e0238aa 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -21,19 +21,17 @@ package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
 
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
-import org.apache.sysml.utils.Statistics;
 
 public class ConvolutionCPInstruction extends UnaryCPInstruction {
        
@@ -42,7 +40,6 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
        private ArrayList<CPOperand> _filter_shape;
        private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
        private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-       private boolean _reuseNonZeroedOutput = false;
        private int _numThreads = -1;
        
        public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode, String istr, int numThreads) throws DMLRuntimeException {
@@ -194,7 +191,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                }
                else {
                        // As we always fill the output first with bias
-                       outputBlock = getDenseOutputBlock(ec, 
input.getNumRows(), input.getNumColumns(), true);
+                       outputBlock = getDenseOutputBlock(ec, 
input.getNumRows(), input.getNumColumns());
                        LibMatrixDNN.bias_add(input, bias, outputBlock, 
_numThreads);
                }
                
@@ -239,10 +236,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                outputBlock = new MatrixBlock(N, C*P*Q, true, 
0);
                        }
                        else {
-                               // Is eligible for REUSE_NONZEROED_OUTPUT but 
cannot guarantee that previous output has been rmvar-ed
-                               // without somewhat expensive HashMap checks
-                               outputBlock = getDenseOutputBlock(ec, N, C*P*Q, 
false);
-                               
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                               outputBlock = getDenseOutputBlock(ec, N, C*P*Q);
                                LibMatrixDNN.maxpooling(matBlock, outputBlock, 
params);
                        }
                }
@@ -252,10 +246,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                outputBlock = new MatrixBlock(N, C*H*W, true, 
0);
                        }
                        else {
-                               // Is eligible for REUSE_NONZEROED_OUTPUT but 
cannot guarantee that previous output has been rmvar-ed
-                               // without somewhat expensive HashMap checks
-                               outputBlock = getDenseOutputBlock(ec, N, C*H*W, 
false);
-                               
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                               outputBlock = getDenseOutputBlock(ec, N, C*H*W);
                                LibMatrixDNN.maxpooling_backward(matBlock, 
dout, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
@@ -266,8 +257,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                outputBlock = new MatrixBlock(N, K*P*Q, true, 
0);
                        }
                        else {
-                               outputBlock = getDenseOutputBlock(ec, N, K*P*Q, 
false);
-                               
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                               outputBlock = getDenseOutputBlock(ec, N, K*P*Q);
                                LibMatrixDNN.conv2d(matBlock, filter, 
outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
@@ -278,8 +268,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                outputBlock = new MatrixBlock(K, C*R*S, true, 
0);
                        }
                        else {
-                               outputBlock = getDenseOutputBlock(ec, K, C*R*S, 
false);
-                               
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                               outputBlock = getDenseOutputBlock(ec, K, C*R*S);
                                LibMatrixDNN.conv2d_backward_filter(matBlock, 
dout, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
@@ -290,8 +279,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                                outputBlock = new MatrixBlock(N, C * H * W, 
true, 0);
                        }
                        else {
-                               outputBlock = getDenseOutputBlock(ec, N, C * H 
* W, false);
-                               
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+                               outputBlock = getDenseOutputBlock(ec, N, C * H 
* W);
                                LibMatrixDNN.conv2d_backward_data(matBlock, 
dout, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
@@ -305,25 +293,10 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                ec.setMatrixOutput(getOutputVariableName(), outputBlock);
        }
        
-       @SuppressWarnings("unused")
-       private MatrixBlock getDenseOutputBlock(ExecutionContext ec, int 
numRows, int numCols, boolean reuseNonZeroedOutput1) throws DMLRuntimeException 
{
-               long start = -1;
-               if(DMLScript.STATISTICS)
-                       start = System.nanoTime();
-               
+       private MatrixBlock getDenseOutputBlock(ExecutionContext ec, int 
numRows, int numCols) throws DMLRuntimeException {
                MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, 
false, numRows * numCols);
-               _reuseNonZeroedOutput = false;
-               if(reuseNonZeroedOutput1 && DMLScript.REUSE_NONZEROED_OUTPUT) {
-                       _reuseNonZeroedOutput = true;
-                       outputBlock.allocateDenseBlock(true, 
!_reuseNonZeroedOutput);  
-               }
-               else  {
-                       outputBlock.allocateDenseBlock();
-               }
+               outputBlock.allocateDenseBlock();
                outputBlock.setNonZeros(-1);
-
-               if(DMLScript.STATISTICS)
-                       
Statistics.incrementAllocationTime(System.nanoTime()-start, false);
                return outputBlock;
        }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
new file mode 100644
index 0000000..27fcf87
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+
+/**
+ * This class is container that stores parameters required for executing 
following operations:
+ * conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, 
maxpooling_backward 
+ */
+public class ConvolutionParameters {
+       public int N; public int C; public int H; public int W;
+       public int K; public int R; public int S; public int stride_h; public 
int stride_w; public int pad_h; public int pad_w;
+       public int P; public int Q; public int numThreads;
+       
+       public AtomicLong outputNNZ = new AtomicLong(-1);
+       
+       MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
+       
+       private int convertToInt(long val) throws DMLRuntimeException {
+               if( val > Integer.MAX_VALUE ) {
+                       throw new DMLRuntimeException("The value for 
ConvolutionParameters is too large:" + val);
+               }
+               return (int) val;
+       }
+       
+       public boolean compare(ConvolutionParameters that) {
+               if(this.N == that.N && this.C == that.C && this.H == that.H && 
this.W == that.W
+                               && this.K == that.K && this.R == that.R && 
this.S == that.S && this.stride_h == that.stride_h
+                                && this.stride_w == that.stride_w  && 
this.pad_h == that.pad_h
+                                 && this.pad_w == that.pad_w   && 
this.numThreads == that.numThreads) {
+                       return true;
+               }
+               return false;
+       }
+       
+       public String toString() {
+               return "(" + N + " " + C + " " + H + " " + W + " " + K + " " + 
R + " " + S + ")";  
+       }
+       
+       public ConvolutionParameters(long N, long C, long H, long W,
+                       long K, long R, long S, long stride_h, long stride_w, 
long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
+               this.N = convertToInt(N);
+               this.C = convertToInt(C);
+               this.H = convertToInt(H);
+               this.W = convertToInt(W);
+               this.K = convertToInt(K);
+               this.R = convertToInt(R);
+               this.S = convertToInt(S);
+               this.stride_h = convertToInt(stride_h);
+               this.stride_w = convertToInt(stride_w);
+               this.pad_h = convertToInt(pad_h);
+               this.pad_w = convertToInt(pad_w);
+               if(H >= 0 && pad_h >= 0 && R >= 0 && stride_h >= 0)
+                       P = (int) ((H + 2 * pad_h - R) / stride_h + 1);
+               else
+                       P = -1;
+               // P = convertToInt(ConvolutionUtils.getP(H, R, stride_h, 
pad_h));
+               
+               if(W >= 0 && pad_w >= 0 && S >= 0 && stride_w >= 0)
+                       Q = (int) ((W + 2 * pad_w - S) / stride_w + 1);
+               else
+                       Q = -1;
+               // Q = convertToInt(ConvolutionUtils.getQ(W, S, stride_w, 
pad_w));
+               
+               this.numThreads = numThreads;
+       }
+       
+       public ConvolutionParameters(int N, int C, int H, int W,
+               int K, int R, int S, int stride_h, int stride_w, int pad_h, int 
pad_w, int numThreads) {
+               this.N = N;
+               this.C = C;
+               this.H = H;
+               this.W = W;
+               this.K = K;
+               this.R = R;
+               this.S = S;
+               this.stride_h = stride_h;
+               this.stride_w = stride_w;
+               this.pad_h = pad_h;
+               this.pad_w = pad_w;
+               P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
+               Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
+               this.numThreads = numThreads;
+       }
+       
+       public boolean isOutputThreadSafe() {
+               return output.isThreadSafe();
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 5763521..54af11c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -18,13 +18,12 @@
  */
 package org.apache.sysml.runtime.matrix.data;
 
-import java.lang.ref.SoftReference;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -36,57 +35,32 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
 
+/**
+ * This class allows users to invoke deep learning related operations 
+ * (such as conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, 
maxpooling_backward, bias_add)
+ * using multiple threads.
+ * 
+ * The methods accept the input matrices as MatrixBlock and the parameters 
using ConvolutionParameters.
+ * 
+ * To run in single thread, please set ConvolutionParameters.numThreads to 1.
+ */
 public class LibMatrixDNN {
        
        protected static final Log LOG =  
LogFactory.getLog(LibMatrixDNN.class.getName());
-       
-       public static final boolean ALLOW_MULTI_THREADED_OPS = true;
-       // Using hashmap to avoid any performance impacts of multimap
-       private static final ConcurrentHashMap<Integer, 
SoftReference<double[]>> non_zeroed_double_arr = new ConcurrentHashMap<Integer, 
SoftReference<double[]>>();
-       private static final int NON_ZEROED_DOUBLE_ARR_THRESHOLD = 100;
-       public static void cacheReuseableData(double[] arr) {
-               if(arr != null && arr.length >= 
NON_ZEROED_DOUBLE_ARR_THRESHOLD) {
-                       // Put the last recently removed arrays into the 
NON_ZEROED_DOUBLE_ARR as 
-                       // it has lower probability of being garbage collected
-                       // new Integer(arr.length) can be avoided here as 
autoboxing will do the trick
-                       non_zeroed_double_arr.put(arr.length, new 
SoftReference<double[]>(arr));
-               }
-       }
-       private static boolean warnedSingleThread = false;
-       private static void warnSingleThreaded() {
-               if(!warnedSingleThread) {
-                       throw new RuntimeException("WARN: Single thread 
execution in LibMatrixDNN");
-                       // LOG.warn("WARN: Single thread execution in 
LibMatrixDNN");
-                       // warnedSingleThread = true;
-               }
-       }
-       public static double[] getReuseableData(long length) {
-               if(length >= NON_ZEROED_DOUBLE_ARR_THRESHOLD) {
-                       // Explicit "new Integer" required here for 
HashMap.remove
-                       SoftReference<double[]> arr = 
non_zeroed_double_arr.remove(new Integer((int) length));
-                       if(arr != null) {
-                               return arr.get();
-                       }
-               }
-               return null;
-       }
+       // 
------------------------------------------------------------------------------------------------
+       // Useful flags for performance testing:
+       private static boolean DISPLAY_STATISTICS = false;
+       private static final boolean ALLOW_MULTI_THREADED_OPS = true;
+       // 
------------------------------------------------------------------------------------------------
        
        enum TaskType {
                MaxPooling_Forward, MaxPooling_Backward, 
+               // Alternate approaches that we tried but the performance was 
unsatisfactory be included: direct, non-looped im2col
                LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, 
LoopedIm2ColConv2dBwdData
        }
        
-       public static class TemporaryConvolutionData {
-               public int [] minIndexArrR;
-               public int [] minIndexArrS;
-               public int [] maxIndexArrR;
-               public int [] maxIndexArrS;
-               int minCommonIndexS;
-               int maxCommonIndexS;
-       }
-       
+       // 
------------------------------------------------------------------------------------------------
        private static AtomicLong conv2dSparseCount = new AtomicLong(0);
        private static AtomicLong conv2dDenseCount = new AtomicLong(0);
        private static AtomicLong conv2dBwdFilterSparseCount = new 
AtomicLong(0);
@@ -105,7 +79,7 @@ public class LibMatrixDNN {
        private static AtomicLong loopedConvBwdDataCol2ImTime = new 
AtomicLong(0);
        
        public static void appendStatistics(StringBuilder sb) {
-               if(DMLScript.STATISTICS && (conv2dDenseCount.get() != 0 || 
conv2dSparseCount.get() != 0)) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS && 
(conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
                        sb.append("LibMatrixDNN dense count 
(conv/bwdF/bwdD/im2col/maxBwd):\t" 
                                        + conv2dDenseCount.get() + "/"
                                        + conv2dBwdFilterDenseCount.get() + "/"
@@ -149,94 +123,9 @@ public class LibMatrixDNN {
                loopedConvBwdDataMatMultTime.set(0);
                loopedConvBwdDataCol2ImTime.set(0);
        }
+       // 
------------------------------------------------------------------------------------------------
+       
        
-       public static class ConvolutionParameters {
-               public int N; public int C; public int H; public int W;
-               public int K; public int R; public int S; public int stride_h; 
public int stride_w; public int pad_h; public int pad_w;
-               public int P; public int Q; public int numThreads;
-               
-               public AtomicLong outputNNZ = new AtomicLong(-1);
-               
-               MatrixBlock input1; MatrixBlock input2; MatrixBlock output;
-               boolean reuseNonZeroedOutput = false;
-               
-               public TemporaryConvolutionData tmpData;
-               
-               private int convertToInt(long val) throws DMLRuntimeException {
-                       if( val > Integer.MAX_VALUE ) {
-                               throw new DMLRuntimeException("The value for 
ConvolutionParameters is too large:" + val);
-                       }
-                       return (int) val;
-               }
-               
-               public boolean compare(ConvolutionParameters that) {
-                       if(this.N == that.N && this.C == that.C && this.H == 
that.H && this.W == that.W
-                                       && this.K == that.K && this.R == that.R 
&& this.S == that.S && this.stride_h == that.stride_h
-                                        && this.stride_w == that.stride_w  && 
this.pad_h == that.pad_h
-                                         && this.pad_w == that.pad_w   && 
this.numThreads == that.numThreads) {
-                               return true;
-                       }
-                       return false;
-               }
-               
-               public String toString() {
-                       return "(" + N + " " + C + " " + H + " " + W + " " + K 
+ " " + R + " " + S + ")";  
-               }
-               
-               public ConvolutionParameters(long N, long C, long H, long W,
-                               long K, long R, long S, long stride_h, long 
stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
-                       this.N = convertToInt(N);
-                       this.C = convertToInt(C);
-                       this.H = convertToInt(H);
-                       this.W = convertToInt(W);
-                       this.K = convertToInt(K);
-                       this.R = convertToInt(R);
-                       this.S = convertToInt(S);
-                       this.stride_h = convertToInt(stride_h);
-                       this.stride_w = convertToInt(stride_w);
-                       this.pad_h = convertToInt(pad_h);
-                       this.pad_w = convertToInt(pad_w);
-                       if(H >= 0 && pad_h >= 0 && R >= 0 && stride_h >= 0)
-                               P = (int) ((H + 2 * pad_h - R) / stride_h + 1);
-                       else
-                               P = -1;
-                       // P = convertToInt(ConvolutionUtils.getP(H, R, 
stride_h, pad_h));
-                       
-                       if(W >= 0 && pad_w >= 0 && S >= 0 && stride_w >= 0)
-                               Q = (int) ((W + 2 * pad_w - S) / stride_w + 1);
-                       else
-                               Q = -1;
-                       // Q = convertToInt(ConvolutionUtils.getQ(W, S, 
stride_w, pad_w));
-                       
-                       this.numThreads = numThreads;
-               }
-               
-               public ConvolutionParameters(int N, int C, int H, int W,
-                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int numThreads) {
-                       this.N = N;
-                       this.C = C;
-                       this.H = H;
-                       this.W = W;
-                       this.K = K;
-                       this.R = R;
-                       this.S = S;
-                       this.stride_h = stride_h;
-                       this.stride_w = stride_w;
-                       this.pad_h = pad_h;
-                       this.pad_w = pad_w;
-                       P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
-                       Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
-                       this.numThreads = numThreads;
-               }
-               
-               public void setReuseNonZeroedOutput(boolean 
reuseNonZeroedOutput) {
-                       this.reuseNonZeroedOutput = reuseNonZeroedOutput;
-               }
-
-               public boolean isOutputThreadSafe() {
-                       return output.isThreadSafe();
-               }
-       }
        
        public static void conv2d_backward_data(MatrixBlock filter, MatrixBlock 
dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
                params.input1 = filter;
@@ -250,12 +139,7 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Only positive strides 
supported");
                }
                
-               // Convert filter (which is relatively small matrix) to dense
-               if(params.input1.isInSparseFormat()) {
-                       params.input1.sparseToDense();
-               }
-               
-               if(DMLScript.STATISTICS) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        if(filter.isInSparseFormat() || 
dout.isInSparseFormat()) {
                                conv2dBwdDataSparseCount.addAndGet(1);
                        }
@@ -264,20 +148,7 @@ public class LibMatrixDNN {
                        }
                }
                
-               params.reuseNonZeroedOutput = true;
-               
-               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-                       warnSingleThreaded();
-                       MatrixBlock dout_reshaped = new 
MatrixBlock(params.P*params.Q, params.K, false);
-                       dout_reshaped.allocateDenseBlock(true);
-                       for (int n = 0; n < params.N; n++) {
-                               doLoopedIm2ColConv2dBwdData(n, dout_reshaped, 
params);
-                       }
-               }
-               else {
-                       runConvTask(constrainedNumThreads, 1, 
TaskType.LoopedIm2ColConv2dBwdData, params);
-               }
+               runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
        }
        
        public static void conv2d_backward_filter(MatrixBlock input, 
MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
@@ -292,7 +163,7 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Only positive strides 
supported");
                }
                
-               if(DMLScript.STATISTICS) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        if(input.isInSparseFormat() || dout.isInSparseFormat()) 
{
                                conv2dBwdFilterSparseCount.addAndGet(1);
                        }
@@ -301,26 +172,15 @@ public class LibMatrixDNN {
                        }
                }
                
-               params.reuseNonZeroedOutput = true;
-               
-               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-                       warnSingleThreaded();
-                       MatrixBlock im2ColOutBlock = new 
MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-                       im2ColOutBlock.allocateDenseBlock(true);
-                       MatrixBlock dout_reshaped = new 
MatrixBlock(params.P*params.Q, params.K, false);
-                       dout_reshaped.allocateDenseBlock(true);
-                       for (int n = 0; n < params.N; n++) {
-                               params.output = 
doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, dout_reshaped, params.output, 
params);
-                       }
-               }
-               else {
-                       runConvTask(constrainedNumThreads, 1, 
TaskType.LoopedIm2ColConv2dBwdFilter, params);
-               }
-               
+               runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
        }
        
-       // ret += elem 
+       /**
+        * Performs the operation: ret += elem
+        * @param ret
+        * @param elem
+        * @throws DMLRuntimeException
+        */
        private static void elementWiseInPlaceAddition(MatrixBlock ret, 
MatrixBlock elem) throws DMLRuntimeException {
                if(ret.getNumRows() != elem.getNumRows() || ret.getNumColumns() 
!= elem.getNumColumns()) {
                        throw new DMLRuntimeException("Incorrect dimensions");
@@ -346,7 +206,12 @@ public class LibMatrixDNN {
                }
        }
        
-       // ret += t(elem) 
+       /**
+        * Performs the operation: ret += t(elem)
+        * @param ret
+        * @param elem
+        * @throws DMLRuntimeException
+        */
        private static void elementWiseInPlaceTransposedAddition(MatrixBlock 
ret, MatrixBlock elem) throws DMLRuntimeException {
                if(ret.getNumRows() != elem.getNumColumns() || 
ret.getNumColumns() != elem.getNumRows()) {
                        throw new DMLRuntimeException("Incorrect dimensions");
@@ -383,12 +248,12 @@ public class LibMatrixDNN {
                dout_reshaped.recomputeNonZeros();
                
                MatrixBlock temp = new MatrixBlock(params.P*params.Q, 
params.C*params.R*params.S, false);
-               long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
                LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
-               long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
+               long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ;
                doCol2imOverSingleImage(n, temp, params);
-               long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
-               if(DMLScript.STATISTICS) {
+               long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ;
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        loopedConvBwdDataMatMultTime.addAndGet(t2-t1);
                        loopedConvBwdDataCol2ImTime.addAndGet(t3-t2);
                }
@@ -396,22 +261,19 @@ public class LibMatrixDNN {
        
        private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n, 
                        MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, 
MatrixBlock partialRetBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
-               long nnz = 0;
-               long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-               for (int c = 0; c < params.C; c++) {
-                       nnz += doIm2colOverInputPath_NCHW(n, c, im2ColOutBlock, 
params);
-               }
-               long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
-               im2ColOutBlock.setNonZeros(nnz);
+               long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+               doIm2col(n, im2ColOutBlock, params);
+               im2ColOutBlock.recomputeNonZeros();
+               long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ;
                
                doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, 
params, true);
                dout_reshaped.recomputeNonZeros();
                
                MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, 
params.K, false);
-               long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
+               long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ;
                LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, 
false);
-               long t4 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
-               if(DMLScript.STATISTICS) {
+               long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ;
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
                        loopedConvBwdFilterIm2ColTime.addAndGet(t2-t1);
                }
@@ -436,7 +298,7 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Incorrect input to 
conv2d");
                }
                
-               if(DMLScript.STATISTICS) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        if(input.isInSparseFormat() || 
filter.isInSparseFormat()) {
                                conv2dSparseCount.addAndGet(1);
                        }
@@ -445,46 +307,39 @@ public class LibMatrixDNN {
                        }
                }
                
-               params.reuseNonZeroedOutput = true;
-               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-                       warnSingleThreaded();
-                       MatrixBlock im2ColOutBlock = new 
MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-                       im2ColOutBlock.allocateDenseBlock(true);
-                       for (int n = 0; n < params.N; n++) {
-                               doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
-                       }
+               if(!input.isInSparseFormat() && TEST_SPARSE_INPUT) {
+                       input.denseToSparse();
                }
-               else {
-                       runConvTask(constrainedNumThreads, 1, 
TaskType.LoopedIm2ColConv2d, params);
+               if(!filter.isInSparseFormat() && TEST_SPARSE_FILTER) {
+                       filter.denseToSparse();
                }
+               
+               runConvTask(TaskType.LoopedIm2ColConv2d, params);
        }
        
        private static void doLoopedIm2ColConv2d(int n, MatrixBlock 
im2ColOutBlock, ConvolutionParameters params) throws DMLRuntimeException {
-               long nnz = 0;
-               long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-               for (int c = 0; c < params.C; c++) {
-                       nnz += doIm2colOverInputPath_NCHW(n, c, im2ColOutBlock, 
params);
-               }
-               long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
+               doIm2col(n, im2ColOutBlock, params);
+               im2ColOutBlock.recomputeNonZeros();
+               long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
                
-               im2ColOutBlock.setNonZeros(nnz);
                MatrixBlock matMultOutBlock = new MatrixBlock(params.K, 
params.P*params.Q, false);
                LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, 
matMultOutBlock, false);
-               long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+               long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0;
                
-               if(DMLScript.STATISTICS) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        loopedConvIm2ColTime.addAndGet(t2 - t1);
                        loopedConvMatMultTime.addAndGet(t3 - t2);
                }
                
+               // 
-----------------------------------------------------------------------------
+               // Copying is required as LibMatrixMult.matrixMult (and/or 
Java) is not pointer aware.
+               // This is not required in Native implementation
                int destPos = n*params.K*params.P*params.Q;
                int length = params.K*params.P*params.Q;
-               if(params.reuseNonZeroedOutput && 
matMultOutBlock.isEmptyBlock()) {
-                       Arrays.fill(params.output.denseBlock, destPos, destPos 
+ length, 0);
-               }
-               else if(!matMultOutBlock.isEmptyBlock()) {
+               if(!matMultOutBlock.isEmptyBlock()) {
                        if(matMultOutBlock.isInSparseFormat()) {
+                               // NOTE: Potential bottleneck to copy sparse 
matmult back to dense output
                                Iterator<IJV> iter = 
matMultOutBlock.sparseBlock.getIterator();
                                final int outOffset = 
n*params.K*params.P*params.Q;
                                while(iter.hasNext()) {
@@ -498,6 +353,7 @@ public class LibMatrixDNN {
                        else
                                System.arraycopy(matMultOutBlock.denseBlock, 0, 
params.output.denseBlock, destPos, length);
                }
+               // 
-----------------------------------------------------------------------------
        }
        
        
@@ -513,7 +369,7 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Incorrect dout 
dimensions in maxpooling_backward:" + input.getNumRows() + " " + 
input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q);
                }
                
-               if(DMLScript.STATISTICS) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        if(input.isInSparseFormat() || dout.isInSparseFormat()) 
{
                                maxPoolBwdSparseCount.addAndGet(1);
                        }
@@ -525,16 +381,7 @@ public class LibMatrixDNN {
                if (params.output.isInSparseFormat())
                        throw new DMLRuntimeException("Sparse 
maxpooling_backward is not supported");
 
-               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-                       warnSingleThreaded();
-                       for (int n = 0; n < params.N; n++) {
-                               doPoolingBackward(n, params);
-                       }
-               }
-               else {
-                       runConvTask(constrainedNumThreads, 1, 
TaskType.MaxPooling_Backward, params);
-               }
+               runConvTask(TaskType.MaxPooling_Backward, params);
        }
        
        private static void doPoolingBackward(int n, ConvolutionParameters 
params) throws DMLRuntimeException {
@@ -775,19 +622,7 @@ public class LibMatrixDNN {
                }
                
                params.outputNNZ.set(0);
-               
-               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-               if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
-                       warnSingleThreaded();
-                       for (int n = 0; n < params.N; n++) {
-                               for (int c = 0; c < params.C; c++) {
-                                       doPooling(n, c, params);
-                               }
-                       }
-               }
-               else {
-                       runConvTask(constrainedNumThreads, params.C, 
TaskType.MaxPooling_Forward, params);
-               }
+               runConvTask(TaskType.MaxPooling_Forward, params);
                outputBlock.setNonZeros(params.outputNNZ.get());
        }
 
@@ -866,143 +701,161 @@ public class LibMatrixDNN {
                }
        }
        
-       private static int [] getTaskSize(int constrainedNumThreads, int 
maxNumTaskSize1, int maxNumTaskSize2) {
-               int taskSize1 = 1; int taskSize2 = 1;
-               // Why this heuristics ? To reduce the impact of the 
thread-creation overhead in case of small tasks
-               int approxNumTasksToCreate = 3*constrainedNumThreads;
-               while((maxNumTaskSize1*maxNumTaskSize2)/(taskSize1*taskSize2) > 
approxNumTasksToCreate) {
-                       // Possibility of creating too many tasks, increase 
taskSize2
-                       taskSize2 *= 2;
-                       if(taskSize2 >= maxNumTaskSize2) {
-                               taskSize2 = maxNumTaskSize2;
-                               break;
+       // 
----------------------------------------------------------------------------------------------------------------
+       private static void addMatrixBlocks(int poolSize, TaskType type, 
ConvolutionParameters params, 
+                       ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, 
ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
+                       ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
+               for(int i = 0; i < poolSize; i++) {
+                       if(type == TaskType.LoopedIm2ColConv2d || type == 
TaskType.LoopedIm2ColConv2dBwdFilter) {
+                               MatrixBlock im2ColOutBlock = new 
MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+                               im2ColOutBlock.allocateDenseBlock(true);
+                               im2ColOutBlocks.add(im2ColOutBlock);
                        }
-               }
-               while((maxNumTaskSize1*maxNumTaskSize2)/(taskSize1*taskSize2) > 
approxNumTasksToCreate) {
-                       // Possibility of creating too many tasks, increase 
taskSize1
-                       taskSize1 *= 2;
-                       if(taskSize1 >= maxNumTaskSize1) {
-                               taskSize1 = maxNumTaskSize1;
-                               break;
+                       
+                       if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
+                               MatrixBlock partialRetBlock = new 
MatrixBlock(params.K, params.C*params.R*params.S, false);
+                               partialRetBlock.allocateDenseBlock(true);
+                               partialRetBlocks.add(partialRetBlock);
+                       }
+                       
+                       if(type == TaskType.LoopedIm2ColConv2dBwdData || type 
== TaskType.LoopedIm2ColConv2dBwdFilter) {
+                               MatrixBlock doutReshapedBlock = new 
MatrixBlock(params.P*params.Q, params.K, false);
+                               doutReshapedBlock.allocateDenseBlock(true);
+                               doutReshapedBlocks.add(doutReshapedBlock);
                        }
                }
-               int [] ret = new int[2];
-               ret[0] = taskSize1;
-               ret[1] = taskSize2;
-               return ret;
-       }
-       
-       private static void runSequentialConvTask(int NSize, int Z, TaskType 
type, ConvolutionParameters params) throws DMLRuntimeException {
-               ConvTask task = new ConvTask(0, NSize, 0, Z, type, params);
-               warnSingleThreaded();
-               try {
-                       task.call();
-               } catch (Exception e) {
-                       throw new DMLRuntimeException("Error while executing 
single-threaded " + type.name(), e);
-               }
-       }
-       
-       private static void runConvTask(int constrainedNumThreads, int Z, 
TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
-               if (params.isOutputThreadSafe() && constrainedNumThreads > 1)
-                       runParallelConvTask(constrainedNumThreads, params.N, Z, 
type, params);
-               else
-                       runSequentialConvTask(params.N, Z, type, params);
        }
-       
-       private static void runParallelConvTask(int constrainedNumThreads, int 
NSize, int Z, TaskType type, ConvolutionParameters params) throws 
DMLRuntimeException {
-               ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
-               if(NSize >= constrainedNumThreads || Z == 1) {
-                       int numNTasks = (int) Math.ceil(((double) NSize) / 
constrainedNumThreads);
-                       for (int n = 0; n < NSize; n += numNTasks) {
-                               tasks.add(new ConvTask(n, Math.min(NSize, 
n+numNTasks), 0, Z, type, params));
+       // Methods to execute convolution-related tasks using multiple threads.
+       private static void runConvTask(TaskType type, ConvolutionParameters 
params) throws DMLRuntimeException {
+               int constrainedNumThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks = new 
ConcurrentLinkedQueue<MatrixBlock>();
+               ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks = new 
ConcurrentLinkedQueue<MatrixBlock>();
+               ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks = new 
ConcurrentLinkedQueue<MatrixBlock>();
+               if (ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && 
constrainedNumThreads > 1) {
+                       int poolSize = Math.min(constrainedNumThreads, 
params.N);
+                       addMatrixBlocks(poolSize, type, params, 
im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
+                       ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
+                       int NSize = params.N - poolSize;
+                       if(NSize >= constrainedNumThreads) {
+                               for(int n = 0; n < params.N; n++) 
+                                       tasks.add(new ConvTask(n, n+1, type, 
params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
                        }
-               }
-               else {
-                       int [] taskSizes = getTaskSize(constrainedNumThreads, 
NSize, Z);
-                       for (int n = 0; n < NSize; n += taskSizes[0]) {
-                               for (int z = 0; z < Z; z += taskSizes[1]) {
-                                       tasks.add(new ConvTask(n, 
Math.min(NSize, n+taskSizes[0]), z, Math.min(Z, z+taskSizes[1]), type, params));
+                       else {
+                               int numNTasks = (int) Math.ceil(((double) 
NSize) / constrainedNumThreads);
+                               for (int n = 0; n < NSize; n += numNTasks) {
+                                       tasks.add(new ConvTask(n, 
Math.min(NSize, n+numNTasks), type, params, im2ColOutBlocks, 
doutReshapedBlocks, partialRetBlocks));
                                }
+                               for (int n = NSize; n < params.N; n++)
+                                       tasks.add(new ConvTask(n, n+1, type, 
params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
+                       }
+                       
+                       ExecutorService pool = Executors.newFixedThreadPool( 
poolSize );
+                       List<Future<Object>> taskret;
+                       try {
+                               taskret = pool.invokeAll(tasks);
+                               pool.shutdown();
+                               for( Future<Object> task : taskret ) {
+                                       task.get();
+                               }
+                               if(type == 
TaskType.LoopedIm2ColConv2dBwdFilter) {
+                                       for(MatrixBlock partialRetBlock : 
partialRetBlocks) {
+                                               
elementWiseInPlaceAddition(params.output, partialRetBlock);
+                                       }
+                               }
+                       } catch (InterruptedException e) {
+                               throw new DMLRuntimeException("Error while 
executing multi-threaded " + type.name(), e);
+                       } catch (ExecutionException e) {
+                               throw new DMLRuntimeException("Error while 
executing multi-threaded " + type.name(), e);
                        }
-                       LOG.debug("Reduce number of tasks from " + (NSize*Z)  + 
"(" + NSize + "," + Z + ") to " + tasks.size());
                }
-
-               ExecutorService pool = Executors.newFixedThreadPool( 
Math.min(constrainedNumThreads, tasks.size()) );
-               List<Future<Object>> taskret;
-               try {
-                       taskret = pool.invokeAll(tasks);
-                       pool.shutdown();
-                       for( Future<Object> task : taskret ) {
-                               switch(type) {
-                                       case LoopedIm2ColConv2dBwdFilter:
-                                               
elementWiseInPlaceAddition(params.output, (MatrixBlock) task.get());
-                                               break;
-                                       default:
-                                               task.get();
+               else {
+                       addMatrixBlocks(1, type, params, im2ColOutBlocks, 
doutReshapedBlocks, partialRetBlocks);
+                       ConvTask task = new ConvTask(0, 0, type, params, 
im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
+                       try {
+                               for(int n = 0; n < params.N; n++) {
+                                       task.n1 = n;
+                                       task.n2 = n+1;
+                                       task.call();
+                               }
+                               if(type == 
TaskType.LoopedIm2ColConv2dBwdFilter) {
+                                       for(MatrixBlock partialRetBlock : 
partialRetBlocks) {
+                                               
elementWiseInPlaceAddition(params.output, partialRetBlock);
+                                       }
                                }
+                       } catch (Exception e) {
+                               throw new DMLRuntimeException("Error while 
executing single-threaded " + type.name(), e);
                        }
-               } catch (InterruptedException e) {
-                       throw new DMLRuntimeException("Error while executing 
multi-threaded " + type.name(), e);
-               } catch (ExecutionException e) {
-                       throw new DMLRuntimeException("Error while executing 
multi-threaded " + type.name(), e);
                }
        }
+       // 
----------------------------------------------------------------------------------------------------------------
        
+       /**
+        * The ConvTask allows the convolution operations (such as conv2d, 
conv2d_backward, maxpooling, etc.)
+        * to be executed in a multi-threaded manner.
+        * 
+        */
        private static class ConvTask implements Callable<Object> {
-               int n1; int n2; int z1; int z2; 
+               public int n1; public int n2; 
                ConvolutionParameters params;
                TaskType type;
-               public ConvTask(int n1, int n2, int z1, int z2, TaskType type, 
ConvolutionParameters params) {
+               ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks;
+               ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks;
+               ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks;
+               public ConvTask(int n1, int n2, TaskType type, 
ConvolutionParameters params, 
+                               ConcurrentLinkedQueue<MatrixBlock> 
im2ColOutBlocks,
+                               ConcurrentLinkedQueue<MatrixBlock> 
doutReshapedBlocks,
+                               ConcurrentLinkedQueue<MatrixBlock> 
partialRetBlocks) {
                        this.n1 = n1;
                        this.n2 = n2;
-                       this.z1 = z1;
-                       this.z2 = z2;
                        this.type = type;
                        this.params = params;
+                       this.im2ColOutBlocks = im2ColOutBlocks;
+                       this.partialRetBlocks = partialRetBlocks;
+                       this.doutReshapedBlocks = doutReshapedBlocks;
                }
                
                @Override
                public Object call() throws DMLRuntimeException {
                        switch(type) {
                                case MaxPooling_Forward:
-                                       for (int n = n1; n < n2; n++) {
-                                               for (int z = z1; z < z2; z++) {
-                                                       doPooling(n, z, params);
+                               {
+                                       for(int n = n1; n < n2; n++) {
+                                               for (int c = 0; c < params.C; 
c++) {
+                                                       doPooling(n, c, params);
                                                }
                                        }
                                        break;
+                               }
                                case MaxPooling_Backward:
-                                       for (int n = n1; n < n2; n++) {
+                                       for(int n = n1; n < n2; n++) 
                                                doPoolingBackward(n, params);
-                                       }
                                        break;
                                case LoopedIm2ColConv2d:
-                                       MatrixBlock im2ColOutBlock = new 
MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-                                       im2ColOutBlock.allocateDenseBlock(true);
-                                       for (int n = n1; n < n2; n++) {
+                               {       
+                                       MatrixBlock im2ColOutBlock = 
im2ColOutBlocks.remove();
+                                       for(int n = n1; n < n2; n++) 
                                                doLoopedIm2ColConv2d(n, 
im2ColOutBlock, params);
-                                       }
+                                       im2ColOutBlocks.add(im2ColOutBlock);
                                        break;
+                               }
                                case LoopedIm2ColConv2dBwdFilter:
                                {
-                                       MatrixBlock im2ColOutBlock1 = new 
MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-                                       
im2ColOutBlock1.allocateDenseBlock(true);
-                                       MatrixBlock partialRetBlock = new 
MatrixBlock(params.K, params.C*params.R*params.S, false);
-                                       
partialRetBlock.allocateDenseBlock(true);
-                                       MatrixBlock dout_reshaped = new 
MatrixBlock(params.P*params.Q, params.K, false);
-                                       dout_reshaped.allocateDenseBlock(true);
-                                       for (int n = n1; n < n2; n++) {
-                                               partialRetBlock = 
doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock1, dout_reshaped, 
partialRetBlock, params);
-                                       }
-                                       return partialRetBlock;
+                                       MatrixBlock im2ColOutBlock = 
im2ColOutBlocks.remove();
+                                       MatrixBlock partialRetBlock = 
partialRetBlocks.remove();
+                                       MatrixBlock doutReshapedBlock = 
doutReshapedBlocks.remove();
+                                       for(int n = n1; n < n2; n++) 
+                                               partialRetBlock = 
doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, 
partialRetBlock, params);
+                                       im2ColOutBlocks.add(im2ColOutBlock);
+                                       partialRetBlocks.add(partialRetBlock);
+                                       
doutReshapedBlocks.add(doutReshapedBlock);
+                                       break;
                                }
                                case LoopedIm2ColConv2dBwdData:
                                {
-                                       MatrixBlock dout_reshaped = new 
MatrixBlock(params.P*params.Q, params.K, false);
-                                       dout_reshaped.allocateDenseBlock(true);
-                                       for (int n = n1; n < n2; n++) {
-                                               doLoopedIm2ColConv2dBwdData(n, 
dout_reshaped, params);
-                                       }
+                                       MatrixBlock doutReshapedBlock = 
doutReshapedBlocks.remove();
+                                       for(int n = n1; n < n2; n++) 
+                                               doLoopedIm2ColConv2dBwdData(n, 
doutReshapedBlock, params);
+                                       
doutReshapedBlocks.add(doutReshapedBlock);
                                        break;
                                }
                                default:
@@ -1092,69 +945,102 @@ public class LibMatrixDNN {
                        }
                }
        }
-               
-       private static long doIm2colOverInputPath_NCHW(int n, int c, 
MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException {
+       
+       private static void doIm2colDense(int n, double [] inputArray, double 
[] outputArray, ConvolutionParameters params) {
+               int CRS = params.C * params.R * params.S;
+               final int nOffset = n * params.C*params.H*params.W;
+               if (params.stride_h == 1 && params.stride_w == 1 && 
params.pad_h == 0 && params.pad_w == 0) {
+                       for (int c = 0; c < CRS; ++c) {
+                               int wOffset = c % params.S;
+                               int hOffset = (c / params.S) % params.R;
+                               int cInput = c / params.R / params.S;
+                               for (int h = 0; h < params.P; ++h) {
+                                       int hPadded = h + hOffset;
+                                       int outOffset = (c * params.P + h) * 
params.Q;
+                                       int inputOffset = nOffset + (cInput * 
params.H + hPadded) * params.W;
+                                       System.arraycopy(inputArray, 
inputOffset + wOffset, outputArray, outOffset, params.Q);
+                                       int w = params.Q - 1;
+                                       int wPadded = w + wOffset;
+                                       if (hPadded < params.H && wPadded < 
params.W)
+                                               outputArray[outOffset + w] = 
inputArray[inputOffset + wPadded];
+                                       else
+                                               outputArray[outOffset + w] = 0;
+                               }
+                       }
+               } else {
+                       for (int c = 0; c < CRS; ++c) {
+                               int wOffset = c % params.S;
+                               int hOffset = (c / params.S) % params.R;
+                               int cInput = c / params.R / params.S;
+                               for (int h = 0; h < params.P; ++h) {
+                                       int outOffset = (c * params.P + h) * 
params.Q;
+                                       int hPadded = h * params.stride_h - 
params.pad_h + hOffset;
+                                       int inputOffset = nOffset + (cInput * 
params.H + hPadded) * params.W;
+                                       if (hPadded < 0 || hPadded >= params.H) 
{
+                                               Arrays.fill(outputArray, 
outOffset, outOffset+params.Q, 0);
+                                       } else {
+                                               for (int w = 0; w < params.Q; 
++w) {
+                                                       int wPadded = w * 
params.stride_w - params.pad_w + wOffset;
+                                                       if (wPadded >= 0 && 
wPadded < params.W)
+                                                               
outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
+                                                       else
+                                                               
outputArray[outOffset + w] = 0;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       // Keeping this as a separate sparse method to allow for further dense 
optimizations
+       private static void doIm2colSparse(int n, MatrixBlock input, double [] 
outputArray, ConvolutionParameters params) {
+               int CRS = params.C * params.R * params.S;
+               // final int nOffset = n * params.C*params.H*params.W;
+               for (int c = 0; c < CRS; ++c) {
+                       int wOffset = c % params.S;
+                       int hOffset = (c / params.S) % params.R;
+                       int cInput = c / params.R / params.S;
+                       for (int h = 0; h < params.P; ++h) {
+                               int outOffset = (c * params.P + h) * params.Q;
+                               int hPadded = h * params.stride_h - 
params.pad_h + hOffset;
+                               int tempOffset = (cInput * params.H + hPadded) 
* params.W;
+                               // int inputOffset = nOffset + tempOffset;
+                               if (hPadded < 0 || hPadded >= params.H) {
+                                       Arrays.fill(outputArray, outOffset, 
outOffset+params.Q, 0);
+                               } else {
+                                       for (int w = 0; w < params.Q; ++w) {
+                                               int wPadded = w * 
params.stride_w - params.pad_w + wOffset;
+                                               if (wPadded >= 0 && wPadded < 
params.W) {
+                                                       // NOTE: Potential 
performance bottleneck as we have to do binary search to getValue
+                                                       outputArray[outOffset + 
w] = input.getValue(n, tempOffset + wPadded);
+                                               }
+                                               else
+                                                       outputArray[outOffset + 
w] = 0;
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       private static void doIm2col(int n, MatrixBlock output, 
ConvolutionParameters params) throws DMLRuntimeException {
                double [] inputArray = null;
                if (!params.input1.isInSparseFormat())
                        inputArray = params.input1.getDenseBlock();
                double [] outputArray = null;
-               if(output == null && !params.output.isInSparseFormat())
-                       outputArray = params.output.getDenseBlock();
-               else if(output != null && !output.isInSparseFormat())
+               if(!output.isInSparseFormat())
                        outputArray = output.getDenseBlock();
-               else {
+               else 
                        throw new DMLRuntimeException("Sparse output is not 
supported for im2col");
-               }
                
-               final int inputOffset = n*params.C*params.H*params.W + 
c*params.H*params.W;
-               int outputOffset;
-               if(output == null)
-                       outputOffset = (c*params.R*params.S*params.N + 
n)*params.P*params.Q;
+               if(inputArray != null)
+                       doIm2colDense(n, inputArray, outputArray, params);
                else
-                       outputOffset = (c*params.R*params.S)*params.P*params.Q;
-               
-               long tmpNNZ = 0;
-               for (int r = 0; r < params.R; r++) { // Get an input patch of 
size R X S
-                       for (int s = 0; s < params.S; s++) {
-                               int localIndex;
-                               if(output == null)
-                                       localIndex = outputOffset + 
((r*params.S*params.N + s*params.N)*params.P*params.Q);
-                               else
-                                       localIndex = outputOffset + 
((r*params.S + s)*params.P*params.Q);
-                               
-                               int input_row = r - params.pad_h;
-                               // And copy it to outputArray[i] (taking care 
of padding & striding)
-                               for (int p = params.P; p > 0; p--) {
-                                       if (input_row >= 0 && input_row < 
params.H) {
-                                               int input_col = s - 
params.pad_w;
-                                               for (int q = params.Q; q > 0; 
q--, localIndex++) {
-                                                       if (input_col >= 0 && 
input_col < params.W) {
-                                                               // Copy from 
[channel c, height input_row, width input_col]
-                                                               if(inputArray 
!= null)
-                                                                       
outputArray[localIndex] = inputArray[inputOffset + input_row*params.W + 
input_col];
-                                                               else
-                                                                       
outputArray[localIndex] = params.input1.quickGetValue(n, c*params.H*params.W + 
input_row*params.W + input_col);
-                                                               
if(outputArray[localIndex] != 0)
-                                                                       
tmpNNZ++;
-                                                       }
-                                                       else 
if(params.reuseNonZeroedOutput) {
-                                                               
outputArray[localIndex] = 0;
-                                                       }
-                                                       input_col += 
params.stride_w;
-                                               }
-                                       } else {
-                                               if(params.reuseNonZeroedOutput) 
{
-                                                       for(int i = localIndex; 
i < localIndex + params.Q; i++) {
-                                                               
outputArray[localIndex] = 0;
-                                                       }
-                                               }
-                                               localIndex += params.Q;
-                                       }
-                                       input_row += params.stride_h;
-                               }
-                       }
-               }
-               
-               return tmpNNZ;
+                       doIm2colSparse(n, params.input1, outputArray, params);
        }
+       
+       // 
------------------------------------------------------------------------------------------------
+       // Used in integration tests. Please do not edit them
+       public static boolean TEST_SPARSE_INPUT = false;
+       public static boolean TEST_SPARSE_FILTER = false;
+       // 
------------------------------------------------------------------------------------------------
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index db53ef6..c8576f2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -33,7 +33,6 @@ import java.util.Iterator;
 
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -334,8 +333,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                        allocateDenseBlock();
        }
        
-       @SuppressWarnings("unused")
-       public void allocateDenseBlock(boolean clearNNZ, boolean zeroOut) 
+       public void allocateDenseBlock(boolean clearNNZ) 
                        throws RuntimeException 
        {
                long limit = (long)rlen * clen;
@@ -348,17 +346,10 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                }
                
                //allocate block if non-existing or too small (guaranteed to be 
0-initialized),
-               if(!zeroOut && DMLScript.REUSE_NONZEROED_OUTPUT 
-                       && (denseBlock == null || denseBlock.length < limit)
-                       && rlen != 1 && clen != 1 ) // Not a column vector 
-               {
-                       denseBlock = LibMatrixDNN.getReuseableData(limit);
-               }
                if(denseBlock == null || denseBlock.length < limit) {
                        denseBlock = new double[(int)limit];
                }
                
-               
                //clear nnz if necessary
                if( clearNNZ ) {
                        nonZeros = 0;
@@ -367,12 +358,6 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                sparse = false;
        }
 
-       public void allocateDenseBlock(boolean clearNNZ) 
-               throws RuntimeException 
-       {
-               allocateDenseBlock(clearNNZ, true);
-       }
-
        public void allocateSparseRowsBlock() {
                allocateSparseRowsBlock(true);
        }
@@ -1045,7 +1030,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
        ////////
        // basic block handling functions       
 
-       private void denseToSparse() 
+       void denseToSparse() 
        {       
                //set target representation
                sparse = true;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index 38b5e35..0af3c44 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -39,6 +39,7 @@ import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.MRJobInstruction;
 import org.apache.sysml.runtime.instructions.cp.FunctionCallCPInstruction;
 import org.apache.sysml.runtime.instructions.spark.SPInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
 
 /**
  * This class captures all statistics.
@@ -100,10 +101,6 @@ public class Statistics
        private static AtomicLong lTotalLix = new AtomicLong(0);
        private static AtomicLong lTotalLixUIP = new AtomicLong(0);
        
-       
-       private static AtomicLong denseBlockAllocationTime = new AtomicLong(0);
-       private static AtomicLong sparseBlockAllocationTime = new AtomicLong(0);
-       
        public static long cudaInitTime = 0;
        public static long cudaLibrariesInitTime = 0;
        public static AtomicLong cudaConversionTime = new AtomicLong(0);        
// Measures time spent in converting between sparse block types
@@ -118,13 +115,6 @@ public class Statistics
        public static AtomicLong cudaFromDevCount = new AtomicLong(0);
        public static AtomicLong cudaEvictionCount = new AtomicLong(0);
        
-       public static void incrementAllocationTime(long allocationTime, boolean 
isSparse) {
-               if(isSparse)
-                       sparseBlockAllocationTime.addAndGet(allocationTime);
-               else
-                       denseBlockAllocationTime.addAndGet(allocationTime);
-       }
-       
        public static synchronized void setNoOfExecutedMRJobs(int 
iNoOfExecutedMRJobs) {
                Statistics.iNoOfExecutedMRJobs = iNoOfExecutedMRJobs;
        }
@@ -371,9 +361,6 @@ public class Statistics
                resetJVMgcCount();
                resetCPHeavyHitters();
                
-               denseBlockAllocationTime.set(0);
-               sparseBlockAllocationTime.set(0);
-               
                cudaInitTime = 0;
                cudaLibrariesInitTime = 0;
                cudaAllocTime.set(0);
@@ -385,6 +372,7 @@ public class Statistics
                cudaToDevCount.set(0);
                cudaFromDevCount.set(0);
                cudaEvictionCount.set(0);
+               LibMatrixDNN.resetStatistics();
        }
 
        public static void resetJITCompileTime(){
@@ -648,10 +636,6 @@ public class Statistics
                        sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + 
CacheStatistics.displayHits() + ".\n");
                        sb.append("Cache writes (WB, FS, HDFS):\t" + 
CacheStatistics.displayWrites() + ".\n");
                        sb.append("Cache times (ACQr/m, RLS, EXP):\t" + 
CacheStatistics.displayTime() + " sec.\n");
-                       if(DMLScript.REUSE_NONZEROED_OUTPUT) {
-                               sb.append("Allocation time (Dense/Sparse):\t" + 
String.format("%.3f", denseBlockAllocationTime.doubleValue()/1000000000) 
-                                               + "/" + String.format("%.3f", 
sparseBlockAllocationTime.doubleValue()/1000000000)  + " sec.\n");
-                       }
                        sb.append("HOP DAGs recompiled (PRED, SB):\t" + 
getHopRecompiledPredDAGs() + "/" + getHopRecompiledSBDAGs() + ".\n");
                        sb.append("HOP DAGs recompile time:\t" + 
String.format("%.3f", ((double)getHopRecompileTime())/1000000000) + " sec.\n");
                        if( getFunRecompiles()>0 ) {
@@ -681,6 +665,7 @@ public class Statistics
                        sb.append("Total JIT compile time:\t\t" + 
((double)getJITCompileTime())/1000 + " sec.\n");
                        sb.append("Total JVM GC count:\t\t" + getJVMgcCount() + 
".\n");
                        sb.append("Total JVM GC time:\t\t" + 
((double)getJVMgcTime())/1000 + " sec.\n");
+                       LibMatrixDNN.appendStatistics(sb);
                        sb.append("Heavy hitter instructions (name, time, 
count):\n" + getHeavyHitters(maxHeavyHitters));
                }
                

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8c069ed8/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
index e247d08..1b516db 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
@@ -23,6 +23,7 @@ import java.util.HashMap;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
 import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
 import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysml.test.integration.AutomatedTestBase;
 import org.apache.sysml.test.integration.TestConfiguration;
@@ -47,35 +48,83 @@ public class Conv2DTest extends AutomatedTestBase
        public void testConv2DDense1() 
        {
                int numImg = 5; int imgSize = 3; int numChannels = 3; int 
numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense2() 
        {
                int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense3() 
        {
                int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense4() 
        {
                int numImg = 3; int imgSize = 10; int numChannels = 1; int 
numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
        }
        
        @Test
        public void testConv2DDense5() 
        {
                int numImg = 3; int imgSize = 8; int numChannels = 2; int 
numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
-               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad);
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
+       }
+       
+       @Test
+       public void testConv2DDense6() 
+       {
+               int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 1; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
+       }
+       
+       @Test
+       public void testConv2DDense7() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 1; int 
numFilters = 3; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, false);
+       }
+       
+       @Test
+       public void testConv2DSparse1() 
+       {
+               int numImg = 5; int imgSize = 3; int numChannels = 3; int 
numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DSparse2() 
+       {
+               int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DSparse3() 
+       {
+               int numImg = 1; int imgSize = 10; int numChannels = 4; int 
numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       public void testConv2DSparse4() 
+       {
+               int numImg = 3; int imgSize = 10; int numChannels = 1; int 
numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
+       }
+       
+       @Test
+       public void testConv2DSparse5() 
+       {
+               int numImg = 3; int imgSize = 8; int numChannels = 2; int 
numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
+               runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, 
numFilters, filterSize, stride, pad, true);
        }
        
        /**
@@ -84,57 +133,64 @@ public class Conv2DTest extends AutomatedTestBase
         * @param sparse
         */
        public void runConv2DTest( ExecType et, int imgSize, int numImg, int 
numChannels, int numFilters, 
-                       int filterSize, int stride, int pad) 
+                       int filterSize, int stride, int pad, boolean sparse) 
        {
                RUNTIME_PLATFORM oldRTP = rtplatform;
                        
                boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
                
-               try
-               {
-                   TestConfiguration config = getTestConfiguration(TEST_NAME);
-                   if(et == ExecType.SPARK) {
-                       rtplatform = RUNTIME_PLATFORM.SPARK;
-                   }
-                   else {
-                       rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP 
: RUNTIME_PLATFORM.SINGLE_NODE;
-                   }
-                       if( rtplatform == RUNTIME_PLATFORM.SPARK )
-                               DMLScript.USE_LOCAL_SPARK_CONFIG = true;
-                       
-                       loadTestConfiguration(config);
-               
-                       /* This is for running the junit test the new way, 
i.e., construct the arguments directly */
-                       String RI_HOME = SCRIPT_DIR + TEST_DIR;
-                       fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
-                       
-                       
-                       programArgs = new String[]{"-explain", "-args",  "" + 
imgSize, "" + numImg, 
-                               "" + numChannels, "" + numFilters, 
-                               "" + filterSize, "" + stride, "" + pad, 
-                               output("B")};
-                       
-                       fullRScriptName = RI_HOME + TEST_NAME + ".R";
-                       rCmd = "Rscript" + " " + fullRScriptName + " " + 
imgSize + " " + numImg + 
-                                       " " + numChannels + " " + numFilters + 
-                                       " " + filterSize + " " + stride + " " + 
pad + " " + expectedDir(); 
-                       
-                       boolean exceptionExpected = false;
-                       int expectedNumberOfJobs = -1;
-                       runTest(true, exceptionExpected, null, 
expectedNumberOfJobs);
-
-                       // Run comparison R script
-                       runRScript(true);
-                       HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
-                       
-                       HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromHDFS("B");
-                       TestUtils.compareMatrices(dmlfile, bHM, epsilon, 
"B-DML", "B-R");
-                       
-               }
-               finally
-               {
-                       rtplatform = oldRTP;
-                       DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+               synchronized(LibMatrixDNN.class) {
+                       try
+                       {
+                               LibMatrixDNN.TEST_SPARSE_INPUT = true;
+                               LibMatrixDNN.TEST_SPARSE_FILTER = true;
+                               
+                           TestConfiguration config = 
getTestConfiguration(TEST_NAME);
+                           if(et == ExecType.SPARK) {
+                               rtplatform = RUNTIME_PLATFORM.SPARK;
+                           }
+                           else {
+                               rtplatform = (et==ExecType.MR)? 
RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
+                           }
+                               if( rtplatform == RUNTIME_PLATFORM.SPARK )
+                                       DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+                               
+                               loadTestConfiguration(config);
+                       
+                               /* This is for running the junit test the new 
way, i.e., construct the arguments directly */
+                               String RI_HOME = SCRIPT_DIR + TEST_DIR;
+                               fullDMLScriptName = RI_HOME + TEST_NAME + 
".dml";
+                               
+                               
+                               programArgs = new String[]{"-explain", "-args", 
 "" + imgSize, "" + numImg, 
+                                       "" + numChannels, "" + numFilters, 
+                                       "" + filterSize, "" + stride, "" + pad, 
+                                       output("B")};
+                               
+                               fullRScriptName = RI_HOME + TEST_NAME + ".R";
+                               rCmd = "Rscript" + " " + fullRScriptName + " " 
+ imgSize + " " + numImg + 
+                                               " " + numChannels + " " + 
numFilters + 
+                                               " " + filterSize + " " + stride 
+ " " + pad + " " + expectedDir(); 
+                               
+                               boolean exceptionExpected = false;
+                               int expectedNumberOfJobs = -1;
+                               runTest(true, exceptionExpected, null, 
expectedNumberOfJobs);
+       
+                               // Run comparison R script
+                               runRScript(true);
+                               HashMap<CellIndex, Double> bHM = 
readRMatrixFromFS("B");
+                               
+                               HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromHDFS("B");
+                               TestUtils.compareMatrices(dmlfile, bHM, 
epsilon, "B-DML", "B-R");
+                               
+                       }
+                       finally
+                       {
+                               rtplatform = oldRTP;
+                               DMLScript.USE_LOCAL_SPARK_CONFIG = 
sparkConfigOld;
+                               LibMatrixDNN.TEST_SPARSE_INPUT = false;
+                               LibMatrixDNN.TEST_SPARSE_FILTER = false;
+                       }
                }
        }
 }

Reply via email to