http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 2b9335c..59ac29e 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -191,7 +191,7 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
 //             // TODO: Inserting reblock requires knowing columns apriori
 //             ConvolutionTransform transform1 = new 
ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, 
getDataType(), getValueType(), et, k);
 //             setReblockedOutputDimension(et, transform1);
-               ConvolutionTransform transform1 = new ConvolutionTransform(in, 
lopOp, getDataType(), getValueType(), et, k);
+               ConvolutionTransform transform1 = new ConvolutionTransform(in, 
lopOp, getDataType(), getValueType(), et, k, computeIntermediateMemEstimate(-1, 
-1, -1 ));
                setOutputDimensions(transform1);
                
                setLineNumbers(transform1);
@@ -223,13 +223,171 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 
sparsity);
        }
        
+       // ---------------------------------------------------------------
+       // Utility methods to guard the computation of memory estimates in 
presence of unknowns
+       private static class IntermediateDimensions {
+               int dim1; int dim2; double sp;
+               public IntermediateDimensions(ConvolutionOp h, String dim1Str, 
String dim2Str, double sp) {
+                       dim1 = (int) h.getDim(dim1Str);
+                       dim2 = (int) h.getDim(dim2Str);
+                       this.sp = sp;
+               }
+               public IntermediateDimensions(ConvolutionOp h, String dim1Str, 
String dim2Str) {
+                       dim1 = (int) h.getDim(dim1Str);
+                       dim2 = (int) h.getDim(dim2Str);
+                       sp = 1;
+               }
+               public IntermediateDimensions(ConvolutionOp h, int dim1, String 
dim2Str) {
+                       this.dim1 = dim1;
+                       dim2 = (int) h.getDim(dim2Str);
+                       sp = 1;
+               }
+               
+               /**
+                * Add two computed memory estimates
+                * 
+                * @param val1 memory estimate 1
+                * @param val2 memory estimate 2
+                * @return sum of memory estimates
+                */
+               static double guardedAdd(double val1, double val2) {
+                       if(val1 < 0 || val2 < 0) return 
OptimizerUtils.DEFAULT_SIZE;
+                       double ret = val1 + val2;
+                       if(ret >= OptimizerUtils.DEFAULT_SIZE) return 
OptimizerUtils.DEFAULT_SIZE;
+                       else return ret;
+               }
+               
+               /**
+                * Compute memory estimates for given intermediate matrices 
+                * 
+                * @param intermediates list of intermediates
+                * @param numWorkers number of workers
+                * @return memory estimate
+                */
+               public static double 
addEstimateSizes(ArrayList<IntermediateDimensions> intermediates, int 
numWorkers) {
+                       double memBudget = 0; 
+                       for(int i = 0; i < intermediates.size(); i++) {
+                               memBudget = guardedAdd(memBudget, 
OptimizerUtils.estimateSizeExactSparsity(
+                                               intermediates.get(i).dim1, 
intermediates.get(i).dim2, intermediates.get(i).sp)*numWorkers);
+                       }
+                       return memBudget;
+               }
+               
+               /**
+                * Compute max of two computed memory estimates
+                * @param val1 memory estimate 1
+                * @param val2 memory estimate 2
+                * @return max of memory estimates
+                */
+               public static double guardedMax(double val1, double val2) {
+                       if(val1 < 0 || val2 < 0) return 
OptimizerUtils.DEFAULT_SIZE;
+                       double ret = Math.max(val1, val2);
+                       if(ret >= OptimizerUtils.DEFAULT_SIZE) return 
OptimizerUtils.DEFAULT_SIZE;
+                       else return ret;
+               }
+       }
+       
+       /**
+        * Helper utility to compute intermediate memory estimate
+        * 
+        * @param gpuIntermediates intermediates for GPU
+        * @param cpIntermediates intermediates for CP
+        * @return memory estimates
+        */
+       private double computeIntermediateMemEstimateHelper(
+                       ArrayList<IntermediateDimensions> gpuIntermediates,
+                       ArrayList<IntermediateDimensions> cpIntermediates) {
+               // Since CP operators use row-level parallelism by default
+               int numWorkers = (int) 
Math.min(OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), 
Math.max(getDim("N"), 1));
+               if(DMLScript.USE_ACCELERATOR) {
+                       // Account for potential sparse-to-dense conversion
+                       double gpuMemBudget = 
IntermediateDimensions.addEstimateSizes(gpuIntermediates, 1);
+                       double cpMemoryBudget = 
IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+                       if(cpMemoryBudget > gpuMemBudget) {
+                               double oneThreadCPMemBudget = 
IntermediateDimensions.addEstimateSizes(cpIntermediates, 1);
+                               if(oneThreadCPMemBudget <= gpuMemBudget) {
+                                       // Why limit CPU? In order to give 
more opportunity to compile GPU operators
+                                       cpMemoryBudget = oneThreadCPMemBudget;
+                               }
+                       }
+                       // Finally, use the maximum of CP and GPU memory budget
+                       return 
IntermediateDimensions.guardedMax(cpMemoryBudget, gpuMemBudget);
+               }
+               else {
+                       // When -gpu flag is not provided, the memory estimates 
for CP are not affected.
+                       return 
IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+               }
+       }
+       
        @Override
-       protected double computeIntermediateMemEstimate( long dim1, long dim2, 
long nnz )
+       protected double computeIntermediateMemEstimate( long ignoreDim1, long 
ignoreDim2, long ignoreNnz )
        {       
-               //default: no intermediate memory requirements
-               return 0;
+               ArrayList<IntermediateDimensions> gpuIntermediates = new 
ArrayList<IntermediateDimensions>();
+               ArrayList<IntermediateDimensions> cpIntermediates = new 
ArrayList<IntermediateDimensions>();
+               if(getOp() == ConvOp.DIRECT_CONV2D) {
+                       // Assumption: To compile a GPU conv2d operator, 
following should fit on the GPU:
+                       // 1. output in dense format (i.e. 
computeOutputMemEstimate) 
+                       // 2. input in any format
+                       // 3. at least one input row in dense format
+                       // 4. filter in dense format
+                       
+                       // Account for potential sparse-to-dense conversion of 
at least 1 input row and filter
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "CHW"));
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
"K", "CRS"));
+                       
+                       // im2col operation preserves the worst-case sparsity 
of the input.
+                       cpIntermediates.add(new IntermediateDimensions(this, 
"CRS", "PQ", getInput().get(0).getSparsity()));
+               }
+               else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+                       // Assumption: To compile a GPU conv2d_backward_data 
operator, following should fit on the GPU:
+                       // 1. output in dense format (i.e. 
computeOutputMemEstimate) 
+                       // 2. dout in any format
+                       // 3. at least one dout row in dense format
+                       // 4. filter in dense format
+                       
+                       // Account for potential sparse-to-dense conversion of 
at least 1 input row and filter
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "KPQ"));
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
"K", "CRS"));
+                       
+                       // There are 2 intermediates: rotate180 and input to 
col2im for conv2d_backward_data
+                       // rotate180 preserves the "exact" sparsity of the dout 
matrix
+                       cpIntermediates.add(new IntermediateDimensions(this, 
"PQ", "K", getInput().get(1).getSparsity()));
+                       // Note: worst-case sparsity for the input of col2im 
(of size NPQ x CRS where N is determined by degree of parallelism)
+                       cpIntermediates.add(new IntermediateDimensions(this, 
"PQ", "CRS"));
+               }
+               else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+                       // Assumption: To compile a GPU conv2d_backward_filter 
operator, following should fit on the GPU:
+                       // 1. output in dense format (i.e. 
computeOutputMemEstimate) 
+                       // 2. dout in any format
+                       // 3. at least one dout and input row in dense format
+                       
+                       // Account for potential sparse-to-dense conversion of 
at least 1 input + dout row
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "CHW"));
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "KPQ"));
+                       
+                       // There are 2 intermediates: im2col and rotate180 for 
conv2d_backward_filter
+                       // rotate180 preserves the "exact" sparsity of the dout 
matrix
+                       cpIntermediates.add(new IntermediateDimensions(this, 
"PQ", "K", getInput().get(1).getSparsity()));
+                       // im2col operation preserves the worst-case sparsity 
of the input.
+                       cpIntermediates.add(new IntermediateDimensions(this, 
"CRS", "PQ", getInput().get(0).getSparsity()));
+               }
+               else if(getOp() == ConvOp.MAX_POOLING) {
+                       // Account for potential sparse-to-dense conversion of 
at least 1 input row
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "CHW"));
+               }
+               else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+                       // Account for potential sparse-to-dense conversion of 
at least 1 input + dout row
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "CHW"));
+                       gpuIntermediates.add(new IntermediateDimensions(this, 
1, "CPQ"));
+               }
+               
+               if(gpuIntermediates.size() > 0 || cpIntermediates.size() > 0)
+                       return 
computeIntermediateMemEstimateHelper(gpuIntermediates, cpIntermediates);
+               else
+                       return 0;
        }
        
+       
        @Override
        protected long[] inferOutputCharacteristics( MemoTable memo )
        {
@@ -243,65 +401,9 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        ret[2] = -1;
                        return (ret[0]>0 && ret[1]>0) ? ret : null;
                }
-       
-               ConvolutionParameters params;
-               try {
-                       params = parseInput();
-               } catch (DMLRuntimeException e) {
-                       throw new RuntimeException(e);
-               }
                
-               switch(op) 
-               {
-                       case MAX_POOLING: {
-                               // input
-                               long N = getInput().get(0)._dim1;
-                               ret[0] = N;
-                               ret[1] = getExtractedVal(params.C, params.P, 
params.Q);
-                               ret[2] = -1;
-                               break;
-                       }
-                       case DIRECT_CONV2D: {
-                               // input, filter
-                               long N = getInput().get(0)._dim1;
-                               ret[0] = N;
-                               ret[1] = getExtractedVal(params.K, params.P, 
params.Q);
-                               ret[2] = -1;
-                               break;
-                       }
-                       case DIRECT_CONV2D_BACKWARD_FILTER: {
-                               // input, dout  
-                               ret[0] = params.K;
-                               ret[1] = getExtractedVal(params.C, params.R, 
params.S);
-                               ret[2] = -1;
-                               break;
-                       }
-                       case MAX_POOLING_BACKWARD: {
-                               // input, dout
-                               ret[0] = getInput().get(0)._dim1;
-                               ret[1] = getInput().get(0)._dim2;
-                               ret[2] = -1;
-                               break;
-                       }
-                       case DIRECT_CONV2D_BACKWARD_DATA: {
-                               // filter, dout
-                               long N = getInput().get(1)._dim1;
-                               ret[0] = N;
-                               ret[1] = getExtractedVal(params.C, params.H, 
params.W);
-                               ret[2] = -1;
-                               break;
-                       }
-                       default:
-                               throw new RuntimeException("Unsupported op:" + 
op.name());
-               }
-               
-               if(LOG.isDebugEnabled() && (ret[0] <= 0 || ret[1] <= 0)) {
-                       LOG.debug("Unknown dimensions for ConvolutionOp in 
inferOutputCharacteristics:" + op.name() + " " + ret[0] + " " + ret[1] + 
-                                       " img_dim=[" + params.N + " " + 
params.C + " " + params.H + " " + params.W + "]" +
-                                       " filter_dim=[" + params.K + " " + 
params.C + " " + params.H + " " + params.W + "]" + 
-                                       " output_feature_map=[" + params.P + " 
" + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-                                       " pad=[" + params.pad_h + " " + 
params.pad_w + "]");
-               }
+               refreshSizeInformation();
+               ret[0] = _dim1; ret[1] = _dim2; ret[2] = _nnz;
                
                //safe return (create entry only if at least dims known)
                return (ret[0]>0 && ret[1]>0) ? ret : null;
@@ -347,50 +449,44 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                return _etype;
        }
        
+       // Caching parameters speeds up dynamic recompilation time by avoiding 
unnecessary computeSizeInformation
+       private ConvolutionParameters _cachedParams = new 
ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
_maxNumThreads);
        // stride1, stride2, padding1, padding2  
        // input_shape1, input_shape2, input_shape3, input_shape4, 
        // filter_shape1, filter_shape2, filter_shape3, filter_shape4
        ConvolutionParameters parseInput() throws DMLRuntimeException {
-               ConvolutionParameters params = null;
                if(op == ConvOp.MAX_POOLING_BACKWARD 
                                || op == ConvOp.DIRECT_CONV2D 
                                || op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER
                                || op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
-                       params = new ConvolutionParameters(
-                                       
computeSizeInformation(getInput().get(6)),
-                                       
computeSizeInformation(getInput().get(7)), 
-                                       
computeSizeInformation(getInput().get(8)), 
-                                       
computeSizeInformation(getInput().get(9)), 
-                                       
computeSizeInformation(getInput().get(10)), 
-                                       
computeSizeInformation(getInput().get(12)), 
-                                       
computeSizeInformation(getInput().get(13)), 
-                                       
computeSizeInformation(getInput().get(2)), 
-                                       
computeSizeInformation(getInput().get(3)), 
-                                       
computeSizeInformation(getInput().get(4)), 
-                                       
computeSizeInformation(getInput().get(5)), _maxNumThreads);
+                       _cachedParams.setIfUnknown(
+                                       getInput().get(6),
+                                       getInput().get(7), 
+                                       getInput().get(8), 
+                                       getInput().get(9), 
+                                       getInput().get(10), 
+                                       getInput().get(12), 
+                                       getInput().get(13), 
+                                       getInput().get(2), 
+                                       getInput().get(3), 
+                                       getInput().get(4), 
+                                       getInput().get(5), _maxNumThreads);
                }
                else {
-                       params = new ConvolutionParameters(
-                                       
computeSizeInformation(getInput().get(5)),
-                                       
computeSizeInformation(getInput().get(6)), 
-                                       
computeSizeInformation(getInput().get(7)), 
-                                       
computeSizeInformation(getInput().get(8)), 
-                                       
computeSizeInformation(getInput().get(9)), 
-                                       
computeSizeInformation(getInput().get(11)), 
-                                       
computeSizeInformation(getInput().get(12)), 
-                                       
computeSizeInformation(getInput().get(1)), 
-                                       
computeSizeInformation(getInput().get(2)), 
-                                       
computeSizeInformation(getInput().get(3)), 
-                                       
computeSizeInformation(getInput().get(4)), _maxNumThreads);
-               }
-               return params;
-       }
-
-       public static long getExtractedVal(long val1, long val2, long val3) {
-               if(val1 == -1 || val2 == -1 || val3 == -1) {
-                       return -1;
+                       _cachedParams.setIfUnknown(
+                                       getInput().get(5),
+                                       getInput().get(6), 
+                                       getInput().get(7), 
+                                       getInput().get(8), 
+                                       getInput().get(9), 
+                                       getInput().get(11), 
+                                       getInput().get(12), 
+                                       getInput().get(1), 
+                                       getInput().get(2), 
+                                       getInput().get(3), 
+                                       getInput().get(4), _maxNumThreads);
                }
-               return val1*val2*val3;
+               return _cachedParams;
        }
        
        @Override
@@ -400,72 +496,50 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        Hop input1 = getInput().get(0);
                        setDim1(input1.getDim1());
                        setDim2(input1.getDim2());
+                       _nnz = -1; // cannot infer stats
                        return;
                }
                
-               ConvolutionParameters params;
-               try {
-                       params = parseInput();
-               } catch (DMLRuntimeException e) {
-                       throw new RuntimeException(e);
-               }
-               
                switch(op) 
                {
                        case MAX_POOLING:
                        {       
-                               // input
-                               long N = getInput().get(0)._dim1;
-                               _dim1 = N;
-                               _dim2 = getExtractedVal(params.C, params.P, 
params.Q);
+                               _dim1 = getDim("N");
+                               _dim2 = getDim("CPQ");
                                _nnz = -1; // cannot infer stats
                                break;
                        }
                        case MAX_POOLING_BACKWARD:
                        {
-                               // input, dout
-                               _dim1 = getInput().get(0)._dim1;
-                               _dim2 = getInput().get(0)._dim2;
+                               _dim1 = getDim("N");
+                               _dim2 = getDim("CHW");
                                _nnz = -1;
                                break;
                        }
                        case DIRECT_CONV2D:
                        {
-                               // input, filter
-                               long N = getInput().get(0)._dim1;
-                               _dim1 = N;
-                               _dim2 = getExtractedVal(params.K, params.P, 
params.Q);
+                               _dim1 = getDim("N");
+                               _dim2 = getDim("KPQ");
                                _nnz = -1; // cannot infer stats
                                break;
                        }
                        case DIRECT_CONV2D_BACKWARD_DATA:
                        {
-                               // filter, dout
-                               long N = getInput().get(1)._dim1;
-                               _dim1 = N;
-                               _dim2 = getExtractedVal(params.C, params.H, 
params.W);
+                               _dim1 = getDim("N");
+                               _dim2 = getDim("CHW");
                                _nnz = -1; // cannot infer stats
                                break;
                        }
                        case DIRECT_CONV2D_BACKWARD_FILTER:
                        {
-                               // input, dout  
-                               _dim1 = params.K;
-                               _dim2 = getExtractedVal(params.C, params.R, 
params.S);
+                               _dim1 = getDim("K");
+                               _dim2 = getDim("CRS");
                                _nnz = -1; // cannot infer stats
                                break;
                        }
                        default:
                                throw new RuntimeException("The sizes are not 
refreshed for " + op.name());
                }
-               
-               if(LOG.isDebugEnabled() && (_dim1 <= 0 || _dim2 <= 0)) {
-                       LOG.debug("Unknown dimensions for ConvolutionOp in 
refreshSizeInformation:" + op.name() + " " + _dim1 + " " + _dim2 + 
-                                       " img_dim=[" + params.N + " " + 
params.C + " " + params.H + " " + params.W + "]" +
-                                       " filter_dim=[" + params.K + " " + 
params.C + " " + params.H + " " + params.W + "]" + 
-                                       " output_feature_map=[" + params.P + " 
" + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-                                       " pad=[" + params.pad_h + " " + 
params.pad_w + "]");
-               }
        }
        
        @Override
@@ -511,4 +585,132 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
        public int getMaxNumThreads() {
                return _maxNumThreads;
        }
+       
+       
+       // 
------------------------------------------------------------------------------------------------------
+       // Utility methods to get the dimensions taking into account unknown 
dimensions
+       
+       /**
+        * Convenience method to get the dimensions required by ConvolutionOp.
+        * 
+        * @param dimString can be K, CRS, N, CHW, KPQ, PQ
+        * @return either -1 or value associated with the dimString
+        */
+       private long getDim(String dimString) {
+               if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
+                       throw new RuntimeException("getDim method should not be 
invoked for bias_add and bias_multiply");
+               }
+               ConvolutionParameters params;
+               try {
+                       params = parseInput();
+               } catch (DMLRuntimeException e) {
+                       throw new RuntimeException(e);
+               }
+               Hop filter = null;      // shape: K x CRS 
+               Hop input = null;       // shape: N x CHW
+               Hop dout = null;        // shape: N x KPQ
+               Hop dout1 = null;       // shape: N x CPQ
+               
+               if(getOp() == ConvOp.DIRECT_CONV2D) {
+                       input  = getInput().get(0);
+                       filter = getInput().get(1);
+               }
+               else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+                       filter = getInput().get(0);
+                       dout  = getInput().get(1);
+               }
+               else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+                       input = getInput().get(0);
+                       dout  = getInput().get(1);
+               }
+               else if(getOp() == ConvOp.MAX_POOLING) {
+                       input = getInput().get(0);
+               }
+               else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+                       input = getInput().get(0);
+                       dout1  = getInput().get(1);
+               }
+               
+               long ret = -1;
+               if(dimString.equals("K") && filter != null) {
+                       ret = getNonNegative(ret, getNonNegative(params.K, 
filter._dim1));
+               }
+               else if(dimString.equals("CRS") && filter != null) {
+                       ret = getNonNegative(ret, 
getNonNegative(nonNegativeMultiply(params.C, params.R, params.S), 
filter._dim2));
+               }
+               else if(dimString.equals("N") && input != null) {
+                       ret = getNonNegative(ret, getNonNegative(params.N, 
input._dim1));
+               }
+               else if(dimString.equals("CHW") && input != null) {
+                       ret = getNonNegative(ret, 
getNonNegative(nonNegativeMultiply(params.C, params.H, params.W), input._dim2));
+               }
+               else if(dimString.equals("N") && dout != null) {
+                       ret = getNonNegative(ret, getNonNegative(params.N, 
dout._dim1));
+               }
+               else if(dimString.equals("KPQ") && dout != null) {
+                       ret = getNonNegative(ret, 
getNonNegative(nonNegativeMultiply(params.K, params.P, params.Q), dout._dim2));
+               }
+               else if(dimString.equals("N") && dout1 != null) {
+                       ret = getNonNegative(ret, getNonNegative(params.N, 
dout1._dim1));
+               }
+               else if(dimString.equals("CPQ") && dout1 != null) {
+                       ret = getNonNegative(ret, 
getNonNegative(nonNegativeMultiply(params.C, params.P, params.Q), dout1._dim2));
+               }
+               else if(dimString.equals("K")) {
+                       ret = getNonNegative(ret, params.K >= 0 ? params.K : 
-1);
+               }
+               else if(dimString.equals("CRS")) {
+                       ret = getNonNegative(ret, nonNegativeMultiply(params.C, 
params.R, params.S));
+               }
+               else if(dimString.equals("N")) {
+                       ret = getNonNegative(ret, params.N >= 0 ? params.N : 
-1);
+               }
+               else if(dimString.equals("CHW")) {
+                       ret = getNonNegative(ret, nonNegativeMultiply(params.C, 
params.H, params.W));
+               }
+               else if(dimString.equals("KPQ")) {
+                       ret = getNonNegative(ret, nonNegativeMultiply(params.K, 
params.P, params.Q));
+               }
+               else if(dimString.equals("PQ")) {
+                       ret = getNonNegative(ret, nonNegativeMultiply(params.P, 
params.Q));
+               }
+               else if(dimString.equals("CPQ")) {
+                       ret = getNonNegative(ret, nonNegativeMultiply(params.C, 
params.P, params.Q));
+               }
+               else {
+                       throw new RuntimeException("Unsupported dimension:" + 
dimString + " for operator " + getOp().name());
+               }
+               
+               if(LOG.isDebugEnabled() && ret < 0) {
+                       LOG.debug("Unknown dimension " + dimString + " for 
ConvolutionOp:" + op.name() + 
+                                       " img_dim=[" + params.N + " " + 
params.C + " " + params.H + " " + params.W + "]" +
+                                       " filter_dim=[" + params.K + " " + 
params.C + " " + params.H + " " + params.W + "]" + 
+                                       " output_feature_map=[" + params.P + " 
" + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
+                                       " pad=[" + params.pad_h + " " + 
params.pad_w + "]");
+               }
+               return ret;
+       }
+       
+       private long nonNegativeMultiply(long val1, long val2, long val3) {
+               if(val1 >= 0 && val2 >= 0 && val3 >= 0) {
+                       return val1 * val2 * val3;
+               }
+               else return -1;
+       }
+       private long nonNegativeMultiply(long val1, long val2) {
+               if(val1 >= 0 && val2 >= 0) {
+                       return val1 * val2;
+               }
+               else return -1;
+       }
+       private long getNonNegative(long val1, long val2) {
+               if(val1 >= 0 && val2 >= 0) {
+                       if(val1 == val2) return val1;
+                       else throw new RuntimeException("Incorrect dimensions 
in Convolution Hop: " + val1 + " != " + val2);
+               }
+               else if(val1 >= 0) return val1;
+               else if(val2 >= 0) return val2;
+               else return -1;
+       }
+       // 
------------------------------------------------------------------------------------------------------
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java 
b/src/main/java/org/apache/sysml/hops/Hop.java
index eeaa5f1..b454771 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -708,31 +708,8 @@ public abstract class Hop implements ParseInfo
                _validCPSizeEstimate = (wstats!=null) ? 
OptimizerUtils.isValidCPMatrixSize(
                        wstats[0], wstats[1], 
OptimizerUtils.getSparsity(wstats[0], wstats[1], wstats[2])) : false;
        }
-
        
        /**
-        * Computes the hop-specific output memory estimate in bytes. Should be 
0 if not
-        * applicable. 
-        * 
-        * @param dim1 dimension 1
-        * @param dim2 dimension 2
-        * @param nnz number of non-zeros
-        * @return memory estimate
-        */
-       protected abstract double computeOutputMemEstimate( long dim1, long 
dim2, long nnz );
-
-       /**
-        * Computes the hop-specific intermediate memory estimate in bytes. 
Should be 0 if not
-        * applicable.
-        * 
-        * @param dim1 dimension 1
-        * @param dim2 dimension 2
-        * @param nnz number of non-zeros
-        * @return memory estimate
-        */
-       protected abstract double computeIntermediateMemEstimate( long dim1, 
long dim2, long nnz );
-
-       /**
         * Computes the output matrix characteristics (rows, cols, nnz) based 
on worst-case output
         * and/or input estimates. Should return null if dimensions are unknown.
         * 
@@ -849,6 +826,21 @@ public abstract class Hop implements ParseInfo
        
        public abstract String getOpString();
 
+       // 
========================================================================================
+       // Design doc: Memory estimation of GPU
+       // 1. Since not all operators are supported on GPU, isGPUEnabled 
indicates whether an operation 
+       // is enabled for GPU. This method does not take into account any memory 
estimates.
+       // 2. To simplify memory estimation logic, the methods 
computeOutputMemEstimate and computeIntermediateMemEstimate
+       // should return maximum of memory required for GPU and CP operators. 
+       // 3. Additionally, these methods are guarded so that when the -gpu flag is 
not provided, additional memory overhead due to GPU
+       // is ignored. For example: sparse-to-dense conversion on GPU. 
+       // 4. (WIP) Every GPU operator should respect the memory returned by 
computeIntermediateMemEstimate (and computeOutputMemEstimate - see below point).
+       // 5. (WIP) Every GPU operator should create output in the same format 
as the corresponding CP operator. That is, computeOutputMemEstimate
+       // is consistent across both CP and GPU in terms of worst-case.
+       // 6. The drawbacks of using maximum memory (mem = Math.max(mem_cpu, 
mem_gpu)) are:
+       // - GPU operator is not selected when mem_gpu < total memory available 
on GPU < mem
+       // - CP operator is not selected (i.e. distributed operator compiled) 
when mem_cpu < driver memory budget < mem
+       
        /**
         * In memory-based optimizer mode (see 
OptimizerUtils.isMemoryBasedOptLevel()), 
         * the exectype is determined by checking this method as well as memory 
budget of this Hop. 
@@ -861,6 +853,31 @@ public abstract class Hop implements ParseInfo
         */
        public abstract boolean isGPUEnabled();
        
+       /**
+        * Computes the hop-specific output memory estimate in bytes. Should be 
0 if not
+        * applicable. 
+        * 
+        * @param dim1 dimension 1
+        * @param dim2 dimension 2
+        * @param nnz number of non-zeros
+        * @return memory estimate
+        */
+       protected abstract double computeOutputMemEstimate( long dim1, long 
dim2, long nnz );
+
+       /**
+        * Computes the hop-specific intermediate memory estimate in bytes. 
Should be 0 if not
+        * applicable.
+        * 
+        * @param dim1 dimension 1
+        * @param dim2 dimension 2
+        * @param nnz number of non-zeros
+        * @return memory estimate
+        */
+       protected abstract double computeIntermediateMemEstimate( long dim1, 
long dim2, long nnz );
+       
+       // 
========================================================================================
+
+       
        protected boolean isVector() {
                return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) );
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java 
b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index 8784956..121112b 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -37,6 +37,7 @@ public class ConvolutionTransform extends Lop
        
        private OperationTypes operation = null;
        private int numThreads = -1;
+       private double intermediateMemBudget = 0;
        
        /**
         * Constructor when we have one input.
@@ -47,12 +48,14 @@ public class ConvolutionTransform extends Lop
         * @param vt value type
         * @param et execution type
         * @param k number of threads
+        * @param intermediateMemBudget intermediate memory budget
         */
-       public ConvolutionTransform(Lop input, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k) 
+       public ConvolutionTransform(Lop input, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k, double intermediateMemBudget) 
        {
                super(Lop.Type.Transform, dt, vt);              
                init(input, op, dt, vt, et);
                numThreads = k;
+               this.intermediateMemBudget = intermediateMemBudget;
        }
        
        public ConvolutionTransform(Lop input1, Lop input2, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k) 
@@ -165,6 +168,9 @@ public class ConvolutionTransform extends Lop
                                sb.append( OPERAND_DELIMITOR );
                                sb.append( numThreads );
                        }
+                       
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( intermediateMemBudget );
                        return sb.toString();
                }
                else {
@@ -210,6 +216,9 @@ public class ConvolutionTransform extends Lop
                        sb.append( OPERAND_DELIMITOR );
                        sb.append( numThreads );
                }
+               
+               sb.append( OPERAND_DELIMITOR );
+               sb.append( intermediateMemBudget );
        }
 
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 629b688..e91029e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -22,6 +22,10 @@ package org.apache.sysml.runtime.instructions.cp;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
@@ -41,24 +45,25 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
        private ArrayList<CPOperand> _filter_shape;
        private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
        private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-       private int _numThreads = -1;
-
-       private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode, String istr,
-                       int numThreads) throws DMLRuntimeException {
-               super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out, opcode, istr);
-               if (!(opcode.equals("bias_add") || 
opcode.equals("relu_backward") || opcode.equals("bias_multiply"))) {
-                       throw new DMLRuntimeException(
-                                       "Incorrect usage. Expected the opcode 
to be bias_add or bias_multiply or relu_backward, but found "
-                                                       + opcode);
+       private int _numThreads = -1;   private double 
_intermediateMemoryBudget = 0;
+       private static final Log LOG = 
LogFactory.getLog(ConvolutionCPInstruction.class.getName());
+       private static boolean warnedUnderUtilitization = false;
+       
+       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode, String istr, int numThreads, double 
intermediateMemoryBudget) throws DMLRuntimeException {
+               super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out,
+                               opcode, istr);
+               if( !(opcode.equals("bias_add") || 
opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) {
+                       throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be bias_add or bias_multiply or relu_backward, but found 
" + opcode);
                }
                _in2 = in2;
                _cptype = CPINSTRUCTION_TYPE.Convolution;
                _numThreads = numThreads;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
        }
 
        private ConvolutionCPInstruction(CPOperand in, CPOperand out, String 
opcode, String istr,
                        ArrayList<CPOperand> stride, ArrayList<CPOperand> 
padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads) {
+                       ArrayList<CPOperand> filter_shape, int numThreads, 
double intermediateMemoryBudget) {
                super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out, opcode, istr);
                _cptype = CPINSTRUCTION_TYPE.Convolution;
                _stride = stride;
@@ -66,12 +71,15 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                _input_shape = input_shape;
                _filter_shape = filter_shape;
                _numThreads = numThreads;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
        }
-
-       private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode, String istr,
-                       ArrayList<CPOperand> stride, ArrayList<CPOperand> 
padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads) {
-               super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out, opcode, istr);
+       
+       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
out, String opcode,
+                       String istr, ArrayList<CPOperand> stride,
+                       ArrayList<CPOperand> padding, ArrayList<CPOperand> 
input_shape,
+                       ArrayList<CPOperand> filter_shape, int numThreads, 
double intermediateMemoryBudget) {
+               super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out,
+                               opcode, istr);
                _in2 = in2;
                _cptype = CPINSTRUCTION_TYPE.Convolution;
                _stride = stride;
@@ -79,12 +87,15 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                _input_shape = input_shape;
                _filter_shape = filter_shape;
                _numThreads = numThreads;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
        }
-
-       private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
in3, CPOperand out, String opcode,
-                       String istr, ArrayList<CPOperand> stride, 
ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads) {
-               super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out, opcode, istr);
+       
+       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand 
in3, CPOperand out, String opcode,
+                       String istr, ArrayList<CPOperand> stride,
+                       ArrayList<CPOperand> padding, ArrayList<CPOperand> 
input_shape,
+                       ArrayList<CPOperand> filter_shape, int numThreads, 
double intermediateMemoryBudget) {
+               super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, 
out,
+                               opcode, istr);
                _in2 = in2;
                _in3 = in3;
                _cptype = CPINSTRUCTION_TYPE.Convolution;
@@ -93,6 +104,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                _input_shape = input_shape;
                _filter_shape = filter_shape;
                _numThreads = numThreads;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
        }
 
        public static ConvolutionCPInstruction parseInstruction(String str)
@@ -101,7 +113,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                String[] parts = 
InstructionUtils.getInstructionPartsWithValueType(str);
                String opcode = parts[0];
                if (opcode.equalsIgnoreCase("maxpooling") || 
opcode.equalsIgnoreCase("relu_maxpooling")) {
-                       InstructionUtils.checkNumFields(parts, 15);
+                       InstructionUtils.checkNumFields(parts, 16);
                        // stride1, stride2, padding1, padding2
                        // input_shape1, input_shape2, input_shape3, 
input_shape4,
                        // filter_shape1, filter_shape2, filter_shape3, 
filter_shape4, k
@@ -127,13 +139,13 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        int k = Integer.parseInt(parts[15]);
 
                        return new ConvolutionCPInstruction(in, out, opcode, 
str, stride,
-                                       padding, input_shape, filter_shape, k);
+                                       padding, input_shape, filter_shape, k, 
Double.parseDouble(parts[16]));
                } 
                else if (opcode.equalsIgnoreCase("maxpooling_backward") || 
opcode.equalsIgnoreCase("relu_maxpooling_backward")
                                || opcode.equalsIgnoreCase("conv2d")
                                || 
opcode.equalsIgnoreCase("conv2d_backward_filter")
                                || 
opcode.equalsIgnoreCase("conv2d_backward_data")) {
-                       InstructionUtils.checkNumFields(parts, 16);
+                       InstructionUtils.checkNumFields(parts, 17);
                        // dout, stride1, stride2, padding1, padding2
                        // input_shape1, input_shape2, input_shape3, 
input_shape4,
                        // filter_shape1, filter_shape2, filter_shape3, 
filter_shape4, k
@@ -160,10 +172,10 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        int k = Integer.parseInt(parts[16]);
 
                        return new ConvolutionCPInstruction(in, in2, out, 
opcode, str, stride,
-                                       padding, input_shape, filter_shape, k);
+                                       padding, input_shape, filter_shape, k, 
Double.parseDouble(parts[17]));
                }
                else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-                       InstructionUtils.checkNumFields(parts, 17);
+                       InstructionUtils.checkNumFields(parts, 18);
                        // dout, stride1, stride2, padding1, padding2
                        // input_shape1, input_shape2, input_shape3, 
input_shape4,
                        // filter_shape1, filter_shape2, filter_shape3, 
filter_shape4, k
@@ -191,15 +203,15 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        int k = Integer.parseInt(parts[17]);
 
                        return new ConvolutionCPInstruction(in, in2, in3, out, 
opcode, str, stride,
-                                       padding, input_shape, filter_shape, k);
+                                       padding, input_shape, filter_shape, k, 
Double.parseDouble(parts[18]));
                }
                else if (opcode.equalsIgnoreCase("bias_add") || 
opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
-                       InstructionUtils.checkNumFields(parts, 4);
+                       InstructionUtils.checkNumFields(parts, 5);
                        CPOperand in = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
                        CPOperand out = new CPOperand(parts[3]);
                        int k = Integer.parseInt(parts[4]);
-                       return new ConvolutionCPInstruction(in, in2, out, 
opcode, str, k);
+                       return new ConvolutionCPInstruction(in, in2, out, 
opcode, str, k, Double.parseDouble(parts[5]));
                }
                else {
                        throw new DMLRuntimeException("Unknown opcode while 
parsing a ConvolutionCPInstruction: " + str);
@@ -363,6 +375,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        ec.releaseMatrixInput(_in2.getName(), 
getExtendedOpcode());
                }
                else if (instOpcode.equalsIgnoreCase("conv2d")) {
+                       resetNumThreads(params, C*R*S, P*Q, 
matBlock.getNonZeros() / (matBlock.getNumRows()*matBlock.getNumColumns()));
                        MatrixBlock filter = ec.getMatrixInput(_in2.getName(), 
getExtendedOpcode());
                        if(filter.isEmpty() || matBlock.isEmpty()) {
                                outputBlock = new MatrixBlock(N, K*P*Q, true);
@@ -377,6 +390,7 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                        ec.releaseMatrixInput(_in2.getName(), 
getExtendedOpcode());
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
+                       resetNumThreads(params, C*R*S, P*Q, 
matBlock.getNonZeros() / (matBlock.getNumRows()*matBlock.getNumColumns()));
                        MatrixBlock filter = ec.getMatrixInput(_in3.getName(), 
getExtendedOpcode());
                        MatrixBlock bias = ec.getMatrixInput(_in2.getName(), 
getExtendedOpcode());
                        if(bias.getNumRows() != params.K || 
bias.getNumColumns() != 1) {
@@ -446,6 +460,27 @@ public class ConvolutionCPInstruction extends 
UnaryCPInstruction {
                ec.setMatrixOutput(getOutputVariableName(), outputBlock, 
getExtendedOpcode());
        }
        
+       /**
+        * Reset the number of thread to respect the intermediate CP memory 
budget
+        * 
+        * @param params convolution parameters
+        * @param numRows number of rows of intermediate matrix used per thread
+        * @param numCols number of columns of intermediate matrix used per thread
+        * @param sparsity sparsity of intermediate matrix used per thread
+        */
+       private void resetNumThreads(ConvolutionParameters params, int numRows, 
int numCols, double sparsity) {
+               if(DMLScript.USE_ACCELERATOR) {
+                       double memBudget1Thread = 
OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity);
+                       int limitedDegreeOfParallelism = (int) 
Math.floor(_intermediateMemoryBudget / memBudget1Thread);
+                       if(params.numThreads > limitedDegreeOfParallelism) {
+                               params.numThreads = limitedDegreeOfParallelism;
+                               if(!warnedUnderUtilitization)
+                                       LOG.warn("CPU Under-utilization to 
respect the intermediate memory budget. To avoid this, please try reducing the 
mini-batch or forcing gpu execution.");
+                               warnedUnderUtilitization = true;
+                       }
+               }
+       }
+       
        private MatrixBlock getDenseOutputBlock(int numRows, int numCols) 
throws DMLRuntimeException {
                MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, 
false);
                outputBlock.allocateDenseBlock();

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 5b37576..b25f787 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.GPUStatistics;
@@ -40,9 +41,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction 
{
        private ArrayList<CPOperand> _filter_shape;
        private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
        private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-
-       private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand out, String opcode, String istr)
-                       throws DMLRuntimeException {
+       private double _intermediateMemoryBudget = 0;
+       
+       public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand out, String opcode, String istr, double intermediateMemoryBudget) 
throws DMLRuntimeException {
                super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), 
opcode, istr);
                if (!(opcode.equals("bias_add") || 
opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) {
                        throw new DMLRuntimeException(
@@ -53,18 +54,23 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                _input2 = in2;
                _gputype = GPUINSTRUCTION_TYPE.Convolution;
                _output = out;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
        }
-
-       private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand in3, CPOperand out, String opcode,
-                       String istr, ArrayList<CPOperand> stride, 
ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape) {
-               this(in1, in2, out, opcode, istr, stride, padding, input_shape, 
filter_shape);
+       
+       public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand in3, CPOperand out, String opcode,
+                       String istr, ArrayList<CPOperand> stride,
+                       ArrayList<CPOperand> padding, ArrayList<CPOperand> 
input_shape,
+                       ArrayList<CPOperand> filter_shape, double 
intermediateMemoryBudget) 
+       {
+               this(in1, in2, out, opcode, istr, stride, padding,  
input_shape, filter_shape, intermediateMemoryBudget);
                _input3 = in3;
        }
-
-       private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand out, String opcode, String istr,
-                       ArrayList<CPOperand> stride, ArrayList<CPOperand> 
padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape) {
+       
+       public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, 
CPOperand out, String opcode,
+                       String istr, ArrayList<CPOperand> stride,
+                       ArrayList<CPOperand> padding, ArrayList<CPOperand> 
input_shape,
+                       ArrayList<CPOperand> filter_shape, double 
intermediateMemoryBudget) 
+       {
                super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), 
opcode, istr);
                _gputype = GPUINSTRUCTION_TYPE.Convolution;
 
@@ -75,6 +81,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction 
{
                _padding = padding;
                _input_shape = input_shape;
                _filter_shape = filter_shape;
+               _intermediateMemoryBudget = intermediateMemoryBudget;
        }
 
        public static ConvolutionGPUInstruction parseInstruction(String str)
@@ -87,7 +94,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction 
{
                         || opcode.equalsIgnoreCase("conv2d_backward_filter")
                         || opcode.equalsIgnoreCase("conv2d_backward_data")
                         || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
-                       InstructionUtils.checkNumFields(parts, 15);
+                       InstructionUtils.checkNumFields(parts, 16);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
                        CPOperand out = new CPOperand(parts[15]);
@@ -110,10 +117,10 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        filter_shape.add(new CPOperand(parts[14]));
 
                        return new ConvolutionGPUInstruction(in1, in2, out, 
opcode, str, stride,
-                                       padding, input_shape, filter_shape);
+                                       padding, input_shape, filter_shape, 
Double.parseDouble(parts[16]));
                }
                else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-                       InstructionUtils.checkNumFields(parts, 16);
+                       InstructionUtils.checkNumFields(parts, 17);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
                        CPOperand in3 = new CPOperand(parts[3]);
@@ -137,10 +144,10 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        filter_shape.add(new CPOperand(parts[15]));
 
                        return new ConvolutionGPUInstruction(in1, in2, in3, 
out, opcode, str, stride,
-                                       padding, input_shape, filter_shape);
+                                       padding, input_shape, filter_shape, 
Double.parseDouble(parts[17]));
                }
                else if (opcode.equalsIgnoreCase("maxpooling")) {
-                       InstructionUtils.checkNumFields(parts, 14);
+                       InstructionUtils.checkNumFields(parts, 15);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand out = new CPOperand(parts[14]);
                
@@ -162,14 +169,14 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        filter_shape.add(new CPOperand(parts[13]));
 
                        return new ConvolutionGPUInstruction(in1, null, out, 
opcode, str, stride,
-                                       padding, input_shape, filter_shape);
+                                       padding, input_shape, filter_shape, 
Double.parseDouble(parts[15]));
                }
                else if( opcode.equalsIgnoreCase("bias_add") || 
opcode.equalsIgnoreCase("relu_backward") || 
opcode.equalsIgnoreCase("bias_multiply")  ) {
-                       InstructionUtils.checkNumFields(parts, 3);
+                       InstructionUtils.checkNumFields(parts, 4);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand in2 = new CPOperand(parts[2]);
                        CPOperand out = new CPOperand(parts[3]);
-                       return new ConvolutionGPUInstruction(in1, in2, out, 
opcode, str);
+                       return new ConvolutionGPUInstruction(in1, in2, out, 
opcode, str, Double.parseDouble(parts[4]));
                }
                else {
                        throw new DMLRuntimeException("Unknown opcode while 
parsing a ConvolutionGPUInstruction: " + str);      
@@ -251,8 +258,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
                        
-                       LibMatrixCUDA.conv2d(ec.getGPUContext(0), 
getExtendedOpcode(), image, filter, out, N, C, H, W,
-                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+                       LibMatrixCuDNN.conv2d(ec.getGPUContext(0), 
getExtendedOpcode(), image, filter, out, N, C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q, _intermediateMemoryBudget);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
                        MatrixObject image = 
getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -266,8 +273,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
                        
-                       LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), 
getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
-                                               K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q);
+                       LibMatrixCuDNN.conv2dBiasAdd(ec.getGPUContext(0), 
getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+                                               K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q, _intermediateMemoryBudget);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) 
{
                        MatrixObject image = 
getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -281,8 +288,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), K, C * R * S);
                        
-                       LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
-                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+                       
LibMatrixCuDNN.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), 
image, dout, out, N, C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q, _intermediateMemoryBudget);
                        // TODO: For now always copy the device data to host
                        // ec.gpuCtx.copyDeviceToHost(outputBlock);
                }
@@ -298,8 +305,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
                        
-                       LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), 
getExtendedOpcode(), filter, dout, out, N, C, H, W,
-                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+                       LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), 
getExtendedOpcode(), filter, dout, out, N, C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q, _intermediateMemoryBudget);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling")) {
                        MatrixObject image = 
getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -311,8 +318,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
                        
                        if(instOpcode.equalsIgnoreCase("maxpooling"))
-                               LibMatrixCUDA.maxpooling(ec.getGPUContext(0), 
getExtendedOpcode(), image, out, N, C, H, W,
-                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+                               LibMatrixCuDNN.maxpooling(ec.getGPUContext(0), 
getExtendedOpcode(), image, out, N, C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q, _intermediateMemoryBudget);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
                        MatrixObject image = 
getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -326,8 +333,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                        
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
                        
-                       LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
-                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+                       LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q, _intermediateMemoryBudget);
                }
                else {
                        throw new DMLRuntimeException("Unsupported GPU context 
for " + instOpcode);
@@ -345,6 +352,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction {
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 
+
        private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> 
aL, int index) 
                throws DMLRuntimeException 
        {

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index af27dc6..5096566 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -24,6 +24,7 @@ import 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
 
@@ -44,7 +45,7 @@ public class MatrixBuiltinGPUInstruction extends 
BuiltinUnaryGPUInstruction {
 
                switch(opcode) {
                        case "sel+":
-                               LibMatrixCUDA.relu(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat, _output.getName()); break;
+                               LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat, _output.getName()); break;
                        case "exp":
                                LibMatrixCUDA.exp(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat, _output.getName()); break;
                        case "sqrt":

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index c6b82c4..197daaf 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -49,6 +49,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -151,6 +152,11 @@ public class GPUContext {
                LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, 
Available: " + (free[0] * (1e-6)) + " MB on "
                                + this);
 
+               if(GPUContextPool.initialGPUMemBudget() > 
OptimizerUtils.getLocalMemBudget()) {
+                       LOG.warn("Potential under-utilization: GPU memory (" + 
GPUContextPool.initialGPUMemBudget() 
+                                       + ") > driver memory budget (" + 
OptimizerUtils.getLocalMemBudget() + "). "
+                                       + "Consider increasing the driver 
memory budget.");
+               }
        }
 
        private void initializeCudaLibraryHandles() throws DMLRuntimeException {

http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 78b6e3b..6d06ee5 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
 
 import java.io.Serializable;
 
+import org.apache.sysml.hops.Hop;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
@@ -34,7 +35,9 @@ public class ConvolutionParameters implements Serializable {
        public int K; public int R; public int S; public int stride_h; public 
int stride_w; public int pad_h; public int pad_w;
        public int P; public int Q; public int numThreads;
        
+       // Optional variables used by ConvolutionCPInstruction
        public boolean enableNative = false;
+       
        public MatrixBlock input1; public MatrixBlock input2; public 
MatrixBlock output;
        
        public MatrixBlock bias;
@@ -62,6 +65,28 @@ public class ConvolutionParameters implements Serializable {
                                "], pad=[" + pad_h + "," + pad_w + "])";  
        }
        
+       public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
+                       Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop 
pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
+               if(this.N < 0) this.N = 
convertToInt(Hop.computeSizeInformation(N));
+               if(this.C < 0) this.C = 
convertToInt(Hop.computeSizeInformation(C));
+               if(this.H < 0) this.H = 
convertToInt(Hop.computeSizeInformation(H));
+               if(this.W < 0) this.W = 
convertToInt(Hop.computeSizeInformation(W));
+               if(this.K < 0) this.K = 
convertToInt(Hop.computeSizeInformation(K));
+               if(this.R < 0) this.R = 
convertToInt(Hop.computeSizeInformation(R));
+               if(this.S < 0) this.S = 
convertToInt(Hop.computeSizeInformation(S));
+               if(this.stride_h < 0) this.stride_h = 
convertToInt(Hop.computeSizeInformation(stride_h));
+               if(this.stride_w < 0) this.stride_w = 
convertToInt(Hop.computeSizeInformation(stride_w));
+               if(this.pad_h < 0) this.pad_h = 
convertToInt(Hop.computeSizeInformation(pad_h));
+               if(this.pad_w < 0) this.pad_w = 
convertToInt(Hop.computeSizeInformation(pad_w));
+               if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 
0 && this.pad_h >= 0) {
+                       this.P = (int) ConvolutionUtils.getP(this.H, this.R, 
this.stride_h, this.pad_h);
+               }
+               if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 
0 && this.pad_w >= 0) {
+                       this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, 
this.stride_w, this.pad_w);
+               }
+               this.numThreads = numThreads;
+       }
+       
        public ConvolutionParameters(long N, long C, long H, long W,
                        long K, long R, long S, long stride_h, long stride_w, 
long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
                this.N = convertToInt(N);

Reply via email to