Repository: systemml
Updated Branches:
  refs/heads/master 3ca053535 -> 8df0697e0


[MINOR] [SYSTEMML-445] GPU bugfix for metadata checking

This commit contains two fixes:
- The first fix ensures that the methods allocating the output matrix (e.g.,
getSparseMatrixOutputForGPUInstruction,
getDenseMatrixOutputForGPUInstruction) use the correct dimensions by forcing
the calling methods to specify the expected dimensions. If the expected
dimensions don't match the ones in the symbol table, a
DMLRuntimeException is thrown.
- The second fix ensures that long-to-int conversions in LibMatrixCUDA are 
guarded.

Closes #629.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/8df0697e
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/8df0697e
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/8df0697e

Branch: refs/heads/master
Commit: 8df0697e08c02f409da717920a1944c67c54c6e5
Parents: 3ca0535
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Wed Aug 23 09:51:54 2017 -0800
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Wed Aug 23 10:51:54 2017 -0700

----------------------------------------------------------------------
 .../context/ExecutionContext.java               |  62 +++-
 .../gpu/ConvolutionGPUInstruction.java          |  32 +-
 .../instructions/gpu/GPUInstruction.java        |   6 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 299 +++++++++++--------
 4 files changed, 244 insertions(+), 155 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/8df0697e/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 8d27e3b..0bd73d1 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -284,55 +284,97 @@ public class ExecutionContext {
                                
((MatrixFormatMetaData)oldMetaData).getOutputInfo(),
                                
((MatrixFormatMetaData)oldMetaData).getInputInfo()));
        }
+       
+       /**
+        * Compares two potential dimensions d1 and d2 and returns the one which 
is not -1.
+        * This method is useful when the dimensions are not known at compile 
time, but are known at runtime.
+        *  
+        * @param d1 dimension1
+        * @param d2 dimension2
+        * @return valid d1 or d2
+        * @throws DMLRuntimeException if error occurs
+        */
+       private long validateDimensions(long d1, long d2) throws 
DMLRuntimeException {
+               if(d1 >= 0 && d2 >= 0 && d1 != d2) {
+                       throw new DMLRuntimeException("Incorrect dimensions:" + 
d1 + " != " + d2);
+               }
+               return Math.max(d1, d2);
+       }
 
        /**
         * Allocates a dense matrix on the GPU (for output)
         * @param varName       name of the output matrix (known by this {@link 
ExecutionContext})
+        * @param numRows number of rows of matrix object
+        * @param numCols number of columns of matrix object
         * @return a pair containing the wrapping {@link MatrixObject} and a 
boolean indicating whether a cuda memory allocation took place (as opposed to 
the space already being allocated)
         * @throws DMLRuntimeException
         */
-       public Pair<MatrixObject, Boolean> 
getDenseMatrixOutputForGPUInstruction(String varName)
+       public Pair<MatrixObject, Boolean> 
getDenseMatrixOutputForGPUInstruction(String varName, long numRows, long 
numCols)
                throws DMLRuntimeException 
        {       
-               MatrixObject mo = allocateGPUMatrixObject(varName);
+               MatrixObject mo = allocateGPUMatrixObject(varName, numRows, 
numCols);
                boolean allocated = 
mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
                mo.getMatrixCharacteristics().setNonZeros(-1);
                return new Pair<MatrixObject, Boolean>(mo, allocated);
        }
 
-    /**
+       /**
      * Allocates a sparse matrix in CSR format on the GPU.
      * Assumes that mat.getNumRows() returns a valid number
      * 
      * @param varName variable name
+     * @param numRows number of rows of matrix object
+        * @param numCols number of columns of matrix object
      * @param nnz number of non zeroes
      * @return matrix object
      * @throws DMLRuntimeException if DMLRuntimeException occurs
      */
-    public Pair<MatrixObject, Boolean> 
getSparseMatrixOutputForGPUInstruction(String varName, long nnz)
+    public Pair<MatrixObject, Boolean> 
getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long 
numCols, long nnz)
         throws DMLRuntimeException
     {
-        MatrixObject mo = allocateGPUMatrixObject(varName);
+       MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
         mo.getMatrixCharacteristics().setNonZeros(nnz);
                                boolean allocated = 
mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
         return new Pair<MatrixObject, Boolean>(mo, allocated);
     } 
 
-       /**
+    /**
         * Allocates the {@link GPUObject} for a given LOPS Variable (eg. 
_mVar3)
         * @param varName variable name
+        * @param numRows number of rows of matrix object
+        * @param numCols number of columns of matrix object
         * @return matrix object
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public MatrixObject allocateGPUMatrixObject(String varName) throws 
DMLRuntimeException {
+       public MatrixObject allocateGPUMatrixObject(String varName, long 
numRows, long numCols) throws DMLRuntimeException {
                MatrixObject mo = getMatrixObject(varName);
+               long dim1 = -1; long dim2 = -1;
+               DMLRuntimeException e = null;
+               try {
+                       dim1 = validateDimensions(mo.getNumRows(), numRows);
+               } catch(DMLRuntimeException e1) {
+                       e = e1;
+               }
+               try {
+                       dim2 = validateDimensions(mo.getNumColumns(), numCols);
+               } catch(DMLRuntimeException e1) {
+                       e = e1;
+               }
+               if(e != null) {
+                       throw new DMLRuntimeException("Incorrect dimensions 
given to allocateGPUMatrixObject: [" + numRows + "," + numCols + "], "
+                                       + "[" + mo.getNumRows() + "," + 
mo.getNumColumns() + "]", e);
+               }
+               if(dim1 != mo.getNumRows() || dim2 != mo.getNumColumns()) {
+                       // Set unknown dimensions
+                       mo.getMatrixCharacteristics().setDimension(dim1, dim2);
+               }
                if( mo.getGPUObject(getGPUContext(0)) == null ) {
                        GPUObject newGObj = 
getGPUContext(0).createGPUObject(mo);
-                       // The lock is added here for an output block
-                       // so that any block currently in use is not 
deallocated by eviction on the GPU
-                       newGObj.addLock();
                        mo.setGPUObject(getGPUContext(0), newGObj);
                }
+               // The lock is added here for an output block
+               // so that any block currently in use is not deallocated by 
eviction on the GPU
+               mo.getGPUObject(getGPUContext(0)).addLock();
                return mo;
        }
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/8df0697e/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index e5ea097..c23253b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -182,9 +182,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                GPUStatistics.incrementNoOfExecutedGPUInst();
                MatrixObject input = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                MatrixObject bias = getMatrixInputForGPUInstruction(ec, 
_input2.getName());
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName(), input.getNumRows(), input.getNumColumns());
                
-               ec.setMetaData(_output.getName(), input.getNumRows(), 
input.getNumColumns());
-               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName());
                if(instOpcode.equalsIgnoreCase("bias_add"))
                        LibMatrixCUDA.biasAdd(ec.getGPUContext(0), 
getExtendedOpcode(), input, bias, out);
                else if(instOpcode.equalsIgnoreCase("bias_multiply"))
@@ -195,13 +194,14 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 
+       // (X > 0) * dout
        public void processReLUBackwardInstruction(ExecutionContext ec) throws 
DMLRuntimeException {
                GPUStatistics.incrementNoOfExecutedGPUInst();
                MatrixObject input = getMatrixInputForGPUInstruction(ec, 
_input1.getName());
                MatrixObject dout = getMatrixInputForGPUInstruction(ec, 
_input2.getName());
                
-               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName());
-               ec.setMetaData(_output.getName(), input.getNumRows(), 
input.getNumColumns());
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName(), input.getNumRows(), input.getNumColumns());
+               
                LibMatrixCUDA.reluBackward(ec.getGPUContext(0), 
getExtendedOpcode(), input, dout, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -251,8 +251,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        if(filter.getNumRows() != K || filter.getNumColumns() 
!= C*R*S) 
                                throw new DMLRuntimeException("Incorrect 
dimensions for filter in conv2d");
                        
-                       ec.setMetaData(_output.getName(), N, K * P * Q);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
+                       
                        LibMatrixCUDA.conv2d(ec.getGPUContext(0), 
getExtendedOpcode(), image, filter, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }
@@ -266,8 +266,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        if(filter.getNumRows() != K || filter.getNumColumns() 
!= C*R*S) 
                                throw new DMLRuntimeException("Incorrect 
dimensions for filter in conv2d");
                        
-                       ec.setMetaData(_output.getName(), N, K * P * Q);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
+                       
                        LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), 
getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
                                                K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q);
                }
@@ -281,8 +281,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                                throw new DMLRuntimeException("Incorrect 
dimensions for dout in conv2d_backward_filter: " + 
                                                dout.getNumRows() + " != " +  N 
+ " || " + dout.getNumColumns() + " != " + K*P*Q);
                        
-                       ec.setMetaData(_output.getName(), K, C * R * S);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), K, C * R * S);
+                       
                        LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                        // TODO: For now always copy the device data to host
@@ -298,8 +298,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                                throw new DMLRuntimeException("Incorrect 
dimensions for dout in conv2d_backward_data: " + 
                                                dout.getNumRows() + " != " +  N 
+ " || " + dout.getNumColumns() + " != " + K*P*Q);
                        
-                       ec.setMetaData(_output.getName(), N, C * H * W);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
+                       
                        LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), 
getExtendedOpcode(), filter, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }
@@ -310,8 +310,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                                throw new DMLRuntimeException("Incorrect 
dimensions for image in maxpooling: " + 
                                                image.getNumRows() + " != " +  
N + " || " + image.getNumColumns() + " != " + C*H*W);
                        
-                       ec.setMetaData(_output.getName(), N, C * P * Q);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
+                       
                        if(instOpcode.equalsIgnoreCase("maxpooling"))
                                LibMatrixCUDA.maxpooling(ec.getGPUContext(0), 
getExtendedOpcode(), image, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
@@ -326,8 +326,8 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                                throw new DMLRuntimeException("Incorrect 
dimensions for image in maxpooling_backward: " + 
                                                image.getNumRows() + " != " +  
N + " || " + image.getNumColumns() + " != " + K*P*Q);
                        
-                       ec.setMetaData(_output.getName(), N, C * H * W);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
+                       
                        LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/8df0697e/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index a5388cb..b962eb7 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -208,12 +208,14 @@ public abstract class GPUInstruction extends Instruction
         * Also records performance information into {@link Statistics}
         * @param ec            active {@link ExecutionContext}
         * @param name  name of input matrix (that the {@link ExecutionContext} 
is aware of)
+        * @param numRows number of rows of matrix object
+        * @param numCols number of columns of matrix object
         * @return      the matrix object
         * @throws DMLRuntimeException  if an error occurs
         */
-       protected MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name) throws 
DMLRuntimeException {
+       protected MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String name, long 
numRows, long numCols) throws DMLRuntimeException {
                long t0 = System.nanoTime();
-               Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name);
+               Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
                if (mb.getValue()) 
GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
                return mb.getKey();
        }

http://git-wip-us.apache.org/repos/asf/systemml/blob/8df0697e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index b8b4f8b..62c0e0d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -574,8 +574,8 @@ public class LibMatrixCUDA {
                long t1=0;
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("relu_backward",
-                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
-                                               imagePointer, doutPointer, 
outputPointer, (int)rows, (int)cols);
+                                               
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
+                                               imagePointer, doutPointer, 
outputPointer, toInt(rows), toInt(cols));
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
        }
@@ -613,8 +613,8 @@ public class LibMatrixCUDA {
                long t1 = 0;
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx).launchKernel("bias_multiply",
-                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
-                                               imagePointer, biasPointer, 
outputPointer, (int)rows, (int)cols, (int) PQ);
+                                               
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
+                                               imagePointer, biasPointer, 
outputPointer, toInt(rows), toInt(cols), toInt(PQ));
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 
        }
@@ -635,9 +635,9 @@ public class LibMatrixCUDA {
                Pointer imagePointer = getDensePointer(gCtx, input, instName);
                Pointer biasPointer = getDensePointer(gCtx, bias, instName);
                Pointer outputPointer = getDensePointer(gCtx, outputBlock, 
instName);
-               int rows = (int) input.getNumRows();
-               int cols = (int) input.getNumColumns();
-               int K = (int) bias.getNumRows();
+               int rows = toInt(input.getNumRows());
+               int cols = toInt(input.getNumColumns());
+               int K = toInt(bias.getNumRows());
                if(bias.getNumColumns() != 1 || cols % K != 0) {
                        throw new DMLRuntimeException("Incorrect inputs for 
bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + 
bias.getNumColumns() + "]");
                }
@@ -704,8 +704,8 @@ public class LibMatrixCUDA {
         LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" 
+ gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
-               int N = (int) image.getNumRows();
-               int C = (int) scale.getNumColumns();
+               int N = toInt(image.getNumRows());
+               int C = toInt(scale.getNumColumns());
                long CHW = image.getNumColumns();
                validateBatchNormalizationDimensions(scale, bias, runningMean, 
runningVar, C);
 
@@ -750,8 +750,8 @@ public class LibMatrixCUDA {
         LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" 
+ gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
-               int N = (int) image.getNumRows();
-               int C = (int) scale.getNumColumns();
+               int N = toInt(image.getNumRows());
+               int C = toInt(scale.getNumColumns());
                long CHW = image.getNumColumns();
                validateBatchNormalizationDimensions(scale, bias, runningMean, 
runningVar, C);
 
@@ -855,8 +855,8 @@ public class LibMatrixCUDA {
         LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
 
-               int N = (int) image.getNumRows();
-               int C = (int) scale.getNumColumns();
+               int N = toInt(image.getNumRows());
+               int C = toInt(scale.getNumColumns());
                long CHW = image.getNumColumns();
 
                // Allocate descriptors
@@ -1241,7 +1241,7 @@ public class LibMatrixCUDA {
                long N = in.getNumRows();
                long CHW = in.getNumColumns();
                MatrixObject output = ec.getMatrixObject(outputName);
-               getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+               getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
                long t0=0;
                cudnnTensorDescriptor srcTensorDesc = 
in.getGPUObject(gCtx).getTensorDescriptor();
                if(N*CHW >= numDoublesIn2GB ||  srcTensorDesc == null) {
@@ -1251,8 +1251,8 @@ public class LibMatrixCUDA {
                        Pointer dstData = getDensePointer(gCtx, output, 
instName);
                        Pointer srcData = getDensePointer(gCtx, in, instName); 
// TODO: FIXME: Add sparse kernel support for relu
                        getCudaKernels(gCtx).launchKernel("relu",
-                                                       
ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int)CHW),
-                                                       srcData, dstData, 
(int)N, (int) CHW);
+                                                       
ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+                                                       srcData, dstData, 
toInt(N), toInt(CHW));
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
                }
                else {
@@ -1297,21 +1297,20 @@ public class LibMatrixCUDA {
                        return;
                }
 
-               // For dense TSMM, exploit cublasDsyrk(...) and call custom 
kernel to flip the matrix
-               MatrixObject output = ec.getMatrixObject(outputName);
-               getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
-
                // Since CuBLAS expects inputs in column-major format,
                // reverse the order of matrix-multiplication and take care of 
dimension mismatch.
                int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : 
cublasOperation.CUBLAS_OP_T;
                // Note: the dimensions are swapped
-               int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows());
-               int k = (int) (isLeftTransposed ? left.getNumRows() : 
left.getNumColumns());
+               int m = toInt(isLeftTransposed ? left.getNumColumns() : 
left.getNumRows());
+               int k = toInt(isLeftTransposed ? left.getNumRows() : 
left.getNumColumns());
+                               
+               // For dense TSMM, exploit cublasDsyrk(...) and call custom 
kernel to flip the matrix
+               MatrixObject output = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName, m, m);    // Allocated the dense output matrix
 
                if(m == -1)
                        throw new DMLRuntimeException("Incorrect dimensions");
 
-               int lda = (int) (isLeftTransposed ? m : k);
+               int lda = toInt(isLeftTransposed ? m : k);
                int ldc = m;
 
                if(!left.getGPUObject(gCtx).isAllocated())
@@ -1351,7 +1350,7 @@ public class LibMatrixCUDA {
                if(ret.getNumRows() != ret.getNumColumns()) {
                        throw new DMLRuntimeException("Only square matrix 
kernel is implemented for copyUpperToLowerTriangle");
                }
-               int dim = (int) ret.getNumRows();
+               int dim = toInt(ret.getNumRows());
                getCudaKernels(gCtx).launchKernel("copy_u2l_dense",
                                                
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
                                                getDensePointer(gCtx, ret, 
instName), dim, dim*dim);
@@ -1401,19 +1400,22 @@ public class LibMatrixCUDA {
                boolean bothSparse = left.getGPUObject(gCtx).isSparse() && 
right.getGPUObject(gCtx).isSparse();
 
                MatrixObject output = ec.getMatrixObject(outputName);
+               
+               long outRLen = isLeftTransposed ? left.getNumColumns() : 
left.getNumRows();
+               long outCLen = isRightTransposed ? right.getNumRows() : 
right.getNumColumns();
 
                if (bothDense) {                // Dense C = Dense A * Dense B
                        // For both dense, do cuBLAS
-                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen); // Allocated the dense output matrix
                        denseDenseMatmult(gCtx, instName, output, left, right, 
isLeftTransposed, isRightTransposed);
                }
                else if (bothSparse){   // Sparse C = Sparse A * Sparse B
-                       ec.allocateGPUMatrixObject(outputName);
+                       ec.allocateGPUMatrixObject(outputName, outRLen, 
outCLen);
                        bothSparseMatmult(gCtx, instName, output, left, right, 
isLeftTransposed, isRightTransposed);
                }
                else {  // Either of A or B is sparse, Sparse C = Sparse/Dense 
A * Dense/Sparse B
                        // Convert the dense to sparse and use the 
cusparseDcsrgemm routine
-                       ec.allocateGPUMatrixObject(outputName);
+                       ec.allocateGPUMatrixObject(outputName, outRLen, 
outCLen);
                        eitherSparseMatmult(gCtx, instName, output, left, 
right, isLeftTransposed, isRightTransposed);
                }
 
@@ -1436,10 +1438,10 @@ public class LibMatrixCUDA {
        private static void eitherSparseMatmult(GPUContext gCtx, String 
instName, MatrixObject output, MatrixObject left, MatrixObject right,
                                                                                
                                                                                
                boolean isLeftTransposed, boolean isRightTransposed) throws 
DMLRuntimeException {
 
-               int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
-               int n = (int) (isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
-               int k = (int) (isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
-               int k1 = (int) (isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
+               int m = toInt(isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
+               int n = toInt(isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
+               int k = toInt(isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
+               int k1 = toInt(isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
                if(k != k1)
                        throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
 
@@ -1521,10 +1523,10 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS && allocated) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
                        Pointer C = getDensePointer(gCtx, output, instName);
                        denseDenseMatmult(gCtx, instName, C,
-                                                       (int) 
left.getNumRows(), (int) left.getNumColumns(),
-                                                       (int) 
right.getNumColumns(), (int) right.getNumRows(),
-                                                       isLeftTransposed, 
!isRightTransposed,
-                                                       ADense, 
BDenseTransposed);
+                                       toInt(left.getNumRows()), 
toInt(left.getNumColumns()),
+                                       toInt(right.getNumColumns()), 
toInt(right.getNumRows()),
+                                       isLeftTransposed, !isRightTransposed,
+                                       ADense, BDenseTransposed);
 
                        gCtx.cudaFreeHelper(instName, BDenseTransposed);
                }
@@ -1602,10 +1604,10 @@ public class LibMatrixCUDA {
 
                                Pointer C = getDensePointer(gCtx, output, 
instName);
                                denseDenseMatmult(gCtx, instName, C,
-                                                               (int) 
left.getNumColumns(), (int) left.getNumRows(),
-                                                               (int) 
right.getNumRows(), (int) right.getNumColumns(),
-                                                               
!isLeftTransposed, isRightTransposed,
-                                                               
ADenseTransposed, BDense);
+                                               toInt(left.getNumColumns()), 
toInt(left.getNumRows()),
+                                               toInt(right.getNumRows()), 
toInt(right.getNumColumns()),
+                                               !isLeftTransposed, 
isRightTransposed,
+                                               ADenseTransposed, BDense);
 
                                gCtx.cudaFreeHelper(instName, ADenseTransposed);
                        }
@@ -1661,10 +1663,10 @@ public class LibMatrixCUDA {
         */
        private static void bothSparseMatmult(GPUContext gCtx, String instName, 
MatrixObject output, MatrixObject left, MatrixObject right,
                                                                                
                                                                                
        boolean isLeftTransposed, boolean isRightTransposed) throws 
DMLRuntimeException {
-        int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
-               int n = (int) (isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
-               int k = (int) (isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
-               int k1 = (int) (isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
+        int m = toInt(isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
+               int n = toInt(isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
+               int k = toInt(isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
+               int k1 = toInt(isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
                if(k != k1)
                        throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
 
@@ -1765,10 +1767,10 @@ public class LibMatrixCUDA {
                Pointer leftPtr = getDensePointer(gCtx, left, instName);
                Pointer rightPtr = getDensePointer(gCtx, right, instName);
 
-               int leftRows = (int) left.getNumRows();
-               int leftCols = (int) left.getNumColumns();
-               int rightRows = (int) right.getNumRows();
-               int rightCols = (int) right.getNumColumns();
+               int leftRows = toInt(left.getNumRows());
+               int leftCols = toInt(left.getNumColumns());
+               int rightRows = toInt(right.getNumRows());
+               int rightCols = toInt(right.getNumColumns());
                Pointer C = getDensePointer(gCtx, output, instName);
                denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, 
rightRows, rightCols, isLeftTransposed, isRightTransposed,
                                                leftPtr, rightPtr);
@@ -1978,11 +1980,22 @@ public class LibMatrixCUDA {
                        // throw new DMLRuntimeException("Internal Error - Not 
implemented");
 
                }
+               
+               long outRLen = -1;
+               long outCLen = -1;
+               if (indexFn instanceof ReduceRow) { // COL{SUM, MAX...}
+                       outRLen = 1;
+                       outCLen = clen;
+               }
+               else if (indexFn instanceof ReduceCol) { // ROW{SUM, MAX,...}
+                       outRLen = rlen;
+                       outCLen = 1;
+               }
 
                Pointer out = null;
                if (reductionDirection == REDUCTION_COL || reductionDirection 
== REDUCTION_ROW) {
                        // Matrix output
-                       MatrixObject out1 = 
getDenseMatrixOutputForGPUInstruction(ec, instName, output);
+                       MatrixObject out1 = 
getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
                        out = getDensePointer(gCtx, out1, instName);
                }
 
@@ -2403,16 +2416,17 @@ public class LibMatrixCUDA {
 
                Pointer A, C;
                if (isSparseAndEmpty(gCtx, in)) {
-                       setOutputToConstant(ec, gCtx, instName, 
op.executeScalar(0.0), outputName);
+                       setOutputToConstant(ec, gCtx, instName, 
op.executeScalar(0.0), outputName, in.getNumRows(),
+                                       in.getNumColumns());
                        return;
                } else {
                        A = getDensePointer(gCtx, in, instName);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);     // 
Allocated the dense output matrix
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
in.getNumRows(), in.getNumColumns());        // Allocated the dense output 
matrix
                        C = getDensePointer(gCtx, out, instName);
                }
 
-               int rlenA = (int) in.getNumRows();
-               int clenA = (int) in.getNumColumns();
+               int rlenA = toInt(in.getNumRows());
+               int clenA = toInt(in.getNumColumns());
 
                matrixScalarOp(gCtx, instName, A, constant, rlenA, clenA, C, 
op);
        }
@@ -2434,6 +2448,10 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                double constant = op.getConstant();
                LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + 
", GPUContext=" + gCtx);
+               
+               int outRLen = isInputTransposed ? (int) in.getNumColumns() : 
(int) in.getNumRows();
+               int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) 
in.getNumColumns();
+               
                //boolean isCUDALibAvailable = (op.fn instanceof Multiply
                //              || (op.fn instanceof Divide && op instanceof 
RightScalarOperator && constant != 0)) && !isSparseAndEmpty(gCtx, in);
                //if(!isCUDALibAvailable) {
@@ -2442,11 +2460,10 @@ public class LibMatrixCUDA {
                                        deviceCopy(ec, gCtx, instName, in, 
outputName, isInputTransposed);
                                }
                                else if(op.fn instanceof Multiply || op.fn 
instanceof And) {
-                                       setOutputToConstant(ec, gCtx, instName, 
0.0, outputName);
-
+                                       setOutputToConstant(ec, gCtx, instName, 
0.0, outputName, outRLen, outCLen);
                                }
                                else if(op.fn instanceof Power) {
-                                       setOutputToConstant(ec, gCtx, instName, 
1.0, outputName);
+                                       setOutputToConstant(ec, gCtx, instName, 
1.0, outputName, outRLen, outCLen);
                                }
                 // TODO:
                 // x/0.0 is either +Infinity or -Infinity according to Java.
@@ -2468,7 +2485,7 @@ public class LibMatrixCUDA {
                                }
                        }
                        else if(constant == 1.0 && op.fn instanceof Or) {
-                               setOutputToConstant(ec, gCtx, instName, 1.0, 
outputName);
+                               setOutputToConstant(ec, gCtx, instName, 1.0, 
outputName, outRLen, outCLen);
                        }
                        else if(constant == 1.0 && (op.fn instanceof And || 
op.fn instanceof Power)) {
                                deviceCopy(ec, gCtx, instName, in, outputName, 
isInputTransposed);
@@ -2518,9 +2535,9 @@ public class LibMatrixCUDA {
                boolean in2SparseAndEmpty = isSparseAndEmpty(gCtx, in2);
                if (in1SparseAndEmpty && in2SparseAndEmpty) {
                        if (op.fn instanceof LessThan || op.fn instanceof 
GreaterThan || op.fn instanceof NotEquals) {
-                               setOutputToConstant(ec, gCtx, instName, 0.0, 
outputName);
+                               setOutputToConstant(ec, gCtx, instName, 0.0, 
outputName, in1.getNumRows(), in1.getNumColumns());
                        } else if (op.fn instanceof LessThanEquals || op.fn 
instanceof GreaterThanEquals || op.fn instanceof Equals) {
-                               setOutputToConstant(ec, gCtx, instName, 1.0, 
outputName);
+                               setOutputToConstant(ec, gCtx, instName, 1.0, 
outputName, in1.getNumRows(), in1.getNumColumns());
                        }
                } else if (in1SparseAndEmpty) {
                        matrixScalarRelational(ec, gCtx, instName, in2, 
outputName, new LeftScalarOperator(op.fn, 0.0));
@@ -2591,12 +2608,12 @@ public class LibMatrixCUDA {
                if(isInputTransposed)
                        throw new DMLRuntimeException("Transposing the input is 
not supported");
 
-               int rlenA = (int) in.getNumRows();
-               int clenA = (int) in.getNumColumns();
+               int rlenA = toInt(in.getNumRows());
+               int clenA = toInt(in.getNumColumns());
                Pointer A = getDensePointer(gCtx, in, instName); // TODO: 
FIXME: Implement sparse binCellSparseScalarOp kernel
                double scalar = op.getConstant();
                // MatrixObject out = ec.getMatrixObject(outputName);
-               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName);     // Allocated the dense output matrix
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName, rlenA, clenA);       // Allocated the dense output matrix
                Pointer C = getDensePointer(gCtx, out, instName);
                matrixScalarOp(gCtx, instName, A, scalar, rlenA, clenA, C, op);
        }
@@ -2648,16 +2665,21 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                boolean isEmpty1 = isSparseAndEmpty(gCtx, in1);
                boolean isEmpty2 = isSparseAndEmpty(gCtx, in2);
-               int rlenA = (int) in1.getNumRows();
-               int rlenB = (int) in2.getNumRows();
-               int clenA = (int) in1.getNumColumns();
-               int clenB = (int) in2.getNumColumns();
+               int rlenA = toInt(in1.getNumRows());
+               int rlenB = toInt(in2.getNumRows());
+               int clenA = toInt(in1.getNumColumns());
+               int clenB = toInt(in2.getNumColumns());
                int vecStatusA = getVectorStatus(rlenA, clenA).code();
                int vecStatusB = getVectorStatus(rlenB, clenB).code();
+               
+               if(isLeftTransposed || isRightTransposed) {
+                       throw new DMLRuntimeException("Unsupported operator: 
GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
+               }
+               long outRLen = Math.max(rlenA, rlenB);
+               long outCLen = Math.max(clenA, clenB);
 
                if (isEmpty1 && isEmpty2){
-                       MatrixObject out = ec.getMatrixObject(outputName);
-                       ec.allocateGPUMatrixObject(outputName);
+                       MatrixObject out = 
ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
                        // When both inputs are empty, the output is empty too 
(except in the case of division)
                        if (op.fn instanceof Divide || op.fn instanceof 
IntegerDivide || op.fn instanceof Modulus) {
                                
out.getGPUObject(gCtx).allocateAndFillDense(Double.NaN);
@@ -2681,8 +2703,14 @@ public class LibMatrixCUDA {
                        Pointer A = getDensePointer(gCtx, in1, instName); // 
TODO: FIXME: Implement sparse binCellSparseOp kernel
                        Pointer B = getDensePointer(gCtx, in2, instName); // 
TODO: FIXME: Implement sparse binCellSparseOp kernel
 
-                       MatrixObject out = ec.getMatrixObject(outputName);
-                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+                       // Allocated the dense output matrix
+                       MatrixObject out = null;
+                       try {
+                               out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName, outRLen, outCLen);
+                       } catch(DMLRuntimeException e) {
+                               throw new DMLRuntimeException("Incorrect 
dimensions: dimA:[" + rlenA + "," + clenA + "]"
+                                               + " dimB:[" + rlenB + "," + 
clenB + "] out:[" + outRLen + "," + outCLen + "]", e);
+                       }
                        Pointer C = getDensePointer(gCtx, out, instName);
 
                        int maxRlen = Math.max(rlenA, rlenB);
@@ -2781,7 +2809,7 @@ public class LibMatrixCUDA {
        private static void deviceCopy(ExecutionContext ec, GPUContext gCtx, 
String instName, MatrixObject src, String outputName) throws 
DMLRuntimeException {
                Pointer srcPtr = getDensePointer(gCtx, src, instName); // TODO: 
FIXME: Implement sparse kernel
                MatrixObject out = ec.getMatrixObject(outputName);
-               getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+               getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
toInt(src.getNumRows()), toInt(src.getNumColumns()));   // Allocated the dense 
output matrix
                Pointer destPtr = getDensePointer(gCtx, out, instName);
                deviceCopy(instName, srcPtr, destPtr, (int)src.getNumRows(), 
(int)src.getNumColumns());
        }
@@ -2793,10 +2821,11 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                Pointer A = getDensePointer(gCtx, in, instName); // TODO: 
FIXME: Implement sparse kernel
                MatrixObject out = ec.getMatrixObject(outputName);
-               getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+               int rlen = toInt(out.getNumRows());
+               int clen = toInt(out.getNumColumns());
+               getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
rlen, clen);    // Allocated the dense output matrix
                Pointer ret = getDensePointer(gCtx, out, instName);
-               int rlen = (int) out.getNumRows();
-               int clen = (int) out.getNumColumns();
+               
                // out.getMatrixCharacteristics().setNonZeros(rlen*clen);
                // compareAndSet(double* A,  double* ret, int rlen, int clen, 
double compareVal, double ifEqualsVal, double ifNotEqualsVal)
                long t0=0;
@@ -2814,19 +2843,20 @@ public class LibMatrixCUDA {
         * @param instName    name of the invoking instruction to record{@link 
Statistics}.
         * @param constant              scalar value with which to fill the 
matrix
         * @param outputName    (internal) name of the matrix that is to be 
filled
+        * @param numRows number of rows of output matrix object
+        * @param numCols number of columns of output matrix object
         * @throws DMLRuntimeException if error
         */
-       private static void setOutputToConstant(ExecutionContext ec, GPUContext 
gCtx, String instName, double constant, String outputName) throws 
DMLRuntimeException {
+       private static void setOutputToConstant(ExecutionContext ec, GPUContext 
gCtx, String instName, double constant, String outputName, long numRows, long 
numCols) throws DMLRuntimeException {
                if (ec.getGPUContext(0) != gCtx)
                        throw new DMLRuntimeException("GPU : Invalid internal 
state, the GPUContext set with the ExecutionContext is not the same used to run 
this LibMatrixCUDA function");
                if(constant == 0) {
-                       getSparseMatrixOutputForGPUInstruction(ec, 0, instName, 
outputName);
+                       getSparseMatrixOutputForGPUInstruction(ec, numRows, 
numCols, 0, instName, outputName);
                } else {
-                       //MatrixObject out = ec.getMatrixObject(outputName);
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);    // 
Allocated the dense output matrix
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, 
numCols);   // Allocated the dense output matrix
                        Pointer A = getDensePointer(gCtx, out, instName);
-                       int rlen = (int) out.getNumRows();
-                       int clen = (int) out.getNumColumns();
+                       int rlen = toInt(out.getNumRows());
+                       int clen = toInt(out.getNumColumns());
                        long t0 = 0;
                        if (GPUStatistics.DISPLAY_STATISTICS)
                                t0 = System.nanoTime();
@@ -2914,6 +2944,9 @@ public class LibMatrixCUDA {
                Pointer betaPtr = pointerTo(beta);
                int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
                int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
+               
+               long outRLen = isLeftTransposed ? in1.getNumColumns() : 
in1.getNumRows();
+               long outCLen = isLeftTransposed ? in1.getNumRows() : 
in1.getNumColumns();
 
                MatrixObject out = ec.getMatrixObject(outputName);
                boolean isSparse1 = isInSparseFormat(gCtx, in1);
@@ -2945,7 +2978,7 @@ public class LibMatrixCUDA {
                        }
                        CSRPointer B = 
in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 
-                       ec.allocateGPUMatrixObject(outputName);
+                       ec.allocateGPUMatrixObject(outputName, outRLen, 
outCLen);
 
                        if (in1 == in2 && isLeftTransposed == true && 
isLeftTransposed == isRightTransposed) {
                                // Special case for transpose
@@ -2973,8 +3006,8 @@ public class LibMatrixCUDA {
                                //long sizeOfC = CSRPointer.estimateSize(C.nnz, 
out.getNumRows());
                                if (GPUStatistics.DISPLAY_STATISTICS)
                                        t0 = System.nanoTime();
-                               
JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, 
(int) A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
-                                               B.descr, (int) B.nnz, B.val, 
B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
+                               
JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, 
toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
+                                               B.descr, toInt(B.nnz), B.val, 
B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
                                //cudaDeviceSynchronize;
                                if (GPUStatistics.DISPLAY_STATISTICS)
                                        
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB,
@@ -2983,24 +3016,24 @@ public class LibMatrixCUDA {
                } else {
                        // Dense-Dense dgeam
 
-                       int lda = (int) in1.getNumColumns();
-                       int ldb = (int) in2.getNumColumns();
-                       int m = (int) in1.getNumColumns();
-                       int n = (int) in2.getNumRows();
+                       int lda = toInt(in1.getNumColumns());
+                       int ldb = toInt(in2.getNumColumns());
+                       int m = toInt(in1.getNumColumns());
+                       int n = toInt(in2.getNumRows());
                        if (isLeftTransposed && isRightTransposed) {
-                               m = (int) in1.getNumRows();
-                               n = (int) in2.getNumColumns();
+                               m = toInt(in1.getNumRows());
+                               n = toInt(in2.getNumColumns());
                        }
                        else if (isLeftTransposed) {
-                               m = (int) in1.getNumRows();
+                               m = toInt(in1.getNumRows());
                        } else if (isRightTransposed) {
-                               n = (int) in2.getNumColumns();
+                               n = toInt(in2.getNumColumns());
                        }
                        int ldc = m;
 
                        Pointer A = getDensePointer(gCtx, in1, instName);
                        Pointer B = getDensePointer(gCtx, in2, instName);
-                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+                       getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName, outRLen, outCLen);      // Allocated the dense output matrix
                        Pointer C = getDensePointer(gCtx, out, instName);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
@@ -3042,6 +3075,12 @@ public class LibMatrixCUDA {
        //******************* End of Re-org Functions ************************/
        //********************************************************************/
 
+       private static int toInt(long num) throws DMLRuntimeException { // Checked long-to-int narrowing: fails with DMLRuntimeException instead of silently truncating
+               if(num > Integer.MAX_VALUE || num < Integer.MIN_VALUE) { // Integer.MAX_VALUE and Integer.MIN_VALUE are themselves valid ints, so only values strictly outside the range are rejected
+                       throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
+               }
+               return (int)num; // Lossless here: num was verified above to lie within the int range
+       }
 
        //********************************************************************/
        //**************** Matrix Manipulation Functions *********************/
@@ -3054,23 +3093,24 @@ public class LibMatrixCUDA {
                LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx);
 
                long t1 = 0;
+               
+               long rowsA = toInt(in1.getNumRows());
+               long colsA = toInt(in1.getNumColumns());
+               long rowsB = toInt(in2.getNumRows());
+               long colsB = toInt(in2.getNumColumns());
+
+               if (rowsA != rowsB) {
+                       throw new DMLRuntimeException("GPU : Invalid internal 
state - the rows must match up for a cbind operation");
+               }
 
                // only Dense supported
-               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName);
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName, rowsA, colsA + colsB);
                Pointer C = getDensePointer(gCtx, out, instName);
                Pointer A = getDensePointer(gCtx, in1, instName);
                Pointer B = getDensePointer(gCtx, in2, instName);
 
-               int rowsA = (int) in1.getNumRows();
-               int colsA = (int) in1.getNumColumns();
-               int rowsB = (int) in2.getNumRows();
-               int colsB = (int) in2.getNumColumns();
-
-               if (rowsA != rowsB){
-                       throw new DMLRuntimeException("GPU : Invalid internal 
state - the rows must match up for a cbind operation");
-               }
-               int maxRows = Math.max(rowsA, rowsB);
-               int maxCols = Math.max(colsA, colsB);
+               int maxRows = toInt(Math.max(rowsA, rowsB));
+               int maxCols = toInt(Math.max(colsA, colsB));
 
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                getCudaKernels(gCtx)
@@ -3086,21 +3126,22 @@ public class LibMatrixCUDA {
                LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx);
 
                long t1 = 0;
+               
+               int rowsA = toInt(in1.getNumRows());
+               int colsA = toInt(in1.getNumColumns());
+               int rowsB = toInt(in2.getNumRows());
+               int colsB = toInt(in2.getNumColumns());
+
+               if (colsA != colsB){
+                       throw new DMLRuntimeException("GPU : Invalid internal 
state - the columns must match up for a rbind operation");
+               }
 
                // only Dense supported
-               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName);
+               MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName, rowsA + rowsB, colsA);
                Pointer C = getDensePointer(gCtx, out, instName);
                Pointer A = getDensePointer(gCtx, in1, instName);
                Pointer B = getDensePointer(gCtx, in2, instName);
 
-               int rowsA = (int) in1.getNumRows();
-               int colsA = (int) in1.getNumColumns();
-               int rowsB = (int) in2.getNumRows();
-               int colsB = (int) in2.getNumColumns();
-
-               if (colsA != colsB){
-                       throw new DMLRuntimeException("GPU : Invalid internal 
state - the columns must match up for a rbind operation");
-               }
                int maxRows = Math.max(rowsA, rowsB);
                int maxCols = Math.max(colsA, colsB);
 
@@ -3353,14 +3394,14 @@ public class LibMatrixCUDA {
                long t1=0;
                if (isSparseAndEmpty) {
                        MatrixObject out = ec.getMatrixObject(outputName);
-                       ec.allocateGPUMatrixObject(outputName);
+                       ec.allocateGPUMatrixObject(outputName, 
in1.getNumRows(), in1.getNumColumns());
                        
out.getGPUObject(gCtx).allocateAndFillDense(sparseAndEmptyFillValue);
                } else {
                        // Dense
-                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
in1.getNumRows(), in1.getNumColumns());
                        Pointer output = getDensePointer(gCtx, out, instName);
                        Pointer input = getDensePointer(gCtx, in1, instName);
-                       int size = (int)(in1.getNumColumns() * 
in1.getNumRows());
+                       int size = toInt(in1.getNumColumns() * 
in1.getNumRows());
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
                        getCudaKernels(gCtx).launchKernel(kernel, 
ExecutionConfig.getConfigForSimpleVectorOperations(size),
                                        input, output, size);
@@ -3387,7 +3428,7 @@ public class LibMatrixCUDA {
                Pointer A = getDensePointer(gCtx, in1, instName);
                Pointer B = getDensePointer(gCtx, in2, instName);
                MatrixObject out = ec.getMatrixObject(outputName);
-               getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
+               getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
in1.getNumRows(), in1.getNumColumns()); // Allocated the dense output matrix
                Pointer C = getDensePointer(gCtx, out, instName);
 
                long t1=0, t2=0;
@@ -3406,7 +3447,7 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
 
                        if (GPUStatistics.DISPLAY_STATISTICS) t2 = 
System.nanoTime();
-                       JCublas2.cublasDaxpy(getCublasHandle(gCtx), (int) n, 
alphaPtr, B, 1, C, 1);
+                       JCublas2.cublasDaxpy(getCublasHandle(gCtx), toInt(n), 
alphaPtr, B, 1, C, 1);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
                }
                else {
@@ -3416,8 +3457,8 @@ public class LibMatrixCUDA {
                        // Note: Vector-Matrix operation is not supported
                        // daxpy_matrix_vector(double* A,  double* B, double 
alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
                        if (GPUStatistics.DISPLAY_STATISTICS) t1 = 
System.nanoTime();
-                       int rlenA = (int) in1.getNumRows(); int clenA =  (int) 
in1.getNumColumns();
-                       int rlenB = (int) in2.getNumRows(); int clenB =  (int) 
in2.getNumColumns();
+                       int rlenA = toInt(in1.getNumRows()); int clenA =  
toInt(in1.getNumColumns());
+                       int rlenB = toInt(in2.getNumRows()); int clenB =  
toInt(in2.getNumColumns());
                        
getCudaKernels(gCtx).launchKernel("daxpy_matrix_vector", 
ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA),
                                        A, B, constant, C, rlenA, clenA, rlenB, 
clenB);
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1);
@@ -3448,11 +3489,11 @@ public class LibMatrixCUDA {
         if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) {    
// Both dense
             GPUObject Aobj = in1.getGPUObject(gCtx);
             GPUObject bobj = in2.getGPUObject(gCtx);
-            int m = (int) in1.getNumRows();
-            int n = (int) in1.getNumColumns();
-            if ((int) in2.getNumRows() != m)
+            int m = toInt(in1.getNumRows());
+            int n = toInt(in1.getNumColumns());
+            if (in2.getNumRows() != m)
                 throw new DMLRuntimeException("GPU : Incorrect input for 
solve(), rows in A should be the same as rows in B");
-            if ((int) in2.getNumColumns() != 1)
+            if (in2.getNumColumns() != 1)
                 throw new DMLRuntimeException("GPU : Incorrect input for 
solve(), columns in B should be 1");
 
 
@@ -3524,7 +3565,7 @@ public class LibMatrixCUDA {
 
             // TODO  : Find a way to assign bTobj directly to the output and 
set the correct flags so as to not crash
             // There is an avoidable copy happening here
-            MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
instName, outputName);
+                       MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, 
in1.getNumColumns(), 1);
             cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), 
bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, 
cudaMemcpyDeviceToDevice);
 
             gCtx.cudaFreeHelper(instName, work);
@@ -3578,13 +3619,15 @@ public class LibMatrixCUDA {
         * @param ec            active {@link ExecutionContext}
         * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param name  name of input matrix (that the {@link ExecutionContext} 
is aware of)
+        * @param numRows number of rows of output matrix object
+        * @param numCols number of columns of output matrix object
         * @return      the matrix object
         * @throws DMLRuntimeException  if an error occurs
         */
-       private static MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, 
String name) throws DMLRuntimeException {
+       private static MatrixObject 
getDenseMatrixOutputForGPUInstruction(ExecutionContext ec, String instName, 
String name, long numRows, long numCols) throws DMLRuntimeException {
                long t0=0;
                if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-               Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name);
+               Pair<MatrixObject, Boolean> mb = 
ec.getDenseMatrixOutputForGPUInstruction(name, numRows, numCols);
                if (mb.getValue())
                        if (GPUStatistics.DISPLAY_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
@@ -3595,16 +3638,18 @@ public class LibMatrixCUDA {
         * Helper method to get the output block (allocated on the GPU)
         * Also records performance information into {@link Statistics}
         * @param ec            active {@link ExecutionContext}
+        * @param numRows number of rows of matrix object
+        * @param numCols number of columns of matrix object
         * @param nnz number of non zeroes in output matrix
         * @param instName the invoking instruction's name for record {@link 
Statistics}.
         * @param name  name of input matrix (that the {@link ExecutionContext} 
is aware of)
         * @return      the matrix object
         * @throws DMLRuntimeException  if an error occurs
         */
-       private static MatrixObject 
getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long nnz, String 
instName, String name) throws DMLRuntimeException {
+       private static MatrixObject 
getSparseMatrixOutputForGPUInstruction(ExecutionContext ec, long numRows, long 
numCols, long nnz, String instName, String name) throws DMLRuntimeException {
                long t0=0;
                if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-               Pair<MatrixObject, Boolean> mb = 
ec.getSparseMatrixOutputForGPUInstruction(name, nnz);
+               Pair<MatrixObject, Boolean> mb = 
ec.getSparseMatrixOutputForGPUInstruction(name, numRows, numCols, nnz);
                if (mb.getValue())
                        if (GPUStatistics.DISPLAY_STATISTICS)
                                GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0);

Reply via email to