Repository: incubator-systemml
Updated Branches:
  refs/heads/master 154f0778c -> 3caae2718


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3caae271/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 2af894b..c0e6d4f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -54,6 +54,7 @@ import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 import static jcuda.runtime.JCuda.cudaFree;
 import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
 import org.apache.commons.logging.Log;
@@ -61,30 +62,13 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysml.runtime.functionobjects.And;
-import org.apache.sysml.runtime.functionobjects.Divide;
-import org.apache.sysml.runtime.functionobjects.Equals;
-import org.apache.sysml.runtime.functionobjects.GreaterThan;
-import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
-import org.apache.sysml.runtime.functionobjects.LessThan;
-import org.apache.sysml.runtime.functionobjects.LessThanEquals;
-import org.apache.sysml.runtime.functionobjects.Minus;
-import org.apache.sysml.runtime.functionobjects.Multiply;
-import org.apache.sysml.runtime.functionobjects.Multiply2;
-import org.apache.sysml.runtime.functionobjects.NotEquals;
-import org.apache.sysml.runtime.functionobjects.Or;
-import org.apache.sysml.runtime.functionobjects.Plus;
-import org.apache.sysml.runtime.functionobjects.Power;
-import org.apache.sysml.runtime.functionobjects.Power2;
-import org.apache.sysml.runtime.functionobjects.ValueFunction;
+import org.apache.sysml.runtime.functionobjects.*;
+import org.apache.sysml.runtime.instructions.cp.DoubleObject;
 import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
 import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
 import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
 import 
org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
-import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
-import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
-import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
-import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysml.runtime.matrix.operators.*;
 import org.apache.sysml.utils.Statistics;
 import static jcuda.jcudnn.JCudnn.cudnnAddTensor;
 
@@ -105,7 +89,7 @@ import jcuda.jcusparse.cusparseHandle;
 
 //FIXME move could to respective instructions, this is not a block library
 public class LibMatrixCUDA {
-       
+
        public static cudnnHandle cudnnHandle;
        public static cublasHandle cublasHandle;
        public static cusparseHandle cusparseHandle;
@@ -114,7 +98,7 @@ public class LibMatrixCUDA {
     private static final Log LOG = 
LogFactory.getLog(LibMatrixCUDA.class.getName());
 
        private static int CONVOLUTION_PREFERENCE = 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-       
+
        public static void conv2d(MatrixObject image, MatrixObject filter, 
MatrixObject outputBlock, int N, int C, int H, int W,
                        int K, int R, int S, int pad_h, int pad_w, int 
stride_h, int stride_w, int P, int Q)
                        throws DMLRuntimeException {
@@ -124,7 +108,7 @@ public class LibMatrixCUDA {
                if(isInSparseFormat(filter)) {
                        ((JCudaObject)filter.getGPUObject()).sparseToDense();
                }
-               
+
                cudnnTensorDescriptor srcTensorDesc = null;
                cudnnTensorDescriptor dstTensorDesc = null;
                cudnnFilterDescriptor filterDesc = null;
@@ -138,24 +122,24 @@ public class LibMatrixCUDA {
                        srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
                        dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
                        filterDesc = allocateFilterDescriptor(K, C, R, S);
-                       
+
                        // Allocate data
                        // (Pointer) gpuCtx.prepare(image, true, true);
                        // (Pointer) gpuCtx.prepare(filter, true, true);
-                       
-                       Pointer imagePointer = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer filterPointer = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer dstPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; 
-                       
-                       int padding [] = { pad_h, pad_w }; 
+
+                       Pointer imagePointer = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer filterPointer = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dstPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+
+                       int padding [] = { pad_h, pad_w };
                        int strides [] = { stride_h, stride_w };
                        convDesc = allocateConvolutionDescriptor(padding, 
strides);
-                       
+
                        // Select the best algorithm depending on the data and 
supported CUDA
-                       
-                       int algo = -1; 
+
+                       int algo = -1;
                        workSpace = new Pointer();
-                       
+
                        if(CONVOLUTION_PREFERENCE == 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
                                algo = 
jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
                        }
@@ -181,11 +165,11 @@ public class LibMatrixCUDA {
                        else {
                                throw new DMLRuntimeException("Unsupported 
preference criteria for convolution");
                        }
-                       
+
                        alpha = pointerTo(1.0);
                        beta = pointerTo(0.0f);
-                       int status = cudnnConvolutionForward(cudnnHandle, 
alpha, 
-                                       srcTensorDesc, imagePointer, 
+                       int status = cudnnConvolutionForward(cudnnHandle, alpha,
+                                       srcTensorDesc, imagePointer,
                                        filterDesc, filterPointer,
                                        convDesc, algo, workSpace, sizeInBytes, 
beta,
                                        dstTensorDesc, dstPointer);
@@ -194,12 +178,12 @@ public class LibMatrixCUDA {
                        }
                }
                finally {
-                       
+
                        if(alpha != null)
                                cudaFree(alpha);
                        if(beta != null)
                                cudaFree(beta);
-                       
+
                        if(srcTensorDesc != null)
                                cudnnDestroyTensorDescriptor(srcTensorDesc);
                        if(dstTensorDesc != null)
@@ -211,33 +195,33 @@ public class LibMatrixCUDA {
                        if(workSpace != null && sizeInBytes != 0)
                                cudaFree(workSpace);
                }
-       }       
-       
+       }
+
        private static cudnnConvolutionDescriptor 
allocateConvolutionDescriptor(int padding [], int strides []) {
                cudnnConvolutionDescriptor convDesc = new 
cudnnConvolutionDescriptor();
                cudnnCreateConvolutionDescriptor(convDesc);
-               cudnnSetConvolution2dDescriptor(convDesc, padding[0], 
padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);             
  
+               cudnnSetConvolution2dDescriptor(convDesc, padding[0], 
padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
                return convDesc;
        }
-       
+
        public static  Pointer pointerTo(double value) {
         return Pointer.to(new double[] { value });
     }
-       
+
        private static  cudnnTensorDescriptor allocateTensorDescriptor(int N, 
int C, int H, int W) {
                cudnnTensorDescriptor ret = new cudnnTensorDescriptor();
                cudnnCreateTensorDescriptor(ret);
                cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);
                return ret;
        }
-       
+
        private static cudnnFilterDescriptor allocateFilterDescriptor(int K, 
int C, int R, int S) {
                cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
                cudnnCreateFilterDescriptor(filterDesc);
                cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, K, C, 
R, S);
                return filterDesc;
        }
-       
+
        /**
         * allocates pooling descriptor, used in poolingForward and 
poolingBackward
         * @param R                     pooling window height
@@ -254,7 +238,7 @@ public class LibMatrixCUDA {
                cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, R, 
S, pad_h, pad_w, stride_h, stride_w);
                return poolingDesc;
        }
-       
+
        public static void bias_add(MatrixObject input, MatrixObject bias, 
MatrixObject outputBlock) throws DMLRuntimeException {
                if(isInSparseFormat(input)) {
                        ((JCudaObject)input.getGPUObject()).sparseToDense();
@@ -272,11 +256,11 @@ public class LibMatrixCUDA {
                        int PQ = (int) input.getNumColumns() / K;
                        alpha = pointerTo(1.0); // TODO
                        beta = pointerTo(1.0);
-                       
+
                        // Allocate descriptors
                        biasTensorDesc = allocateTensorDescriptor(1, K, 1, 1);
                        dstTensorDesc = allocateTensorDescriptor(N, K, PQ, 1);
-                       Pointer imagePointer = 
((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; 
+                       Pointer imagePointer = 
((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
                        Pointer biasPointer = 
((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
                        Pointer outputPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
                        // TODO: Avoid memcpy by allowing update in-place 
bias_add
@@ -293,9 +277,9 @@ public class LibMatrixCUDA {
                        if(dstTensorDesc != null)
                                cudnnDestroyTensorDescriptor(dstTensorDesc);
                }
-               
+
        }
-       
+
        public static void conv2d_backward_filter(MatrixObject image, 
MatrixObject dout,
                        MatrixObject outputBlock, int N, int C, int H, int W, 
int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
@@ -312,7 +296,7 @@ public class LibMatrixCUDA {
                cudnnTensorDescriptor doutTensorDesc = null;
                cudnnFilterDescriptor dwDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
-               
+
                Pointer workSpace = null;
                long sizeInBytes = 0;
                try {
@@ -320,27 +304,27 @@ public class LibMatrixCUDA {
                        xTensorDesc = allocateTensorDescriptor(N, C, H, W);
                        doutTensorDesc = allocateTensorDescriptor(N, K, P, Q);
                        dwDesc = allocateFilterDescriptor(K, C, R, S);
-                       
+
                        // Allocate data
-                       Pointer imagePointer = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer doutPointer = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer dwPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; 
-                       
+                       Pointer imagePointer = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer doutPointer = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dwPointer = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+
                        alpha = pointerTo(1.0); // TODO
                        beta = pointerTo(0.0f);
-                       
-                       int padding [] = { pad_h, pad_w }; 
+
+                       int padding [] = { pad_h, pad_w };
                        int strides [] = { stride_h, stride_w };
                        convDesc = allocateConvolutionDescriptor(padding, 
strides);
                        long sizeInBytesArray[] = { 0 };
-                       
+
                        // TODO: Select the best algorithm depending on the 
data and supported CUDA
                        int algo = 
jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
                        workSpace = new Pointer();
                        
cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
                                        xTensorDesc, doutTensorDesc, convDesc, 
dwDesc, algo, sizeInBytesArray);
-                       
-                       int status = 
cudnnConvolutionBackwardFilter(cudnnHandle, alpha, xTensorDesc, imagePointer, 
+
+                       int status = 
cudnnConvolutionBackwardFilter(cudnnHandle, alpha, xTensorDesc, imagePointer,
                                        doutTensorDesc, doutPointer, convDesc, 
algo, workSpace, sizeInBytes, beta, dwDesc, dwPointer);
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardFilter: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
@@ -357,28 +341,28 @@ public class LibMatrixCUDA {
                                cudnnDestroyTensorDescriptor(doutTensorDesc);
                        if(dwDesc != null)
                                cudnnDestroyFilterDescriptor(dwDesc);
-                       
+
                        if(convDesc != null)
                                cudnnDestroyConvolutionDescriptor(convDesc);
-                       
+
                        if(workSpace != null && sizeInBytes != 0)
                                cudaFree(workSpace);
                }
        }
-       
+
        private static long numDoublesIn2GB = 125000000;
-       
+
        public static void relu(ExecutionContext ec, MatrixObject in, String 
outputName) throws DMLRuntimeException {
                if(isInSparseFormat(in)) {
                        // TODO: FIXME: Implement sparse relu kernel
                        ((JCudaObject)in.getGPUObject()).sparseToDense();
                }
-               
+
                cudnnTensorDescriptor srcTensorDesc = null;
                cudnnTensorDescriptor dstTensorDesc = null;
                Pointer alpha = null;
                Pointer beta = null;
-               
+
                try {
                        alpha = pointerTo(1.0f);
                        beta = pointerTo(0.0f);
@@ -386,34 +370,34 @@ public class LibMatrixCUDA {
                        long H = in.getNumColumns();
                        long W = 1;
                        Pointer srcData = 
((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr;
-                       
+
                        MatrixObject output = ec.getMatrixObject(outputName);
                        ec.getDenseMatrixOutputForGPUInstruction(outputName);   
// Allocated the dense output matrix
                        Pointer dstData = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
-                       
+
                        if(N*H*W >= numDoublesIn2GB) {
                                // Invokes relu(double* A,  double* ret, int 
rlen, int clen)
                                kernels.launchKernel("relu",
-                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)), 
+                                               
ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)),
                                                srcData, dstData, (int)N, (int) 
H*W);
                        }
                        else {
                                // Allocate descriptors
                                srcTensorDesc = 
allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
                                dstTensorDesc = 
allocateTensorDescriptor((int)N, 1, (int)H, (int)W);
-                               
-                   cudnnActivationForward(cudnnHandle, CUDNN_ACTIVATION_RELU, 
-                       alpha, srcTensorDesc, srcData, 
+
+                   cudnnActivationForward(cudnnHandle, CUDNN_ACTIVATION_RELU,
+                       alpha, srcTensorDesc, srcData,
                        beta, dstTensorDesc, dstData);
                        }
                }
                finally {
-                       
+
                        if(alpha != null)
                                cudaFree(alpha);
                        if(beta != null)
                                cudaFree(beta);
-                       
+
                        if(srcTensorDesc != null)
                                cudnnDestroyTensorDescriptor(srcTensorDesc);
                        if(dstTensorDesc != null)
@@ -423,7 +407,7 @@ public class LibMatrixCUDA {
 
        /**
         * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting 
cublasDsyrk(...)
-        * 
+        *
         * @param ec execution context
         * @param left input matrix, as in a tsmm expression like A %*% A' or 
A' %*% A, we just need to check whether the left one is transposed or not, I 
named it 'left'
         * @param outputName output matrix name
@@ -437,44 +421,44 @@ public class LibMatrixCUDA {
                matmult(ec, left, left, outputName, isLeftTransposed, 
!isLeftTransposed);
                return;
            }
-       
+
            // For dense TSMM, exploit cublasDsyrk(...) and call custom kernel 
to flip the matrix
            MatrixObject output = ec.getMatrixObject(outputName);
            ec.getDenseMatrixOutputForGPUInstruction(outputName);       // 
Allocated the dense output matrix
-           
+
            // Since CuBLAS expects inputs in column-major format,
-           // reverse the order of matrix-multiplication and take care of 
dimension mismatch.      
+           // reverse the order of matrix-multiplication and take care of 
dimension mismatch.
            int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : 
cublasOperation.CUBLAS_OP_T;
            // Note: the dimensions are swapped
            int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows());
            int k = (int) (isLeftTransposed ? left.getNumRows() : 
left.getNumColumns());
-       
+
            if(m == -1)
                    throw new DMLRuntimeException("Incorrect dimensions");
-       
+
            double[] alpha = {1.0d};
            double[] beta = {0.0d};
-       
+
            int lda = (int) (isLeftTransposed ? m : k);
            int ldc = m;
-       
+
            if(!left.getGPUObject().isAllocated())
                    throw new DMLRuntimeException("Input is not allocated:" + 
left.getGPUObject().isAllocated());
            if(!output.getGPUObject().isAllocated())
                    throw new DMLRuntimeException("Output is not allocated:" + 
output.getGPUObject().isAllocated());
-       
+
            Pointer A = ((JCudaObject)left.getGPUObject()).jcudaDenseMatrixPtr;
            Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
-           
+
            JCublas2.cublasDsyrk(cublasHandle, 
cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, Pointer.to(alpha), A, lda, 
Pointer.to(beta), C, ldc);
            copyUpperToLowerTriangle(output);
        }
-       
+
        /**
         * Used for all version of TSMM where the result is known to be 
symmetric.
         * Hence, we compute only the upper triangular matrix and copy this 
partial
         * result down to lower triangular matrix once.
-        * 
+        *
         * @param ret
         * @throws DMLRuntimeException
         */
@@ -487,10 +471,15 @@ public class LibMatrixCUDA {
                }
                int dim = (int) ret.getNumRows();
                kernels.launchKernel("copyUpperToLowerTriangleDense",
-                               
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), 
+                               
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
                                
((JCudaObject)ret.getGPUObject()).jcudaDenseMatrixPtr, dim, dim*dim);
        }
-       
+
+
+       //********************************************************************/
+       //***************** MATRIX MULTIPLY Functions ************************/
+       //********************************************************************/
+
        /**
         * Matrix multiply on GPU
         * Examines sparsity and shapes and routes call to appropriate method
@@ -507,13 +496,13 @@ public class LibMatrixCUDA {
         */
        public static MatrixObject matmult(ExecutionContext ec, MatrixObject 
left1, MatrixObject right1, String outputName,
                        boolean isLeftTransposed1, boolean isRightTransposed1) 
throws DMLRuntimeException {
-               
+
                if(!left1.getGPUObject().isAllocated() || 
!right1.getGPUObject().isAllocated())
                        throw new DMLRuntimeException("One of input is not 
allocated:" + left1.getGPUObject().isAllocated() + " " + 
right1.getGPUObject().isAllocated());
-               
+
                boolean bothDense = !left1.getGPUObject().isInSparseFormat() && 
!right1.getGPUObject().isInSparseFormat();
                boolean bothSparse = left1.getGPUObject().isInSparseFormat() && 
right1.getGPUObject().isInSparseFormat();
-               
+
                MatrixObject output = ec.getMatrixObject(outputName);
 
                if (bothDense) {                // Dense C = Dense A * Dense B
@@ -530,10 +519,10 @@ public class LibMatrixCUDA {
                        ec.allocateGPUMatrixObject(outputName);
                        eitherSparseMatmult(output, left1, right1, 
isLeftTransposed1, isRightTransposed1);
                }
-               
+
                return output;
        }
-       
+
        /**
         * One of the matrices is sparse, the other dense
         * C = op(A) x op(B)
@@ -546,22 +535,22 @@ public class LibMatrixCUDA {
         */
        protected static void eitherSparseMatmult(MatrixObject output, 
MatrixObject left, MatrixObject right,
                        boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
-               
+
                int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
                int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
-               
+
                int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
                int n = (int) (isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
                int k = (int) (isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
                int k1 = (int) (isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
-               if(k != k1) 
+               if(k != k1)
                        throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
-               
+
                if(m == -1 || n == -1 || k == -1)
                        throw new DMLRuntimeException("Incorrect dimensions");
-               
-               
-               if (left.getGPUObject().isInSparseFormat()) {   
+
+
+               if (left.getGPUObject().isInSparseFormat()) {
                        // Left sparse, right dense
                        sparseDenseMatmult(output, left, right, 
isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
                } else {
@@ -569,7 +558,7 @@ public class LibMatrixCUDA {
                        denseSparseMatmult(output, right, left, 
isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
                }
        }
-       
+
        /**
         * C = op(A) * op(B) where A is dense and B is sparse
         * If B is ultrasparse, A is converted to a sparse matrix and {@code 
sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, 
CSRPointer)} is invoked
@@ -612,10 +601,10 @@ public class LibMatrixCUDA {
                        // Note the arguments to denseDenseMatmult to 
accommodate for this.
                        Pointer BDenseTransposed = 
B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, 
(int)right.getNumRows(), (int)right.getNumColumns());
                        output.getGPUObject().acquireDeviceModifyDense();       
// To allocate the dense matrix
-                       Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;           
-                       denseDenseMatmult(C, 
+                       Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+                       denseDenseMatmult(C,
                                        (int) left.getNumRows(), (int) 
left.getNumColumns(),
-                                       (int) right.getNumColumns(), (int) 
right.getNumRows(), 
+                                       (int) right.getNumColumns(), (int) 
right.getNumRows(),
                                        isLeftTransposed, !isRightTransposed,
                                        ADense, BDenseTransposed);
                        cudaFree(BDenseTransposed);
@@ -643,15 +632,15 @@ public class LibMatrixCUDA {
                        throws DMLRuntimeException {
                CSRPointer A = 
((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
                Pointer BDense = 
((JCudaObject)right.getGPUObject()).jcudaDenseMatrixPtr;
-               
-               if (n == 1){    
+
+               if (n == 1){
                        // Sparse Matrix - Dense Vector multiply
                        LOG.debug(" GPU Sparse Matrix - Dense Vector Mutliply");
                        sparseMatrixDenseVectorMult(output, A, BDense, transA, 
(int)left.getNumRows(), (int)left.getNumColumns());
-                       
+
                } else {
                        // Sparse Matrix Dense Matrix multiply
-                       if (A.isUltraSparse(m, k)){     
+                       if (A.isUltraSparse(m, k)){
                                LOG.debug(" GPU Sparse-Dense Matrix 
Multiplication (Converted to Sparse-Sparse)");
                                // Convert right to CSR and do cuSparse matmul
                                long t0 = System.nanoTime();
@@ -664,17 +653,17 @@ public class LibMatrixCUDA {
                                sparseSparseMatmult(output, transA, transB, m, 
n, k, A, B);
                                B.deallocate();
                                cudaFree(BT);
-                       } else {                                        
+                       } else {
                                LOG.debug(" GPU Sparse-Dense Matrix 
Multiplication (Converted to Dense-Dense)");
                                // Convert left to dense and do a cuBlas matmul
                                // ADenseTransposed is a column major matrix
                                // Note the arguments to denseDenseMatmult to 
accommodate for this.
                                Pointer ADenseTransposed = 
A.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, 
(int)left.getNumRows(), (int)left.getNumColumns());
                                
output.getGPUObject().acquireDeviceModifyDense();       // To allocate the 
dense matrix
-                               Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;           
-                               denseDenseMatmult(C, 
+                               Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
+                               denseDenseMatmult(C,
                                                (int) left.getNumColumns(), 
(int) left.getNumRows(),
-                                               (int) right.getNumRows(), (int) 
right.getNumColumns(), 
+                                               (int) right.getNumRows(), (int) 
right.getNumColumns(),
                                                !isLeftTransposed, 
isRightTransposed,
                                                ADenseTransposed, BDense);
                                cudaFree(ADenseTransposed);
@@ -720,25 +709,25 @@ public class LibMatrixCUDA {
         */
        protected static void bothSparseMatmult(MatrixObject output, 
MatrixObject left, MatrixObject right,
                        boolean isLeftTransposed, boolean isRightTransposed) 
throws DMLRuntimeException {
-               
+
                int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
                int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : 
CUSPARSE_OPERATION_NON_TRANSPOSE;
-               
+
                int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows()) ;
                int n = (int) (isRightTransposed ? right.getNumRows() : 
right.getNumColumns());
                int k = (int) (isLeftTransposed ? left.getNumRows() :  
left.getNumColumns());
                int k1 = (int) (isRightTransposed ? right.getNumColumns() : 
right.getNumRows());
-               if(k != k1) 
+               if(k != k1)
                        throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
-               
+
                if(m == -1 || n == -1 || k == -1)
                        throw new DMLRuntimeException("Incorrect dimensions");
-                       
+
                CSRPointer A = 
((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
                CSRPointer B = 
((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;
-               
+
                // TODO if (m == 1) {   // Vector-matrix multiplication
-               
+
                if (!isRightTransposed && right.getNumColumns() == 1){  // 
Matrix-Vector multiplication
                        sparseMatrixVectorMult(output, transA, 
(int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, 
B);
                } else {                                                        
                                        // Matrix-Matrix multiplication
@@ -753,7 +742,7 @@ public class LibMatrixCUDA {
         * @param transA        if A is to be transposed or not (the op in 
op(A))
         * @param m                     number of rows in A (not op(A))
         * @param n                     number of cols in A (not op(A))
-        * @param k                     number of rows in B, (cols in B is 
assumed to be 1)             
+        * @param k                     number of rows in B, (cols in B is 
assumed to be 1)
         * @param A                     left sparse matrix on GPU
         * @param B                     right sparse vector on GPU
         * @throws DMLRuntimeException if DMLRuntimeException occurs
@@ -786,7 +775,7 @@ public class LibMatrixCUDA {
                
((JCudaObject)output.getGPUObject()).setSparseMatrixCudaPointer(C);
                long sizeOfC = CSRPointer.estimateSize(C.nnz, 
output.getNumRows());
                output.getGPUObject().setDeviceModify(sizeOfC);
-               
+
                cusparseDcsrgemm(cusparseHandle, transA, transB, m, n, k,
                                A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
                                B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
@@ -797,7 +786,7 @@ public class LibMatrixCUDA {
        /**
         * Dense dense matrix multiply
         * C = op(A) * op(B), A and B are dense matrices
-        * @param output                                output object C on host 
with GPU data allocated                         
+        * @param output                                output object C on host 
with GPU data allocated
         * @param left1                                 left matrix A on host 
(in row-major order)
         * @param right1                                right matrix B on host 
(in row-major order)
         * @param isLeftTransposed1     op for A, transposed or not
@@ -806,15 +795,15 @@ public class LibMatrixCUDA {
         */
        protected static void denseDenseMatmult(MatrixObject output, 
MatrixObject left1, MatrixObject right1,
                        boolean isLeftTransposed1, boolean isRightTransposed1) 
throws DMLRuntimeException {
-               
+
                Pointer leftPtr = 
((JCudaObject)left1.getGPUObject()).jcudaDenseMatrixPtr;
                Pointer rightPtr = 
((JCudaObject)right1.getGPUObject()).jcudaDenseMatrixPtr;
-               
+
                int leftRows = (int) left1.getNumRows();
                int leftCols = (int) left1.getNumColumns();
                int rightRows = (int) right1.getNumRows();
                int rightCols = (int) right1.getNumColumns();
-               Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;           
+               Pointer C = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
                denseDenseMatmult(C, leftRows, leftCols, rightRows, rightCols, 
isLeftTransposed1, isRightTransposed1,
                                leftPtr, rightPtr);
        }
@@ -824,7 +813,7 @@ public class LibMatrixCUDA {
         * C = op(A) * op(B), A and B are dense matrices
         * On the host, the matrices are in row-major format; cuBLAS expects 
them in column-major format.
         * What we have as input is t(A) and t(B), t(X) = transpose of X.
-        * We do t(B) %*% t(A) to get t(C); 
+        * We do t(B) %*% t(A) to get t(C);
         * If we were to calculate t(t(C), we would get the resultant matrix C, 
but this would be in column-major format.
         * What we really want is t(C). This we already have as the result of 
t(B) %*% t(A).
         * @param output                        output allocated on GPU in 
column major format
@@ -841,50 +830,50 @@ public class LibMatrixCUDA {
        public static void denseDenseMatmult(Pointer output, int leftRows1, int 
leftCols1, int rightRows1,
                        int rightCols1, boolean isLeftTransposed1, boolean 
isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
                        throws DMLRuntimeException {
-               
+
                Pointer A = rightPtr;
                Pointer B = leftPtr;
-               
+
                int leftRows = rightCols1;
                int leftCols = rightRows1;
                int rightRows = leftCols1;
                int rightCols = leftRows1;
-               
-               boolean isLeftTransposed = isRightTransposed1; 
-               boolean isRightTransposed = isLeftTransposed1; 
-               
+
+               boolean isLeftTransposed = isRightTransposed1;
+               boolean isRightTransposed = isLeftTransposed1;
+
                // Note: the dimensions are swapped
                int m = (int) (isLeftTransposed ? leftCols : leftRows) ;
                int n = (int) (isRightTransposed ? rightRows : rightCols);
                int k = (int) (isLeftTransposed ?  leftRows : leftCols);
                int k1 = (int) (isRightTransposed ?  rightCols : rightRows);
-               if(k != k1) 
+               if(k != k1)
                        throw new DMLRuntimeException("Dimension mismatch: " + 
k + " != " + k1);
-               
+
                if(m == -1 || n == -1 || k == -1)
                        throw new DMLRuntimeException("Incorrect dimensions");
-               
+
                double[] one = { 1 };
                double[] zero = { 0 };
-               
+
                //int lda = leftRows;
                //int ldb = leftCols;
         int lda = isLeftTransposed ?  k : m;
         int ldb = isRightTransposed ? n : k;
                int ldc = m;
-               
+
                int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
                int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : 
cublasOperation.CUBLAS_OP_N;
 
                Pointer C = output;
-               if (m == 1 && n == 1){ 
+               if (m == 1 && n == 1){
                        // Vector product
                        LOG.debug(" GPU Dense-dense Vector Product");
                        double[] result = {0};
                        JCublas2.cublasDdot(cublasHandle, k, A, 1, B, 1, 
Pointer.to(result));
                        // By default in CuBlas V2, cublas pointer mode is set 
to CUBLAS_POINTER_MODE_HOST.
                        // This means that scalar values passed are on host (as 
opposed to on device).
-                       // The result is copied from the host back to the 
device so that the rest of 
+                       // The result is copied from the host back to the 
device so that the rest of
                        // infrastructure can treat it uniformly.
                        cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, 
cudaMemcpyHostToDevice);
                } else if (m == 1) {
@@ -902,6 +891,293 @@ public class LibMatrixCUDA {
                }
        }
 
+       //********************************************************************/
+       //***************** END OF MATRIX MULTIPLY Functions *****************/
+       //********************************************************************/
+
+
+
+       //********************************************************************/
+       //****************  UNARY AGGREGATE Functions ************************/
+       //********************************************************************/
+
	/**
	 * Direction of reduction for aggregate binary operations.
	 * NOTE(review): unaryAggregate() encodes the direction as plain int
	 * constants (REDUCTION_ALL etc.) rather than using this enum — confirm
	 * whether this type is referenced elsewhere or is dead code.
	 */
	private enum ReductionDirection{
		ALL, ROW, COL, DIAG;
	};
+
+       /**
+        * Entry point to perform Unary aggregate operations on the GPU.
+        * The execution context object is used to allocate memory for the GPU.
+        * @param ec                    Instance of {@link ExecutionContext}, 
from which the output variable will be allocated
+        * @param in1                   input matrix
+        * @param output        output matrix/scalar name
+        * @param op                    Instance of {@link 
AggregateUnaryOperator} which encapsulates the direction of 
reduction/aggregation and the reduction operation.
+        * @throws DMLRuntimeException
+        */
+       public static void unaryAggregate(ExecutionContext ec, MatrixObject 
in1, String output, AggregateUnaryOperator op)
+                                       throws DMLRuntimeException {
+
+               final int REDUCTION_ALL = 1;
+               final int REDUCTION_ROW = 2;
+               final int REDUCTION_COL = 3;
+               final int REDUCTION_DIAG = 4;
+
+               // A kahan sum implemention is not provided. is a "uak+" or 
other kahan operator is encountered,
+               // it just does regular summation reduction.
+               final int OP_PLUS = 1;
+               final int OP_PLUS_SQ = 2;
+               final int OP_MEAN = 3;
+               final int OP_VARIANCE = 4;
+               final int OP_MULTIPLY = 5;
+               final int OP_MAX = 6;
+               final int OP_MIN = 7;
+               final int OP_MAXINDEX = 8;
+               final int OP_MININDEX = 9;
+
+
+               // Sanity Checks
+               if(!in1.getGPUObject().isAllocated())
+                       throw new DMLRuntimeException("Internal Error - The 
input is not allocated for a GPU Aggregate Unary:" + 
in1.getGPUObject().isAllocated());
+
+               boolean isSparse = in1.getGPUObject().isInSparseFormat();
+               IndexFunction indexFn = op.indexFn;
+               AggregateOperator aggOp = op.aggOp;
+
+               // Convert Reduction direction to a number to pass to CUDA 
kernel
+               int reductionDirection = -1;
+               if (indexFn instanceof ReduceAll){
+                       reductionDirection = REDUCTION_ALL;
+               } else if (indexFn instanceof ReduceRow){
+                       reductionDirection = REDUCTION_ROW;
+               } else if (indexFn instanceof ReduceCol){
+                       reductionDirection = REDUCTION_COL;
+               } else if (indexFn instanceof ReduceDiag){
+                       reductionDirection = REDUCTION_DIAG;
+               } else {
+                       throw new DMLRuntimeException("Internal Error - Invalid 
index function type, only reducing along rows, columns, diagonals or all 
elements is supported in Aggregate Unary operations");
+               }
+               assert reductionDirection !=-1 : "Internal Error - Incorrect 
type of reduction direction set for aggregate unary GPU instruction";
+
+               // Convert function type to a number to pass to the CUDA Kernel
+               int opIndex = -1;
+               if (aggOp.increOp.fn instanceof KahanPlus) {
+                       opIndex = OP_PLUS;
+               } else if (aggOp.increOp.fn instanceof KahanPlusSq) {
+                       opIndex = OP_PLUS_SQ;
+               } else if (aggOp.increOp.fn instanceof Mean) {
+                       opIndex = OP_MEAN;
+               } else if (aggOp.increOp.fn instanceof CM) {
+                       assert ((CM)aggOp.increOp.fn).getAggOpType() == 
CMOperator.AggregateOperationTypes.VARIANCE : "Internal Error - Invalid Type of 
CM operator for Aggregate Unary operation on GPU";
+                       opIndex = OP_VARIANCE;
+               } else if (aggOp.increOp.fn instanceof Plus) {
+                       opIndex = OP_PLUS;
+               } else if (aggOp.increOp.fn instanceof Multiply) {
+                       opIndex = OP_MULTIPLY;
+               } else if (aggOp.increOp.fn instanceof Builtin) {
+                       Builtin b = (Builtin)aggOp.increOp.fn;
+                       switch(b.bFunc) {
+                               case MAX: opIndex = OP_MAX; break;
+                               case MIN: opIndex = OP_MIN; break;
+                               case MAXINDEX: opIndex = OP_MAXINDEX; break;
+                               case MININDEX: opIndex = OP_MININDEX;break;
+                               default:
+                                       new DMLRuntimeException("Internal Error 
- Unsupported Builtin Function for Aggregate unary being done on GPU");
+                       }
+               } else {
+                       throw new DMLRuntimeException("Internal Error - 
Aggregate operator has invalid Value function");
+               }
+               assert opIndex != -1 : "Internal Error - Incorrect type of 
operation set for aggregate unary GPU instruction";
+
+
+               //TODO - care about reductionDirection & opIndex
+
+               int rlen = (int)in1.getNumRows();
+               int clen = (int)in1.getNumColumns();
+               if (isSparse){
+                       long nnz = in1.getNnz();
+                       assert nnz > 0 : "Internal Error - number of non zeroes 
set to " + nnz + " in Aggregate Binary for GPU";
+                       MatrixObject out = 
ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
+                       throw new DMLRuntimeException("Internal Error - Not 
implemented");
+
+               } else {
+                       Pointer out = null;
+                       if (reductionDirection == REDUCTION_ALL || 
reductionDirection == REDUCTION_DIAG) {
+                               // Scalar output
+                               out = new Pointer();
+                               cudaMalloc(out, Sizeof.DOUBLE);
+                       } else {
+                               // Matrix output
+                               MatrixObject out1 = 
ec.getDenseMatrixOutputForGPUInstruction(output);
+                               out = ((JCudaObject) 
out1.getGPUObject()).jcudaDenseMatrixPtr;
+                       }
+
+                       Pointer in = 
((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr;
+                       int size = rlen * clen;
+
+                       // For scalars, set the scalar output in the Execution 
Context object
+                       switch (opIndex){
+                               case OP_PLUS: {
+                                       switch(reductionDirection) {
+                                               case REDUCTION_ALL : {
+                                                       double result = 
reduce_single(in, size);
+                                                       
ec.setScalarOutput(output, new DoubleObject(result));
+                                                       break;
+                                               }
+                                               case REDUCTION_DIAG :
+                                               case REDUCTION_COL :
+                                               case REDUCTION_ROW :
+                                                       throw new 
DMLRuntimeException("Internal Error - Row, Column and Diag summation not 
implemented yet");
+                                       }
+                                       break;
+                               }
+                               case OP_PLUS_SQ : {
+                                       switch(reductionDirection) {
+                                               case REDUCTION_ALL:
+                                               case REDUCTION_COL:
+                                               case REDUCTION_ROW:
+                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column summation square of 
matrix not implemented yet for GPU");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
summation squared");
+                                       }
+                                       // break;
+                               }
+                               case OP_MEAN:{
+                                       switch(reductionDirection) {
+                                               case REDUCTION_ALL:
+                                               case REDUCTION_COL:
+                                               case REDUCTION_ROW:
+                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column mean of matrix not 
implemented yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
mean");
+                                       }
+                                       // break;
+                               }
+                               case OP_VARIANCE : {
+                                       switch(reductionDirection) {
+                                               case REDUCTION_ALL:
+                                               case REDUCTION_COL:
+                                               case REDUCTION_ROW:
+                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column variance of matrix not 
implemented yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
variance");
+                                       }
+                                       // break;
+                               }
+                               case OP_MULTIPLY : {
+                                       switch (reductionDirection) {
+                                               case REDUCTION_ALL:
+                                                       throw new 
DMLRuntimeException("Internal Error - All element multiplication of matrix not 
implemented yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
multiplication");
+                                       }
+                                       // break;
+                               }
+                               case OP_MAX :{
+                                       switch(reductionDirection) {
+                                               case REDUCTION_ALL:
+                                               case REDUCTION_COL:
+                                               case REDUCTION_ROW:
+                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column max of matrix not 
implemented yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
+                                       }
+                                       // break;
+                               }
+                               case OP_MIN :{
+                                       switch(reductionDirection) {
+                                               case REDUCTION_ALL:
+                                               case REDUCTION_COL:
+                                               case REDUCTION_ROW:
+                                                       throw new 
DMLRuntimeException("Internal Error - All, Row & Column min of matrix not 
implemented yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
+                                       }
+                                       // break;
+                               }
+                               case OP_MAXINDEX : {
+                                       switch(reductionDirection) {
+                                               case REDUCTION_COL:
+                                                       throw new 
DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented 
yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
maxindex");
+                                       }
+                                       // break;
+                               }
+                               case OP_MININDEX : {
+                                       switch(reductionDirection) {
+                                               case REDUCTION_COL:
+                                                       throw new 
DMLRuntimeException("Internal Error - Column minindex of matrix not implemented 
yet for GPU ");
+                                               default:
+                                                       throw new 
DMLRuntimeException("Internal Error - Unsupported reduction direction for 
minindex");
+                                       }
+                                       // break;
+                               }
+                               default : throw new 
DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
+                       }
+
+               }
+       }
+
+
+       private static double reduce_single(Pointer in, int n) throws 
DMLRuntimeException {
+               int[] tmp = getThreadsBlocksAndSharedMem(n);
+               int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
+
+               Pointer tempOut = JCudaObject.allocate(n * Sizeof.DOUBLE);
+               kernels.launchKernel("reduce", new ExecutionConfig(blocks, 
threads, sharedMem),
+                                               in, tempOut, n);
+               cudaDeviceSynchronize();
+               int s = n;
+               while (s > 1) {
+                       tmp = getThreadsBlocksAndSharedMem(n);
+                       blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
+                       kernels.launchKernel("reduce", new 
ExecutionConfig(blocks, threads, sharedMem),
+                                                       tempOut, tempOut, s);
+                       s = (s + (threads*2-1)) / (threads*2);
+               }
+               double[] result = {-1f};
+               cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, 
cudaMemcpyDeviceToHost);
+               cudaFree(tempOut);
+
+               return result[0];
+       }
+
+
+       private static int[] getThreadsBlocksAndSharedMem(int n){
+               final int MAX_THREADS = 1024;
+               final int MAX_BLOCKS = 65535;
+               int threads = (n < MAX_THREADS*2) ? nextPow2((n + 1)/ 2) : 
MAX_THREADS;
+
+               int blocks = (n + (threads * 2 - 1)) / (threads * 2);
+               blocks = Math.min(MAX_BLOCKS, blocks);
+
+               int sharedMemSize = threads * Sizeof.DOUBLE;
+               if (threads <= 32){
+                       sharedMemSize *= 2;
+               }
+               return new int[] {blocks, threads, sharedMemSize};
+       }
+
+
+       private static int nextPow2(int x)
+       {
+               --x;
+               x |= x >> 1;
+               x |= x >> 2;
+               x |= x >> 4;
+               x |= x >> 8;
+               x |= x >> 16;
+               return ++x;
+       }
+
+       //********************************************************************/
+       //****************  END OF UNARY AGGREGATE Functions *****************/
+       //********************************************************************/
+
+
        public static void conv2d_backward_data(MatrixObject filter, 
MatrixObject dout,
                        MatrixObject output, int N, int C, int H, int W, int K, 
int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
@@ -918,7 +1194,7 @@ public class LibMatrixCUDA {
                cudnnTensorDescriptor dxDesc = null;
                cudnnFilterDescriptor wDesc = null;
                cudnnConvolutionDescriptor convDesc = null;
-               
+
                Pointer workSpace = null;
                long sizeInBytes = 0;
                try {
@@ -926,27 +1202,27 @@ public class LibMatrixCUDA {
                        wDesc = allocateFilterDescriptor(K, C, R, S);
                        dyDesc = allocateTensorDescriptor(N, K, P, Q);
                        dxDesc = allocateTensorDescriptor(N, C, H, W);
-                       
+
                        // Allocate data
-                       Pointer w = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer dy = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer dx = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; 
+                       Pointer w = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dy = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer dx = 
((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr;
 
                        alpha = pointerTo(1.0); // TODO
                        beta = pointerTo(0.0f);
-                       
-                       int padding [] = { pad_h, pad_w }; 
+
+                       int padding [] = { pad_h, pad_w };
                        int strides [] = { stride_h, stride_w };
                        convDesc = allocateConvolutionDescriptor(padding, 
strides);
                        long sizeInBytesArray[] = { 0 };
-                       
+
                        // TODO: Select the best algorithm depending on the 
data and supported CUDA
                        int algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
                        workSpace = new Pointer();
                        
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
                                        wDesc, dyDesc, convDesc, dxDesc, algo, 
sizeInBytesArray);
-                       
-                       int status = cudnnConvolutionBackwardData(cudnnHandle, 
alpha, wDesc, w, 
+
+                       int status = cudnnConvolutionBackwardData(cudnnHandle, 
alpha, wDesc, w,
                                        dyDesc, dy, convDesc, algo, workSpace, 
sizeInBytes, beta, dxDesc, dx);
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnConvolutionBackwardData: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
@@ -963,15 +1239,15 @@ public class LibMatrixCUDA {
                                cudnnDestroyTensorDescriptor(dxDesc);
                        if(wDesc != null)
                                cudnnDestroyFilterDescriptor(wDesc);
-                       
+
                        if(convDesc != null)
                                cudnnDestroyConvolutionDescriptor(convDesc);
-                       
+
                        if(workSpace != null && sizeInBytes != 0)
                                cudaFree(workSpace);
                }
        }
-       
+
        /**
         * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
         * @param image image as matrix object
@@ -1009,16 +1285,16 @@ public class LibMatrixCUDA {
                        yDesc = allocateTensorDescriptor(N, C, P, Q);
                        xDesc = allocateTensorDescriptor(N, C, H, W);
                        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
-                       
+
                        // Allocate data
-                       Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; 
-                       Pointer y = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; 
-                       
+                       Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
+                       Pointer y = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+
                        alpha = pointerTo(1.0);
                        beta = pointerTo(0.0f);
-                       
+
                        int status = cudnnPoolingForward(cudnnHandle, 
poolingDesc, alpha, xDesc, x, beta, yDesc, y);
-                       
+
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
@@ -1036,7 +1312,7 @@ public class LibMatrixCUDA {
                                cudnnDestroyPoolingDescriptor(poolingDesc);
                }
        }
-       
+
        /**
         * performs maxpoolingBackward on GPU by exploiting 
cudnnPoolingBackward(...)
         * @param image image as matrix object
@@ -1081,34 +1357,34 @@ public class LibMatrixCUDA {
                        yDesc = allocateTensorDescriptor(N, C, P, Q);
                        dxDesc = allocateTensorDescriptor(N, C, H, W);
                        dyDesc = allocateTensorDescriptor(N, C, P, Q);
-                       
+
                        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
-                       
+
                        // Calling PoolForward first, y is one of the inputs 
for poolBackward
                        // TODO: Remove calling poolForward after necessary 
changes at language level for poolBackward
                        Pointer y = new Pointer();
                        long numBytes = N*C*P*Q*Sizeof.DOUBLE;
                        cudaMalloc(y, numBytes);
-                       
+
                        // Allocate data
                        Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
                        Pointer dx = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
                        Pointer dy = 
((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr;
-                       
+
                        alpha = pointerTo(1.0);
                        beta = pointerTo(0.0f);
-                       
+
                        int status = cudnnPoolingForward(cudnnHandle, 
poolingDesc, alpha, xDesc, x, beta, yDesc, y);
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnPoolingForward before cudnnPoolingBackward: " + 
jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
-                       
+
                        status = cudnnPoolingBackward(cudnnHandle, poolingDesc, 
alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
-                       
+
                        if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
                                throw new DMLRuntimeException("Could not 
executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
                        }
-                       
+
                        cudaFree(y);
                }
                finally {
@@ -1125,18 +1401,18 @@ public class LibMatrixCUDA {
                        if(dxDesc != null)
                                cudnnDestroyTensorDescriptor(dxDesc);
                        if(poolingDesc != null)
-                               cudnnDestroyPoolingDescriptor(poolingDesc);     
-               }       
+                               cudnnDestroyPoolingDescriptor(poolingDesc);
+               }
        }
        public static boolean isInSparseFormat(MatrixObject mo) {
                if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
                        return mo.getGPUObject().isInSparseFormat();
                return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), 
mo.getNumColumns(), mo.getNnz());
        }
-       
+
        /**
         * Performs elementwise matrix-scalar operation specified by op
-        * 
+        *
         * @param ec execution context
         * @param in input matrix
         * @param outputName output matrix name
@@ -1146,7 +1422,7 @@ public class LibMatrixCUDA {
         */
        public static void bincellOp(ExecutionContext ec, MatrixObject in, 
String outputName, boolean isInputTransposed, ScalarOperator op) throws 
DMLRuntimeException {
                double constant = op.getConstant();
-               boolean isCUDALibAvailable = (op.fn instanceof Multiply 
+               boolean isCUDALibAvailable = (op.fn instanceof Multiply
                                || (op.fn instanceof Divide && op instanceof 
RightScalarOperator && constant != 0)) && !isSparseAndEmpty(in);
                if(!isCUDALibAvailable) {
                        if(constant == 0) {
@@ -1192,16 +1468,16 @@ public class LibMatrixCUDA {
                        else {
                                throw new DMLRuntimeException("Unsupported op");
                        }
-                       
-                       // TODO: Performance optimization: Call cublasDaxpy 
if(in.getNumRows() == 1 || in.getNumColumns() == 1) 
+
+                       // TODO: Performance optimization: Call cublasDaxpy 
if(in.getNumRows() == 1 || in.getNumColumns() == 1)
                        // C = alpha* op( A ) + beta* op ( B )
                        dgeam(ec, in, in, outputName, isInputTransposed, 
isInputTransposed, alpha, 0.0);
                }
        }
-       
+
        /**
         * Utility to launch binCellScalarOp kernel
-        * 
+        *
         * @param ec
         * @param in
         * @param outputName
@@ -1209,11 +1485,11 @@ public class LibMatrixCUDA {
         * @param op
         * @throws DMLRuntimeException
         */
-       private static void launchBinCellOpKernel(ExecutionContext ec, 
MatrixObject in, String outputName, boolean isInputTransposed, 
+       private static void launchBinCellOpKernel(ExecutionContext ec, 
MatrixObject in, String outputName, boolean isInputTransposed,
                        ScalarOperator op) throws DMLRuntimeException {
                if(isInputTransposed)
                        throw new DMLRuntimeException("Transposing the input is 
not supported");
-               
+
                int rlenA = (int) in.getNumRows();
                int clenA = (int) in.getNumColumns();
                if(isInSparseFormat(in)) {
@@ -1228,13 +1504,13 @@ public class LibMatrixCUDA {
            int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
            // Invokes binCellScalarOp(double* A, double scalar, double* C, int 
rlenA, int clenA, int op, int isLeftScalar)
                kernels.launchKernel("binCellScalarOp",
-                               
ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA), 
+                               
ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA),
                                A, scalar, C, rlenA, clenA, getBinaryOp(op.fn), 
isLeftScalar);
        }
-       
+
        /**
         * Utility to launch binCellOp kernel
-        * 
+        *
         * @param ec
         * @param in1
         * @param in2
@@ -1244,7 +1520,7 @@ public class LibMatrixCUDA {
         * @param op
         * @throws DMLRuntimeException
         */
-       private static void launchBinCellOpKernel(ExecutionContext ec, 
MatrixObject in1, MatrixObject in2, 
+       private static void launchBinCellOpKernel(ExecutionContext ec, 
MatrixObject in1, MatrixObject in2,
                        String outputName, boolean isLeftTransposed, boolean 
isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
                boolean isSparse1 = isInSparseFormat(in1);
                boolean isEmpty1 = isSparseAndEmpty(in1);
@@ -1282,7 +1558,7 @@ public class LibMatrixCUDA {
                                
((JCudaObject)in2.getGPUObject()).sparseToDense();
                        }
                    Pointer B = 
((JCudaObject)in2.getGPUObject()).jcudaDenseMatrixPtr;
-                       
+
                        int rlenA = (int) in1.getNumRows();
                        int rlenB = (int) in2.getNumRows();
                        int clenA = (int) in1.getNumColumns();
@@ -1297,38 +1573,38 @@ public class LibMatrixCUDA {
                    int vecStatusB = getVectorStatus(in2);
 
                        kernels.launchKernel("binCellOp",
-                                       
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), 
+                                       
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
                                        A, B, C, maxRlen, maxClen, vecStatusA, 
vecStatusB, getBinaryOp(op.fn));
                }
        }
-       
+
        private static int getVectorStatus(MatrixObject in) {
                long rows = in.getNumRows();
                long cols = in.getNumColumns();
                if(cols == 1)
-                       return 1; 
+                       return 1;
                else if(rows == 1)
-                       return 2; 
+                       return 2;
                else
                        return 0;
        }
-       
+
        private static boolean isSparseAndEmpty(MatrixObject in1) {
                boolean isSparse1 = isInSparseFormat(in1);
                boolean isEmpty1 = isSparse1 && 
(((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0);
                return isEmpty1;
        }
-       
+
        private static void deviceCopy(ExecutionContext ec, MatrixObject src, 
String outputName, boolean isInputTransposed) throws DMLRuntimeException {
                if(!isInputTransposed)
                        deviceCopy(ec, src, outputName);
                else
                        transpose(ec, src, outputName);
        }
-       
+
        /**
         * Performs a deep device copy of input matrix
-        * 
+        *
         * @param ec
         * @param src
         * @param outputName
@@ -1345,7 +1621,7 @@ public class LibMatrixCUDA {
            Pointer destPtr = 
((JCudaObject)out.getGPUObject()).jcudaDenseMatrixPtr;
            deviceCopy(srcPtr, destPtr, (int)src.getNumRows(), 
(int)src.getNumColumns());
        }
-       
+
        private static void compareAndSet(ExecutionContext ec, MatrixObject in, 
String outputName, double compareVal,  double tolerance,
                        double ifEqualsVal, double ifLessThanVal, double 
ifGreaterThanVal) throws DMLRuntimeException {
                if(isInSparseFormat(in)) {
@@ -1364,7 +1640,7 @@ public class LibMatrixCUDA {
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
                                A, ret, rlen, clen, compareVal, tolerance, 
ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
        }
-       
+
        /**
         */
        private static void setOutputToConstant(ExecutionContext ec, double 
constant, String outputName) throws DMLRuntimeException {
@@ -1387,10 +1663,10 @@ public class LibMatrixCUDA {
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
                                A, constant, rlen, clen);
        }
-       
+
        /**
         * Performs a deep copy of input device double pointer corresponding to 
matrix
-        * 
+        *
         * @param src
         * @param dest
         * @param rlen
@@ -1402,10 +1678,10 @@ public class LibMatrixCUDA {
                                
ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
                                src, dest, rlen, clen);
        }
-       
+
        /**
         * Performs elementwise operation specified by op of two input matrices 
in1 and in2
-        * 
+        *
         * @param ec execution context
         * @param in1 input matrix 1
         * @param in2 input matrix 2
@@ -1415,7 +1691,7 @@ public class LibMatrixCUDA {
         * @param op binary operator
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void bincellOp(ExecutionContext ec, MatrixObject in1, 
MatrixObject in2, 
+       public static void bincellOp(ExecutionContext ec, MatrixObject in1, 
MatrixObject in2,
                        String outputName, boolean isLeftTransposed, boolean 
isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
                boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn 
instanceof Minus) && !isSparseAndEmpty(in1) && !isSparseAndEmpty(in2) && 
!isVector(in1) && !isVector(in2);
                if(!isCUDALibAvailable) {
@@ -1439,13 +1715,13 @@ public class LibMatrixCUDA {
                        dgeam(ec, in1, in2, outputName, isLeftTransposed, 
isRightTransposed, alpha, beta);
                }
        }
-       
+
        private static boolean isVector(MatrixObject in) {
                return in.getNumRows() == 1 || in.getNumColumns() == 1;
        }
-       
-       // op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power, 
-       // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 
10=notequal, 
+
+       // op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
+       // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
        // 11=min, 12=max, 13=and, 14=or, 15=log}
        private static int getBinaryOp(ValueFunction fn) throws 
DMLRuntimeException {
                if(fn instanceof Plus) return 0;
@@ -1463,18 +1739,18 @@ public class LibMatrixCUDA {
                else if(fn instanceof Or) return 14;
                else if(fn instanceof Multiply2) return 2;
                else if(fn instanceof Power2) return 4;
-               
+
                throw new DMLRuntimeException("The given value function is not 
supported:" + fn.getClass().getName());
        }
-       
+
        /**
         * Performs sparse and dense dgeam given two input matrices
         * C = alpha* op( A ) + beta* op ( B )
         * where op = transpose or not (specified by isLeftTransposed and 
isRightTransposed).
-        * 
+        *
         * @param ec
         * @param in1
-        * @param in2 
+        * @param in2
         * @param outputName
         * @param isLeftTransposed
         * @param isRightTransposed
@@ -1482,7 +1758,7 @@ public class LibMatrixCUDA {
         * @param beta
         * @throws DMLRuntimeException
         */
-       private static void dgeam(ExecutionContext ec, MatrixObject in1, 
MatrixObject in2, String outputName, 
+       private static void dgeam(ExecutionContext ec, MatrixObject in1, 
MatrixObject in2, String outputName,
                        boolean isLeftTransposed, boolean isRightTransposed, 
double alpha, double beta) throws DMLRuntimeException {
                Pointer alphaPtr = pointerTo(alpha);
                Pointer betaPtr = pointerTo(beta);
@@ -1497,13 +1773,13 @@ public class LibMatrixCUDA {
                int lda = isLeftTransposed ? n : m;
                int ldb = isRightTransposed ? n : m;
                int ldc = m;
-               
+
                MatrixObject out = ec.getMatrixObject(outputName);
                boolean isSparse1 = isInSparseFormat(in1);
 //             boolean isEmpty1 = isSparse1 && 
(((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0);
                boolean isSparse2 = isInSparseFormat(in2);
 //             boolean isEmpty2 = isSparse2 && 
(((JCudaObject)in2.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0);
-               
+
                // TODO: Implement sparse-dense matrix cublasDgeam kernel
                if(isSparse1 || isSparse2) {
                        // Invoke cuSparse when either are in sparse format
@@ -1516,15 +1792,15 @@ public class LibMatrixCUDA {
                                
((JCudaObject)in2.getGPUObject()).denseToSparse();
                        }
                        CSRPointer B = 
((JCudaObject)in2.getGPUObject()).jcudaSparseMatrixPtr;
-                       
+
                        ec.allocateGPUMatrixObject(outputName);
-               
+
                CSRPointer C = CSRPointer.allocateForDgeam(cusparseHandle, A, 
B, m, n);
                        
((JCudaObject)out.getGPUObject()).setSparseMatrixCudaPointer(C);
                        long sizeOfC = CSRPointer.estimateSize(C.nnz, 
out.getNumRows());
                        out.getGPUObject().setDeviceModify(sizeOfC);
-                       JCusparse.cusparseDcsrgeam(cusparseHandle, m, n, 
alphaPtr, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, betaPtr, 
-                                       B.descr, (int)B.nnz, B.val, B.rowPtr, 
B.colInd, 
+                       JCusparse.cusparseDcsrgeam(cusparseHandle, m, n, 
alphaPtr, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
+                                       B.descr, (int)B.nnz, B.val, B.rowPtr, 
B.colInd,
                                        C.descr, C.val, C.rowPtr, C.colInd);
             cudaDeviceSynchronize();
                }
@@ -1537,10 +1813,10 @@ public class LibMatrixCUDA {
                    JCublas2.cublasDgeam(cublasHandle, transa, transb, m, n, 
alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
                }
        }
-       
+
        /**
         * Transposes the input matrix using cublasDgeam
-        * 
+        *
         * @param ec execution context
         * @param in input matrix
         * @param outputName output matrix name

Reply via email to