Refactored GPU{Context,Object} to make it friendlier for parfor

- Folded JCuda{Context,Object} into GPU{Context,Object}
- Removed "deviceMemBytes"; it was redundant
- Removed all synchronized blocks in GPU{Object,Context}
- Logged the active GPUContext in log.trace messages throughout
- LibMatrixCUDA functions now expect a GPUContext argument instead of
  fetching it statically
- Restructured GPUContext to use a pool of already-initialized
  GPUContexts (see the usage sketch below)
- Call cudaSetDevice when running on a different thread
- TODO FIXME Disabled cublasDgeam for scalarMatrixArithmetic
- TODO FIXME revisit the need to always force the GPU to be used;
  memory estimation is broken
- Added the ability to restrict parfor from picking up all GPUs on the
  machine via the systemml.gpu.perProcessMax configuration property
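
A minimal sketch of the pool-based lifecycle introduced here, as wired
into DMLScript below (error handling elided; ec and rtprog are the
execution context and runtime program from the diff):

    GPUContext gCtx = GPUContextPool.getFromPool(); // null if no GPU is free
    if (gCtx == null)
        throw new DMLRuntimeException("GPU : no free GPUs available");
    gCtx.initializeThread();  // calls cudaSetDevice for the current thread
    ec.setGPUContext(gCtx);
    try {
        rtprog.execute(ec);
    } finally {
        GPUContextPool.returnToPool(ec.getGPUContext());
    }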

Closes #462


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/129f0f6b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/129f0f6b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/129f0f6b

Branch: refs/heads/master
Commit: 129f0f6b0e0f6167e4137c6d47374ab96501b888
Parents: 9ed27ad
Author: Nakul Jindal <naku...@gmail.com>
Authored: Fri Apr 21 16:22:19 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Fri Apr 21 16:22:19 2017 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |    3 +
 .../java/org/apache/sysml/api/DMLScript.java    |   27 +-
 .../api/mlcontext/MLContextConversionUtil.java  |    4 +-
 .../sysml/api/mlcontext/ScriptExecutor.java     |   25 +-
 .../java/org/apache/sysml/conf/DMLConfig.java   |   17 +-
 .../controlprogram/ParForProgramBlock.java      |   25 +
 .../controlprogram/caching/CacheableData.java   |   80 +-
 .../context/ExecutionContext.java               |   56 +-
 .../controlprogram/parfor/LocalParWorker.java   |    5 +
 .../controlprogram/parfor/ParWorker.java        |    5 +
 .../parfor/opt/OptTreeConverter.java            |    2 +-
 .../cp/FunctionCallCPInstruction.java           |   13 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |    4 +-
 .../gpu/AggregateUnaryGPUInstruction.java       |    3 +-
 .../gpu/ConvolutionGPUInstruction.java          |   25 +-
 .../instructions/gpu/GPUInstruction.java        |    3 +-
 .../instructions/gpu/MMTSJGPUInstruction.java   |    3 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |    5 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |    5 +-
 .../gpu/MatrixMatrixAxpyGPUInstruction.java     |    5 +-
 .../instructions/gpu/ReorgGPUInstruction.java   |    5 +-
 .../ScalarMatrixArithmeticGPUInstruction.java   |    4 +-
 .../instructions/gpu/context/CSRPointer.java    |  457 ++++++
 .../instructions/gpu/context/GPUContext.java    |  619 ++++++-
 .../gpu/context/GPUContextPool.java             |  158 ++
 .../instructions/gpu/context/GPUObject.java     |  957 ++++++++---
 .../instructions/gpu/context/JCudaContext.java  |  286 ----
 .../instructions/gpu/context/JCudaKernels.java  |   70 +-
 .../instructions/gpu/context/JCudaObject.java   | 1330 ---------------
 .../runtime/matrix/data/LibMatrixCUDA.java      | 1509 ++++++++++--------
 .../test/integration/AutomatedTestBase.java     |    2 +-
 31 files changed, 3035 insertions(+), 2677 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index a4c7b2f..fe4437f 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -71,4 +71,7 @@
 
    <!-- prints extra statistics information for Deep Neural Networks done in 
CP mode -->
    <systemml.stats.extraDNN>false</systemml.stats.extraDNN>
+
+   <!-- sets the maximum number of GPUs per process, -1 for all GPUs -->
+   <systemml.gpu.perProcessMax>-1</systemml.gpu.perProcessMax>
 </root>
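
For example, to cap SystemML at two GPUs per process, set the new
property in SystemML-config.xml (a usage illustration, not part of this
diff):

   <systemml.gpu.perProcessMax>2</systemml.gpu.perProcessMax>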

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index ce60d55..febbf13 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -89,6 +89,7 @@ import 
org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
 import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDHandler;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.io.IOUtilFunctions;
 import org.apache.sysml.runtime.matrix.CleanupMR;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
@@ -111,7 +112,7 @@ public class DMLScript
                HADOOP,             // execute all matrix operations in MR
                SINGLE_NODE,    // execute all matrix operations in CP
                HYBRID,         // execute matrix operations in CP or MR
-               HYBRID_SPARK,   // execute matrix operations in CP or Spark   
+               HYBRID_SPARK,   // execute matrix operations in CP or Spark
                SPARK                   // execute matrix operations in Spark
        }
 
@@ -476,10 +477,6 @@ public class DMLScript
                                return true;
                        }
 
-                       if (USE_ACCELERATOR){
-                               GPUContext.getGPUContext();
-                       }
-
                        if (dmlOptions.clean) {
                                cleanSystemMLWorkspace();
                                return true;
@@ -834,21 +831,35 @@ public class DMLScript
                // into performance problems are recorded and displayed
                GPUStatistics.DISPLAY_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_GPU_STATS);
                LibMatrixDNN.DISPLAY_STATISTICS = 
dmlconf.getBooleanValue(DMLConfig.EXTRA_DNN_STATS);
+
+               // Sets the maximum number of GPUs per process, -1 for all 
available GPUs
+               GPUContextPool.PER_PROCESS_MAX_GPUS = 
dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS);
                
                //Step 10: execute runtime program
                Statistics.startRunTimer();
                ExecutionContext ec = null;
+               GPUContext gCtx = null;
                try 
                {  
                        //run execute (w/ exception handling to ensure proper 
shutdown)
                        ec = ExecutionContextFactory.createContext(rtprog);
+                       if (DMLScript.USE_ACCELERATOR && ec != null){
+                               gCtx = GPUContextPool.getFromPool();
+                               if (gCtx == null) {
+                                       throw new DMLRuntimeException("GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
+                               }
+                               gCtx.initializeThread();
+                               ec.setGPUContext(gCtx);
+                       }
                        rtprog.execute( ec );  
                        
                }
                finally //ensure cleanup/shutdown
-               {       
-                       if(DMLScript.USE_ACCELERATOR && ec != null)
-                               ec.destroyGPUContext();
+               {
+                       if(DMLScript.USE_ACCELERATOR && ec != null && ec.getGPUContext() != null) {
+                               GPUContextPool.returnToPool(ec.getGPUContext());
+                       }
+
                        if( dmlconf.getBooleanValue(DMLConfig.CODEGEN) )
                                SpoofCompiler.cleanupCodeGenerator();
                        if(ec != null && ec instanceof SparkExecutionContext)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java 
b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index dc20108..b37f037 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -169,7 +169,7 @@ public class MLContextConversionUtil {
                        matrixObject.acquireModify(matrixBlock);
                        matrixObject.release();
                        return matrixObject;
-               } catch (CacheException e) {
+               } catch (DMLRuntimeException e) {
                        throw new MLContextException("Exception converting 
MatrixBlock to MatrixObject", e);
                }
        }
@@ -197,7 +197,7 @@ public class MLContextConversionUtil {
                        frameObject.acquireModify(frameBlock);
                        frameObject.release();
                        return frameObject;
-               } catch (CacheException e) {
+               } catch (DMLRuntimeException e) {
                        throw new MLContextException("Exception converting 
FrameBlock to FrameObject", e);
                }
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java 
b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
index ac2b92c..2368c90 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
@@ -48,6 +48,7 @@ import org.apache.sysml.runtime.controlprogram.Program;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.utils.Explain;
 import org.apache.sysml.utils.Explain.ExplainCounts;
 import org.apache.sysml.utils.Explain.ExplainType;
@@ -244,6 +245,18 @@ public class ScriptExecutor {
                if (symbolTable != null) {
                        executionContext.setVariables(symbolTable);
                }
+               try {
+                       if (gpu) {
+                               GPUContext gCtx = GPUContextPool.getFromPool();
+                               if (gCtx == null)
+                                       throw new MLContextException("GPU : no 
GPUs or no more free GPUs available");
+                               executionContext.setGPUContext(gCtx);
+                               gCtx.initializeThread();
+                       }
+               } catch (DMLRuntimeException e) {
+                       throw new MLContextException("GPU : Exception occurred during initialization", e);
+               }
+
        }
 
        /**
@@ -324,12 +337,6 @@ public class ScriptExecutor {
                script.setScriptExecutor(this);
                // Set global variable indicating the script type
                DMLScript.SCRIPT_TYPE = script.getScriptType();
-               try {
-                       if (gpu)
-                               GPUContext.getGPUContext();
-               } catch (DMLRuntimeException e) {
-                       throw new MLContextException("Exception occurred during 
initialization of GPU", e);
-               }
        }
 
        /**
@@ -338,8 +345,10 @@ public class ScriptExecutor {
        protected void cleanupAfterExecution() {
                restoreInputsInSymbolTable();
                try {
-                       if (gpu)
-                               executionContext.destroyGPUContext();
+                       if (gpu) {
+                               GPUContext gCtx = 
executionContext.getGPUContext();
+                               GPUContextPool.returnToPool(gCtx);
+                       }
                } catch (DMLRuntimeException e) {
                        throw new MLContextException("Exception occurred during 
cleanup of GPU related resources", e);
                }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index a42b1ca..e974a71 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -39,17 +39,17 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.parser.ParseException;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.LocalFileUtils;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
 
 
-public class DMLConfig 
+public class DMLConfig
 {
 
        public static final String DEFAULT_SYSTEMML_CONFIG_FILEPATH = 
"./SystemML-config.xml";
@@ -76,12 +76,11 @@ public class DMLConfig
        public static final String CODEGEN_LITERALS     = "codegen.literals"; 
//1..heuristic, 2..always
        public static final String EXTRA_GPU_STATS                      = 
"systemml.stats.extraGPU"; //boolean
        public static final String EXTRA_DNN_STATS                      = 
"systemml.stats.extraDNN"; //boolean
+       public static final String MAX_GPUS_PER_PROCESS = "systemml.gpu.perProcessMax"; // int, maximum number of GPUs to use, -1 for all
 
-       // Fraction of available memory to use. The available memory is 
computer when the JCudaContext is created
+       // Fraction of available memory to use. The available memory is computed when the GPUContext is created
        // to handle the tradeoff on calling cudaMemGetInfo too often.
        public static final String GPU_MEMORY_UTILIZATION_FACTOR    = 
"gpu.memory.util.factor";
-       // Invoke cudaMemGetInfo to get available memory information. Useful if 
GPU is shared among multiple application.
-       public static final String REFRESH_AVAILABLE_MEMORY_EVERY_TIME    = 
"gpu.memory.refresh";
 
        // supported prefixes for custom map/reduce configurations
        public static final String PREFIX_MAPRED = "mapred";
@@ -121,7 +120,7 @@ public class DMLConfig
                _defaultVals.put(EXTRA_DNN_STATS,       "false" );
 
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
-               _defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME,      
"true" );
+               _defaultVals.put(MAX_GPUS_PER_PROCESS,  "-1");
        }
        
        public DMLConfig()

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index d6186f3..f3de422 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -102,6 +102,8 @@ import 
org.apache.sysml.runtime.instructions.cp.DoubleObject;
 import org.apache.sysml.runtime.instructions.cp.IntObject;
 import org.apache.sysml.runtime.instructions.cp.StringObject;
 import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.io.IOUtilFunctions;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
@@ -629,6 +631,11 @@ public class ParForProgramBlock extends ForProgramBlock
                        switch( _execMode )
                        {
                                case LOCAL: //create parworkers as local threads
+                                       if (DMLScript.USE_ACCELERATOR) {
+                                               
GPUContextPool.returnToPool(ec.getGPUContext());
+                                               ec.setGPUContext(null);
+                                               
setDegreeOfParallelism(GPUContextPool.getDeviceCount());
+                                       }
                                        executeLocalParFor(ec, iterVar, from, 
to, incr);
                                        break;
                                        
@@ -718,6 +725,7 @@ public class ParForProgramBlock extends ForProgramBlock
        private void executeLocalParFor( ExecutionContext ec, IntObject 
itervar, IntObject from, IntObject to, IntObject incr ) 
                throws DMLRuntimeException, InterruptedException
        {
+               LOG.trace("Local Par For (multi-threaded) with degree of 
parallelism : " + _numThreads);
                /* Step 1) init parallel workers, task queue and threads
                 *         start threads (from now on waiting for tasks)
                 * Step 2) create tasks
@@ -820,6 +828,17 @@ public class ParForProgramBlock extends ForProgramBlock
                                                
_prog.removeFunctionProgramBlock(parts[0], parts[1]);
                                        }
                        }
+
+                       // Frees up the GPUContexts used in the threaded Parfor 
and sets
+                       // the main thread to use the GPUContext
+                       if (DMLScript.USE_ACCELERATOR) {
+                               for (int i = 0; i < _numThreads; i++) {
+                                       GPUContext gCtx = 
workers[i].getExecutionContext().getGPUContext();
+                                       GPUContextPool.returnToPool(gCtx);
+                               }
+                               ec.setGPUContext(GPUContextPool.getFromPool());
+                               ec.getGPUContext().initializeThread();
+                       }
                }
                finally 
                {
@@ -1399,6 +1418,12 @@ public class ParForProgramBlock extends ForProgramBlock
                        
                        //deep copy execution context (including prepare parfor 
update-in-place)
                        ExecutionContext cpEc = 
ProgramConverter.createDeepCopyExecutionContext(ec);
+
+                       // If GPU mode is enabled, gets a GPUContext from the 
pool of GPUContexts
+                       // and sets it in the ExecutionContext
+                       if (DMLScript.USE_ACCELERATOR){
+                               
cpEc.setGPUContext(GPUContextPool.getFromPool());
+                       }
                        
                        //prepare basic update-in-place variables (vars dropped 
on result merge)
                        prepareUpdateInPlaceVariables(cpEc, pwID);
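
Taken together, the parfor changes above implement the following GPU
handoff (a consolidated sketch of the lines added in this file; worker
setup and teardown elided):

    // main thread, before forking local workers
    GPUContextPool.returnToPool(ec.getGPUContext());
    ec.setGPUContext(null);
    setDegreeOfParallelism(GPUContextPool.getDeviceCount());

    // per-worker ExecutionContext (deep copy): one GPUContext each
    cpEc.setGPUContext(GPUContextPool.getFromPool());
    // (the worker thread later calls _ec.getGPUContext().initializeThread())

    // main thread, after all workers have finished
    for (int i = 0; i < _numThreads; i++)
        GPUContextPool.returnToPool(workers[i].getExecutionContext().getGPUContext());
    ec.setGPUContext(GPUContextPool.getFromPool());
    ec.getGPUContext().initializeThread();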

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 054b333..904eb87 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.controlprogram.caching;
 import java.io.File;
 import java.io.IOException;
 import java.lang.ref.SoftReference;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.commons.lang.mutable.MutableBoolean;
@@ -34,9 +36,10 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysml.runtime.instructions.cp.Data;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject;
 import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
@@ -50,6 +53,7 @@ import org.apache.sysml.runtime.matrix.data.OutputInfo;
 import org.apache.sysml.runtime.util.LocalFileUtils;
 import org.apache.sysml.runtime.util.MapReduceTool;
 
+
 /**
  * Each object of this class is a cache envelope for some large piece of data
  * called "cache block". For example, the body of a matrix can be the cache 
block.  
@@ -187,7 +191,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
        //for lazily evaluated RDDs, and (2) as abstraction for environments 
that do not necessarily have spark libraries available
        private RDDObject _rddHandle = null; //RDD handle
        private BroadcastObject<T> _bcHandle = null; //Broadcast handle
-       protected GPUObject _gpuHandle = null;
+       protected HashMap<GPUContext, GPUObject> _gpuObjects = null; //Per 
GPUContext object allocated on GPU
        
        /**
         * Basic constructor for any cacheable data.
@@ -200,6 +204,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                _uniqueID = (int)_seq.getNextID();              
                _cacheStatus = CacheStatus.EMPTY;
                _numReadThreads = 0;
+               _gpuObjects = new HashMap<>();
        }
        
        /**
@@ -213,7 +218,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                _hdfsFileName = that._hdfsFileName;
                _hdfsFileExists = that._hdfsFileExists; 
                _varName = that._varName;
-               _gpuHandle = that._gpuHandle;
+               _gpuObjects = that._gpuObjects;
        }
 
        
@@ -341,15 +346,16 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                        bc.setBackReference(this);
        }
 
-       public GPUObject getGPUObject() {
-               return _gpuHandle;
+       public synchronized GPUObject getGPUObject(GPUContext gCtx) {
+               return _gpuObjects.get(gCtx);
        }
-       
-       public void setGPUObject(GPUObject handle) {
-               _gpuHandle = handle;
+
+       public synchronized void setGPUObject(GPUContext gCtx, GPUObject gObj) 
throws DMLRuntimeException {
+               GPUObject old = _gpuObjects.put(gCtx, gObj);
+               if (old != null)
+                               throw new DMLRuntimeException("GPU : 
Inconsistent internal state - this CacheableData already has a GPUObject 
assigned to the current GPUContext (" + gCtx + ")");
        }
        
-       
        // *********************************************
        // ***                                       ***
        // ***    HIGH-LEVEL METHODS THAT SPECIFY    ***
@@ -384,12 +390,20 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                if( _data == null )
                        getCache();
                        
-               //call acquireHostRead if gpuHandle is set as well as is 
allocated  
-               if( _gpuHandle != null && _gpuHandle.isAllocated()) {
-                       _gpuHandle.acquireHostRead();
-                       if( _data == null )
-                               getCache();
-               }
+               //call acquireHostRead if gpuHandle is set as well as is 
allocated
+        boolean copiedFromGPU = false;
+        for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
+            GPUObject gObj = kv.getValue();
+            if (gObj != null && copiedFromGPU && gObj.isDirty()) {
+                LOG.error("Inconsistent internal state - A copy of this 
CacheableData was dirty on more than 1 GPU");
+                throw new CacheException("Internal Error : Inconsistent 
internal state, A copy of this CacheableData was dirty on more than 1 GPU");
+            } else if (gObj != null){
+                copiedFromGPU = gObj.acquireHostRead();
+                if( _data == null )
+                    getCache();
+            }
+        }
+
                //read data from HDFS/RDD if required
                //(probe data for cache_nowrite / jvm_reuse)  
                if( isEmpty(true) && _data==null ) 
@@ -517,10 +531,10 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
         * 
         * @param newData new data
         * @return cacheable data
-        * @throws CacheException if CacheException occurs
+        * @throws DMLRuntimeException if error occurs
         */
        public synchronized T acquireModify(T newData)
-               throws CacheException
+               throws DMLRuntimeException
        {
                if( LOG.isTraceEnabled() )
                        LOG.trace("Acquire modify newdata "+getVarName());
@@ -631,10 +645,10 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
         * In-Status:  EMPTY, EVICTABLE, EVICTED;
         * Out-Status: EMPTY.
         * 
-        * @throws CacheException if CacheException occurs
+        * @throws DMLRuntimeException if error occurs
         */
        public synchronized void clearData() 
-               throws CacheException
+               throws DMLRuntimeException
        {
                if( LOG.isTraceEnabled() )
                        LOG.trace("Clear data "+getVarName());
@@ -661,9 +675,14 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                        _rddHandle.setBackReference(null);
                if( _bcHandle != null )
                        _bcHandle.setBackReference(null);
-               if( _gpuHandle != null )
-                       _gpuHandle.clearData();
-               
+               if( _gpuObjects != null ) {
+                   for (GPUObject gObj : _gpuObjects.values()){
+                       if (gObj != null) {
+                    gObj.clearData();
+                }
+            }
+        }
+
                // change object state EMPTY
                setDirty(false);
                setEmpty();
@@ -731,10 +750,19 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
 
                LOG.trace("Exporting " + this.getDebugName() + " to " + fName + 
" in format " + outputFormat);
                
-               //TODO remove 
-               if( getGPUObject() != null && getGPUObject().isAllocated() ) {
-                       getGPUObject().acquireHostRead();
-               }
+               //TODO remove
+        boolean copiedFromGPU = false;
+        for (Map.Entry<GPUContext, GPUObject> kv : _gpuObjects.entrySet()) {
+            GPUObject gObj = kv.getValue();
+            if (gObj != null && copiedFromGPU && gObj.isDirty()) {
+                LOG.error("Inconsistent internal state - A copy of this 
CacheableData was dirty on more than 1 GPU");
+                throw new CacheException("Internal Error : Inconsistent 
internal state, A copy of this CacheableData was dirty on more than 1 GPU");
+            } else if (gObj != null){
+                copiedFromGPU = gObj.acquireHostRead();
+                if( _data == null )
+                    getCache();
+            }
+        }
                                
                boolean pWrite = false; // !fName.equals(_hdfsFileName); 
//persistent write flag
                if ( fName.equals(_hdfsFileName) ) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index b6e1830..35b4cd1 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.controlprogram.context;
 import java.util.ArrayList;
 import java.util.HashMap;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.debug.DMLFrame;
 import org.apache.sysml.debug.DMLProgramCounter;
@@ -52,9 +54,9 @@ import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.runtime.util.MapReduceTool;
 
 
-public class ExecutionContext 
-{
-       
+public class ExecutionContext {
+       protected static final Log LOG = 
LogFactory.getLog(ExecutionContext.class.getName());
+
        //program reference (e.g., function repository)
        protected Program _prog = null;
        
@@ -64,6 +66,8 @@ public class ExecutionContext
        //debugging (optional)
        protected DebugState _dbState = null;
 
+    protected GPUContext _gpuContext = null;
+
        protected ExecutionContext()
        {
                //protected constructor to force use of ExecutionContextFactory
@@ -94,13 +98,15 @@ public class ExecutionContext
        public void setVariables(LocalVariableMap vars) {
                _variables = vars;
        }
-       
-       public void destroyGPUContext() throws DMLRuntimeException {
-               if(GPUContext.isGPUContextCreated)
-                       GPUContext.getGPUContext().destroy();
-       }
-       
-       
+
+    public GPUContext getGPUContext() {
+        return _gpuContext;
+    }
+
+    public void setGPUContext(GPUContext _gpuContext) {
+        this._gpuContext = _gpuContext;
+    }
+
        /* -------------------------------------------------------
         * Methods to handle variables and associated data objects
         * -------------------------------------------------------
@@ -232,7 +238,7 @@ public class ExecutionContext
                throws DMLRuntimeException 
        {       
                MatrixObject mo = allocateGPUMatrixObject(varName);
-               boolean allocated = 
mo.getGPUObject().acquireDeviceModifyDense();
+               boolean allocated = 
mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense();
                mo.getMatrixCharacteristics().setNonZeros(-1);
                return new Pair<MatrixObject, Boolean>(mo, allocated);
        }
@@ -251,7 +257,7 @@ public class ExecutionContext
     {
         MatrixObject mo = allocateGPUMatrixObject(varName);
         mo.getMatrixCharacteristics().setNonZeros(nnz);
-                               boolean allocated = 
mo.getGPUObject().acquireDeviceModifySparse();
+                               boolean allocated = 
mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse();
         return new Pair<MatrixObject, Boolean>(mo, allocated);
     } 
 
@@ -263,8 +269,9 @@ public class ExecutionContext
         */
        public MatrixObject allocateGPUMatrixObject(String varName) throws 
DMLRuntimeException {
                MatrixObject mo = getMatrixObject(varName);
-               if( mo.getGPUObject() == null ) {
-                       mo.setGPUObject(GPUContext.createGPUObject(mo));
+               if( mo.getGPUObject(getGPUContext()) == null ) {
+                       GPUObject newGObj = getGPUContext().createGPUObject(mo);
+                       mo.setGPUObject(getGPUContext(), newGObj);
                }
                return mo;
        }
@@ -272,20 +279,23 @@ public class ExecutionContext
        public Pair<MatrixObject, Boolean> 
getMatrixInputForGPUInstruction(String varName)
                        throws DMLRuntimeException 
        {
+               GPUContext gCtx = getGPUContext();
                boolean copied = false;
                MatrixObject mo = getMatrixObject(varName);
                if(mo == null) {
                        throw new DMLRuntimeException("No matrix object 
available for variable:" + varName);
                }
-               if( mo.getGPUObject() == null ) {
-                       mo.setGPUObject(GPUContext.createGPUObject(mo));
-               }
+
                boolean acquired = false;
-               if( !mo.getGPUObject().isAllocated() ) {
+               if( mo.getGPUObject(gCtx) == null ) {
+                       GPUObject newGObj = gCtx.createGPUObject(mo);
+                       mo.setGPUObject(gCtx, newGObj);
+               } else if( !mo.getGPUObject(gCtx).isInputAllocated() ) {
                        mo.acquireRead();
                        acquired = true;
                }
-               copied = mo.getGPUObject().acquireDeviceRead();
+
+               copied = mo.getGPUObject(gCtx).acquireDeviceRead();
                if(acquired) {
                        mo.release();
                }
@@ -309,7 +319,7 @@ public class ExecutionContext
                throws DMLRuntimeException 
        {
                MatrixObject mo = getMatrixObject(varName);
-               mo.getGPUObject().releaseInput();
+               mo.getGPUObject(getGPUContext()).releaseInput();
        }
        
        /**
@@ -361,10 +371,10 @@ public class ExecutionContext
        
        public void releaseMatrixOutputForGPUInstruction(String varName) throws 
DMLRuntimeException {
                MatrixObject mo = getMatrixObject(varName);
-               if(mo.getGPUObject() == null || 
!mo.getGPUObject().isAllocated()) {
+               if(mo.getGPUObject(getGPUContext()) == null || 
!mo.getGPUObject(getGPUContext()).isAllocated()) {
                        throw new DMLRuntimeException("No output is allocated 
on GPU");
                }
-               mo.getGPUObject().releaseOutput();
+               mo.getGPUObject(getGPUContext()).releaseOutput();
        }
 
        public void setMatrixOutput(String varName, MatrixBlock outputData) 
@@ -486,7 +496,7 @@ public class ExecutionContext
                return varlist;
        }
 
-       public void cleanupMatrixObject(MatrixObject mo) 
+       public void cleanupMatrixObject(MatrixObject mo)
                throws DMLRuntimeException 
        {
                try

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index e146821..c4684ec 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.controlprogram.parfor;
 
 import java.util.Collection;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.CompilerConfig;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -79,6 +80,10 @@ public class LocalParWorker extends ParWorker implements 
Runnable
                        SparkExecutionContext sec = (SparkExecutionContext)_ec;
                        sec.setThreadLocalSchedulerPool("parforPool"+_workerID);
                }
+
+               // Initialize this GPUContext to this thread
+               if (DMLScript.USE_ACCELERATOR)
+                       _ec.getGPUContext().initializeThread();
                
                //setup compiler config for worker thread
                ConfigurationManager.setLocalConfig(_cconf);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
index e0b30f4..05872b1 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/ParWorker.java
@@ -49,6 +49,11 @@ public abstract class ParWorker
        protected long                      _workerID    = -1;
        
        protected ArrayList<ProgramBlock>   _childBlocks = null;
+
+       public ExecutionContext getExecutionContext() {
+               return _ec;
+       }
+
        protected ExecutionContext          _ec          = null;
        protected ArrayList<String>         _resultVars  = null;
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
index 5c693d5..1d5a195 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptTreeConverter.java
@@ -516,7 +516,7 @@ public class OptTreeConverter
                        LopProperties.ExecType et = (hop.getExecType()!=null) ? 
                                           hop.getExecType() : 
LopProperties.ExecType.CP;
                        switch( et ) {
-                               case CP:
+                               case CP:case GPU:
                                        node.setExecType(ExecType.CP); break;
                                case SPARK:
                                        node.setExecType(ExecType.SPARK); break;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index c65553b..0958aeb 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -172,8 +172,12 @@ public class FunctionCallCPInstruction extends 
CPInstruction
                // Create a symbol table under a new execution context for the 
function invocation,
                // and copy the function arguments into the created table. 
                ExecutionContext fn_ec = 
ExecutionContextFactory.createContext(false, ec.getProgram());
+               if (DMLScript.USE_ACCELERATOR) {
+                       fn_ec.setGPUContext(ec.getGPUContext());
+                       ec.setGPUContext(null);
+                       fn_ec.getGPUContext().initializeThread();
+               }
                fn_ec.setVariables(functionVariables);
-               
                // execute the function block
                try {
                        fpb._functionName = this._functionName;
@@ -187,7 +191,6 @@ public class FunctionCallCPInstruction extends CPInstruction
                        String fname = 
DMLProgram.constructFunctionKey(_namespace, _functionName);
                        throw new DMLRuntimeException("error executing function 
" + fname, e);
                }
-               
                LocalVariableMap retVars = fn_ec.getVariables();  
                
                // cleanup all returned variables w/o binding 
@@ -206,6 +209,12 @@ public class FunctionCallCPInstruction extends 
CPInstruction
                
                // Unpin the pinned variables
                ec.unpinVariables(_boundInputParamNames, pinStatus);
+
+               if (DMLScript.USE_ACCELERATOR) {
+                       ec.setGPUContext(fn_ec.getGPUContext());
+                       fn_ec.setGPUContext(null);
+                       ec.getGPUContext().initializeThread();
+               }
                
                // add the updated binding for each return variable to the 
variables in original symbol table
                for (int i=0; i< fpb.getOutputParams().size(); i++){
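
The two hunks above hand the caller's GPUContext to the function's
ExecutionContext for the duration of the call and hand it back when the
call returns; condensed into one sketch (only lines added in this file):

    // before executing the function body
    fn_ec.setGPUContext(ec.getGPUContext());
    ec.setGPUContext(null);                   // caller relinquishes the context
    fn_ec.getGPUContext().initializeThread(); // rebind the CUDA device

    // after the function body has executed
    ec.setGPUContext(fn_ec.getGPUContext());
    fn_ec.setGPUContext(null);
    ec.getGPUContext().initializeThread();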

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 55c8f8d..2531c17 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends 
GPUInstruction
                int clen = (int) (_isRightTransposed ? m2.getNumRows() : 
m2.getNumColumns());
 
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.matmult(ec, getExtendedOpcode(), m1, m2, 
_output.getName(), _isLeftTransposed, _isRightTransposed);
+               LibMatrixCUDA.matmult(ec, ec.getGPUContext(), 
getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, 
_isRightTransposed);
         
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends 
GPUInstruction
        @SuppressWarnings("unused")
        private boolean isSparse(ExecutionContext ec, String var) throws 
DMLRuntimeException {
                MatrixObject mo = ec.getMatrixObject(var);
-               return LibMatrixCUDA.isInSparseFormat(mo);
+               return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo);
        }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
index 45db44c..bfe30f9 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.ReduceCol;
 import org.apache.sysml.runtime.functionobjects.ReduceRow;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysml.runtime.matrix.operators.Operator;
@@ -93,7 +94,7 @@ public class AggregateUnaryGPUInstruction extends 
GPUInstruction {
       ec.setMetaData(_output.getName(), rlen, 1);
     }
 
-    LibMatrixCUDA.unaryAggregate(ec, getExtendedOpcode(), in1, 
_output.getName(), (AggregateUnaryOperator)_optr);
+    LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(), 
in1, _output.getName(), (AggregateUnaryOperator)_optr);
 
     //release inputs/outputs
     ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index a02115d..e806f3b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -18,19 +18,20 @@
  */
 package org.apache.sysml.runtime.instructions.gpu;
 
+import java.util.ArrayList;
+
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.GPUStatistics;
 
-import java.util.ArrayList;
-
 public class ConvolutionGPUInstruction extends GPUInstruction 
 {
        private CPOperand _input1; 
@@ -186,9 +187,9 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                ec.setMetaData(_output.getName(), input.getNumRows(), 
input.getNumColumns());
                MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName());
                if(instOpcode.equalsIgnoreCase("bias_add"))
-                       LibMatrixCUDA.biasAdd(getExtendedOpcode(), input, bias, 
out);
+                       LibMatrixCUDA.biasAdd(ec.getGPUContext(), 
getExtendedOpcode(), input, bias, out);
                else if(instOpcode.equalsIgnoreCase("bias_multiply"))
-                       LibMatrixCUDA.biasMultiply(getExtendedOpcode(), input, 
bias, out);
+                       LibMatrixCUDA.biasMultiply(ec.getGPUContext(), 
getExtendedOpcode(), input, bias, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -202,7 +203,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                
                MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, 
_output.getName());
                ec.setMetaData(_output.getName(), input.getNumRows(), 
input.getNumColumns());
-               LibMatrixCUDA.reluBackward(getExtendedOpcode(), input, dout, 
out);
+               LibMatrixCUDA.reluBackward(ec.getGPUContext(), 
getExtendedOpcode(), input, dout, out);
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -253,7 +254,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, K * P * Q);
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2d(getExtendedOpcode(), image, 
filter, out, N, C, H, W,
+                       LibMatrixCUDA.conv2d(ec.getGPUContext(), 
getExtendedOpcode(), image, filter, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -268,7 +269,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, K * P * Q);
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2dBiasAdd(getExtendedOpcode(), image, 
bias, filter, out, N, C, H, W,
+                       LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), 
getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
                                                K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) 
{
@@ -283,7 +284,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        
                        ec.setMetaData(_output.getName(), K, C * R * S);
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2dBackwardFilter(getExtendedOpcode(), 
image, dout, out, N, C, H, W,
+                       LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                        // TODO: For now always copy the device data to host
                        // ec.gpuCtx.copyDeviceToHost(outputBlock);
@@ -300,7 +301,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, C * H * W);
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.conv2dBackwardData(getExtendedOpcode(), 
filter, dout, out, N, C, H, W,
+                       LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), 
getExtendedOpcode(), filter, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling") || 
instOpcode.equalsIgnoreCase("relu_maxpooling")) {
@@ -313,10 +314,10 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        ec.setMetaData(_output.getName(), N, C * P * Q);
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
                        if(instOpcode.equalsIgnoreCase("maxpooling"))
-                               LibMatrixCUDA.maxpooling(getExtendedOpcode(), 
image, out, N, C, H, W,
+                               LibMatrixCUDA.maxpooling(ec.getGPUContext(), 
getExtendedOpcode(), image, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                        else
-                               
LibMatrixCUDA.reluMaxpooling(getExtendedOpcode(), image, out, N, C, H, W,
+                               
LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, 
out, N, C, H, W,
                                                K, R, S, pad_h, pad_w, 
stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
@@ -331,7 +332,7 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        
                        ec.setMetaData(_output.getName(), N, C * H * W);
                        MatrixObject out = 
getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-                       LibMatrixCUDA.maxpoolingBackward(getExtendedOpcode(), 
image, dout, out, N, C, H, W,
+                       LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), 
getExtendedOpcode(), image, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }
                else {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ab275e7..0b69b5e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -19,7 +19,6 @@
 
 package org.apache.sysml.runtime.instructions.gpu;
 
-import jcuda.runtime.JCuda;
 import org.apache.sysml.lops.runtime.RunMRJobs;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -149,7 +148,7 @@ public abstract class GPUInstruction extends Instruction
        public void postprocessInstruction(ExecutionContext ec)
                                        throws DMLRuntimeException
        {
-               JCuda.cudaDeviceSynchronize();
+               //JCuda.cudaDeviceSynchronize();
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index 0f2542a..af97141 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -32,6 +32,7 @@ import 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
@@ -107,7 +108,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
 
                 //execute operations 
                 ec.setMetaData(_output.getName(), rlen, clen);
-                LibMatrixCUDA.matmultTSMM(ec, getExtendedOpcode(), mat, 
_output.getName(), isLeftTransposed);
+                LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(), 
getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
                 
                 ec.releaseMatrixInputForGPUInstruction(_input.getName());
                 ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 2766909..3fc8a98 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -23,6 +23,7 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
@@ -43,10 +44,10 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
                ec.setMetaData(_output.getName(), mat.getNumRows(), mat.getNumColumns());
 
                if(opcode.equals("sel+")) {
-                       LibMatrixCUDA.relu(ec, getExtendedOpcode(), mat, _output.getName());
+                       LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
 
                } else if (opcode.equals("exp")) {
-                       LibMatrixCUDA.exp(ec, getExtendedOpcode(), mat, _output.getName());
+                       LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
                }
                else {
                        throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index 2da1aa6..a1520a9 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -23,6 +23,7 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.Operator;
@@ -71,10 +72,10 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
                ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
                
                BinaryOperator bop = (BinaryOperator) _optr;
-               LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+               LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
                
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
-        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
index 2c833e4..28254c6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
@@ -26,6 +26,7 @@ import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.utils.GPUStatistics;
@@ -103,11 +104,11 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi
                                        " and input2:" + rlen2 + " X " + clen2);
                }
                
-               LibMatrixCUDA.axpy(ec, getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
+               LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
                
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
                ec.releaseMatrixInputForGPUInstruction(_input2.getName());
-        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
        
        private boolean isValidMMOperation(long rlen1, long rlen2, long clen1, long clen2) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
index 54ba32e..424a2c5 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -25,6 +25,7 @@ import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
@@ -81,10 +82,10 @@ public class ReorgGPUInstruction extends GPUInstruction
                
                //execute operation
                ec.setMetaData(_output.getName(), rlen, clen);
-               LibMatrixCUDA.transpose(ec, getExtendedOpcode(), mat, _output.getName());
+               LibMatrixCUDA.transpose(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
                
                //release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input.getName());
-        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index 44cc6e2..64cb6c4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -60,9 +60,9 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
                ScalarOperator sc_op = (ScalarOperator) _optr;
                sc_op.setConstant(constant.getDoubleValue());
                
-               LibMatrixCUDA.matrixScalarArithmetic(ec, getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
+               LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
                
                ec.releaseMatrixInputForGPUInstruction(mat.getName());
-        ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+               ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/129f0f6b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
new file mode 100644
index 0000000..5e202a9
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -0,0 +1,457 @@
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
+import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
+import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
+import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
+import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
+import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
+import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
+import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
+import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.jcusparse.cusparsePointerMode;
+
+/**
+ * Compressed Sparse Row (CSR) format for CUDA.
+ * Generalized matrix multiply, among other operations, is implemented for the CSR format in the cuSparse library.
+ */
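+// For intuition, the CSR layout this class manages: the 2x3 matrix
+//     [10  0 20]
+//     [ 0 30  0]
+// is stored as val = [10, 20, 30], rowPtr = [0, 2, 3], colInd = [0, 2, 1].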
+public class CSRPointer {
+
+  private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
+
+  /** {@link GPUContext} instance that tracks the GPU on which this pointer's work is done */
+  private final GPUContext gpuContext;
+
+  private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
+
+  public static cusparseMatDescr matrixDescriptor;
+
+  /** Number of non zeroes */
+  public long nnz;
+
+  /** double array of non zero values */
+  public Pointer val;
+
+  /** integer array of start of all rows and end of last row + 1 */
+  public Pointer rowPtr;
+
+  /** integer array of nnz values' column indices */
+  public Pointer colInd;
+
+  /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */
+  public cusparseMatDescr descr;
+
+  /**
+   * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
+   * @param gCtx   a valid {@link GPUContext}
+   */
+  private CSRPointer(GPUContext gCtx) {
+    gpuContext = gCtx;
+    val = new Pointer();
+    rowPtr = new Pointer();
+    colInd = new Pointer();
+    allocateMatDescrPointer();
+  }
+
+  private Pointer allocate(String instName, long size) throws DMLRuntimeException {
+    return getGPUContext().allocate(instName, size);
+  }
+
+  private Pointer allocate(long size) throws DMLRuntimeException {
+    return getGPUContext().allocate(size);
+  }
+
+  private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+    getGPUContext().cudaFreeHelper(toFree);
+  }
+
+  private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
+    getGPUContext().cudaFreeHelper(toFree, eager);
+  }
+
+  private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
+    getGPUContext().cudaFreeHelper(instName, toFree, eager);
+  }
+
+  private static long getDoubleSizeOf(long numElems) {
+    return numElems * ((long)jcuda.Sizeof.DOUBLE);
+  }
+
+  private static long getIntSizeOf(long numElems) {
+    return numElems * ((long)jcuda.Sizeof.INT);
+  }
+
+  private GPUContext getGPUContext() throws DMLRuntimeException {
+    return gpuContext;
+  }
+
+  public static int toIntExact(long l) throws DMLRuntimeException {
+    if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+      throw new DMLRuntimeException("Cannot be cast to int:" + l);
+    }
+    return (int) l;
+  }
+
+  /**
+   * @return Singleton default matrix descriptor object
+   * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
+   */
+  public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
+    if (matrixDescriptor == null) {
+      // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
+      matrixDescriptor = new cusparseMatDescr();
+      cusparseCreateMatDescr(matrixDescriptor);
+      cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
+      cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
+    }
+    return matrixDescriptor;
+  }
+
+  /**
+   * Estimate the size of a CSR matrix in GPU memory
+   * Size of pointers is not needed and is not added in
+   *
+   * @param nnz2 number of non zeroes
+   * @param rows number of rows
+   * @return size estimate
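+   *         (for example, nnz2=100 and rows=10 gives 100*8 + 11*4 + 100*4 + 16 = 1260 bytes)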
+   */
+  public static long estimateSize(long nnz2, long rows) {
+    long sizeofValArray = getDoubleSizeOf(nnz2);
+    long sizeofRowPtrArray = getIntSizeOf(rows + 1);
+    long sizeofColIndArray = getIntSizeOf(nnz2);
+    long sizeofDescr = getIntSizeOf(4);
+    // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
+    // typedef struct {
+    //     cusparseMatrixType_t MatrixType;
+    //     cusparseFillMode_t FillMode;
+    //     cusparseDiagType_t DiagType;
+    //     cusparseIndexBase_t IndexBase;
+    // } cusparseMatDescr_t;
+    long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
+    return tot;
+  }
+
+  /**
+   * Static method to copy a CSR sparse matrix from Host to Device
+   *
+   * @param dest   [input] destination location (on GPU)
+   * @param rows   number of rows
+   * @param nnz    number of non-zeroes
+   * @param rowPtr integer array of row pointers
+   * @param colInd integer array of column indices
+   * @param values double array of non zero values
+   */
+  public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+    CSRPointer r = dest;
+    long t0 = 0;
+    if (DMLScript.STATISTICS) t0 = System.nanoTime();
+    r.nnz = nnz;
+    cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
+    cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
+    cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3);
+  }
+
+  /**
+   * Static method to copy a CSR sparse matrix from Device to host
+   *
+   * @param src    [input] source location (on GPU)
+   * @param rows   [input] number of rows
+   * @param nnz    [input] number of non-zeroes
+   * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
+   * @param colInd [output] pre-allocated integer array of column indices of size nnz
+   * @param values [output] pre-allocated double array of values of size nnz
+   */
+  public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+    CSRPointer r = src;
+    long t0 = 0;
+    if (DMLScript.STATISTICS) t0 = System.nanoTime();
+    cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
+    cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
+    cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+    if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3);
+  }
+
+  /**
+   * Estimates the number of non-zero elements from the result of a sparse cusparseDgeam operation
+   * C = a op(A) + b op(B)
+   * @param gCtx   a valid {@link GPUContext}
+   * @param handle a valid {@link cusparseHandle}
+   * @param A      Sparse Matrix A on GPU
+   * @param B      Sparse Matrix B on GPU
+   * @param m      Rows in A
+   * @param n      Columns in B
+   * @return CSR (compressed sparse row) pointer
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n)
+          throws DMLRuntimeException {
+    if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+      throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+    }
+    CSRPointer C = new CSRPointer(gCtx);
+    step1AllocateRowPointers(gCtx, handle, C, m);
+    step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
+    step3AllocateValNInd(gCtx, handle, C);
+    return C;
+  }
+
+  /**
+   * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
+   * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
+   *
+   * @param gCtx   a valid {@link GPUContext}
+   * @param handle a valid {@link cusparseHandle}
+   * @param A      Sparse Matrix A on GPU
+   * @param transA 'T' if A is to be transposed, 'N' otherwise
+   * @param B      Sparse Matrix B on GPU
+   * @param transB 'T' if B is to be transposed, 'N' otherwise
+   * @param m      Rows in A
+   * @param n      Columns in B
+   * @param k      Columns in A / Rows in B
+   * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k)
+          throws DMLRuntimeException {
+    // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
+    // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+    CSRPointer C = new CSRPointer(gCtx);
+    step1AllocateRowPointers(gCtx, handle, C, m);
+    step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
+    step3AllocateValNInd(gCtx, handle, C);
+    return C;
+  }
+
+  /**
+   * Check for ultra sparsity
+   *
+   * @param rows number of rows
+   * @param cols number of columns
+   * @return true if ultra sparse
+   */
+  public boolean isUltraSparse(int rows, int cols) {
+    double sp = ((double) nnz / rows / cols);
+    return sp < ULTRA_SPARSITY_TURN_POINT;
+  }
+
+// ==============================================================================================
+
+// The following methods estimate the memory needed for sparse matrices that are
+// results of operations on other sparse matrices using the cuSparse Library.
+// The operation is C = op(A) binaryOperation op(B); C is the output and A & B are the inputs.
+// op = whether to transpose or not
+// binaryOperation = for cuSparse, +, - and * (matmul) are supported
+
+// From the cuSPARSE Manual:
+// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
+// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
+// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
+// and the total number of nonzero elements. In the second step, the user gathers nnzC
+// (number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
+// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC and csrColIndC of
+// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
+// to complete matrix C.
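+//
+// In this class that two-step protocol maps onto the helper methods below:
+//   step1AllocateRowPointers - allocate csrRowPtrC (m+1 entries)
+//   step2GatherNNZGeam/Gemm  - cusparseXcsrgeamNnz / cusparseXcsrgemmNnz, then read back nnzC
+//   step3AllocateValNInd     - allocate csrValC and csrColIndC of nnzC entries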
+
+  /**
+   * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
+   * the default that works for DGEMM.
+   */
+  private void allocateMatDescrPointer() {
+    this.descr = getDefaultCuSparseMatrixDescriptor();
+  }
+
+  /**
+   * Factory method to allocate an empty CSR Sparse matrix on the GPU
+   *
+   * @param gCtx a valid {@link GPUContext}
+   * @param nnz2 number of non-zeroes
+   * @param rows number of rows
+   * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
+    LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
+    assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
+    CSRPointer r = new CSRPointer(gCtx);
+    r.nnz = nnz2;
+    if (nnz2 == 0) {
+      // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
+      // with no memory allocated on the GPU.
+      return r;
+    }
+    gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
+    // increment the cudaCount by 1 for the allocation of all 3 arrays
+    r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+    r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
+    r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+    return r;
+  }
+
+  /**
+   * Allocate row pointers of m+1 elements
+   * @param gCtx   a valid {@link GPUContext}
+   * @param handle a valid {@link cusparseHandle}
+   * @param C      Output matrix
+   * @param rowsC  number of rows in C
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
+    LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
+    cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
+    //cudaDeviceSynchronize;
+    // Do not increment the cudaCount of allocations on GPU
+    C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
+  }
+
+  /**
+   * Determine the total number of nonzero elements for the cusparseDgeam operation.
+   * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
+   * @param gCtx   a valid {@link GPUContext}
+   * @param handle a valid {@link cusparseHandle}
+   * @param A      Sparse Matrix A on GPU
+   * @param B      Sparse Matrix B on GPU
+   * @param C      Output Sparse Matrix C on GPU
+   * @param m      Rows in C
+   * @param n      Columns in C
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException {
+    LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
+    int[] CnnzArray = {-1};
+    cusparseXcsrgeamNnz(handle, m, n,
+            A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
+            B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
+            C.descr, C.rowPtr, Pointer.to(CnnzArray));
+    //cudaDeviceSynchronize;
+    if (CnnzArray[0] != -1) {
+      C.nnz = CnnzArray[0];
+    } else {
+      int[] baseArray = {0};
+      cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
+      cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+      C.nnz = CnnzArray[0] - baseArray[0];
+    }
+  }
+
+// ==============================================================================================
+
+  /**
+   * Determine the total number of nonzero elements for the cusparseDgemm operation.
+   * @param gCtx   a valid {@link GPUContext}
+   * @param handle a valid {@link cusparseHandle}
+   * @param A      Sparse Matrix A on GPU
+   * @param transA op - whether A is transposed
+   * @param B      Sparse Matrix B on GPU
+   * @param transB op - whether B is transposed
+   * @param C      Output Sparse Matrix C on GPU
+   * @param m      Number of rows of sparse matrix op ( A ) and C
+   * @param n      Number of columns of sparse matrix op ( B ) and C
+   * @param k      Number of columns/rows of sparse matrix op ( A ) / op ( B )
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
+    LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
+    int[] CnnzArray = {-1};
+    if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+      throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+    }
+    cusparseXcsrgemmNnz(handle, transA, transB, m, n, k,
+            A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
+            B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
+            C.descr, C.rowPtr, Pointer.to(CnnzArray));
+    //cudaDeviceSynchronize;
+    if (CnnzArray[0] != -1) {
+      C.nnz = CnnzArray[0];
+    } else {
+      int[] baseArray = {0};
+      cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
+      cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+      C.nnz = CnnzArray[0] - baseArray[0];
+    }
+  }
+
+  /**
+   * Allocate val and index pointers.
+   * @param gCtx   a valid {@link GPUContext}
+   * @param handle a valid {@link cusparseHandle}
+   * @param C      Output sparse matrix on GPU
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   */
+  private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
+    LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
+    // Increment cudaCount by one when all three arrays of the CSR sparse matrix are allocated
+    C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+    C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+  }
+
+  /**
+   * Copies this CSR matrix on the GPU to a dense column-major matrix
+   * on the GPU. This is a temporary matrix for operations such as
+   * cusparseDcsrmv.
+   * Since the allocated matrix is temporary, bookkeeping is not updated.
+   * The caller is responsible for calling "free" on the returned Pointer object.
+   *
+   * @param cusparseHandle a valid {@link cusparseHandle}
+   * @param cublasHandle   a valid {@link cublasHandle}
+   * @param rows           number of rows in this CSR matrix
+   * @param cols           number of columns in this CSR matrix
+   * @throws DMLRuntimeException if DMLRuntimeException occurs
+   * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
+   */
+  public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
+    LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext());
+    long size = ((long) rows) * getDoubleSizeOf((long) cols);
+    Pointer A = allocate(size);
+    // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
+    if (val != null && rowPtr != null && colInd != null && nnz > 0) {
+      // Note: cusparseDcsr2dense method cannot handle empty blocks
+      cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+      //cudaDeviceSynchronize;
+    } else {
+      LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null");
+    }
+    return A;
+  }
+
+  /**
+   * Calls cudaFree lazily on the allocated {@link Pointer} instances
+   */
+  public void deallocate() throws DMLRuntimeException {
+    deallocate(false);
+  }
+
+  /**
+   * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
+   *
+   * @param eager whether to do eager or lazy cudaFrees
+   */
+  public void deallocate(boolean eager) throws DMLRuntimeException {
+    if (nnz > 0) {
+      cudaFreeHelper(val, eager);
+      cudaFreeHelper(rowPtr, eager);
+      cudaFreeHelper(colInd, eager);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "CSRPointer{" +
+            "nnz=" + nnz +
+            '}';
+  }
+}
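
For reference, a minimal usage sketch of CSRPointer as defined above; it assumes a valid GPUContext gCtx (e.g. obtained via ec.getGPUContext()) and elides the DMLRuntimeException handling:

    // host-side CSR arrays for the 2x3 matrix [[10, 0, 20], [0, 30, 0]]
    int rows = 2;
    int[] rowPtr  = {0, 2, 3};
    int[] colInd  = {0, 2, 1};
    double[] vals = {10, 20, 30};
    long nnz = vals.length;

    // allocate the device-side val/rowPtr/colInd buffers, then copy the arrays over
    CSRPointer csr = CSRPointer.allocateEmpty(gCtx, nnz, rows);
    CSRPointer.copyToDevice(csr, rows, nnz, rowPtr, colInd, vals);

    // ... hand csr.val / csr.rowPtr / csr.colInd to cuSPARSE kernels ...

    // lazy cudaFree of all three device buffers
    csr.deallocate();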
