systemml git commit: [SYSTEMML-445] Use PersistentLRUCache for shadow buffering

niketanpansare Thu, 20 Sep 2018 11:24:47 -0700

Repository: systemml
Updated Branches:
  refs/heads/master 69f2d377c -> 69624850e



[SYSTEMML-445] Use PersistentLRUCache for shadow buffering

- Shadow buffer is cleared eagerly in case of garbage collection to avoid OOM 
and is backed by org.apache.sysml.utils.PersistentLRUCache.
- Setting the configuration property sysml.gpu.eviction.shadow.bufferSize to 
zero disables shadow buffering. If you intend to train network larger than the 
GPU memory size, consider using large driver memory and setting 
sysml.gpu.eviction.shadow.bufferSize to a value greater than 0.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/69624850
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/69624850
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/69624850

Branch: refs/heads/master
Commit: 69624850ea872841daef1f99251d793e103502f3
Parents: 69f2d37
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Sep 20 11:22:37 2018 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Sep 20 11:22:37 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   8 +-
 .../java/org/apache/sysml/conf/DMLConfig.java   |   2 +-
 .../instructions/gpu/context/GPUObject.java     |  19 ++-
 .../instructions/gpu/context/ShadowBuffer.java  | 152 ++++++++++++++-----
 4 files changed, 134 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index d773f79..3925c4e 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -105,9 +105,11 @@
    <!-- Advanced optimization: fraction of driver memory to use for caching 
(default: 0.15) -->
    <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
    
-   <!-- Advanced optimization: fraction of driver memory to use for GPU shadow 
buffer. This optimization is ignored for double precision. 
-   By default, it is disabled (hence set to 0.0). If you intend to train 
network larger than GPU memory size, consider using single precision and 
setting this to 0.1 -->
-   
<sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize>
+   <!-- Advanced optimization: maximum fraction of driver memory to use for 
GPU shadow buffer. 
+   Shadow buffer is cleared eagerly on garbage collection to avoid OOM and is 
backed by org.apache.sysml.utils.PersistentLRUCache.
+   Setting this to zero disables shadow buffering. If you intend to train 
network larger than GPU memory size, 
+   consider using large driver memory and setting this to a value greater than 
0. -->
+   
<sysml.gpu.eviction.shadow.bufferSize>0.5</sysml.gpu.eviction.shadow.bufferSize>
    
    <!-- Fraction of available GPU memory to use. This is similar to 
TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 
0.9) -->
    <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor>

http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 5b30609..7f0ecbc 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -138,7 +138,7 @@ public class DMLConfig
                _defaultVals.put(NATIVE_BLAS_DIR,        "none" );
                _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" );
                _defaultVals.put(PRINT_GPU_MEMORY_INFO,  "false" );
-               _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE,  "0.0" );
+               _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE,  "0.5" );
                _defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
                _defaultVals.put(GPU_MEMORY_ALLOCATOR,   "cuda");

http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 552ee3b..43e2727 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -24,6 +24,7 @@ import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 
+import java.io.IOException;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -110,7 +111,11 @@ public class GPUObject {
         */
        public Pointer getDensePointer() {
                if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && 
getJcudaSparseMatrixPtr() == null) {
-                       shadowBuffer.moveToDevice();
+                       try {
+                               shadowBuffer.moveToDevice();
+                       } catch (IOException e) {
+                               throw new DMLRuntimeException("Error moving the 
data from shadow buffer to the device", e);
+                       }
                }
                return jcudaDenseMatrixPtr;
        }
@@ -934,13 +939,21 @@ public class GPUObject {
                        else {
                                // If already copied to shadow buffer as part 
of previous eviction and this is not an eviction (i.e. bufferpool call for 
subsequent CP/Spark instruction),
                                // then copy from shadow buffer to MatrixObject.
-                               shadowBuffer.moveToHost();
+                               try {
+                                       shadowBuffer.moveToHost();
+                               } catch (IOException e) {
+                                       throw new DMLRuntimeException("Error 
moving the data from shadow buffer to the host memory", e);
+                               }
                                return;
                        }
                }
                else if(shadowBuffer.isEligibleForBuffering(isEviction, 
eagerDelete)) {
                        // Perform shadow buffering if (1) single precision, 
(2) during eviction, (3) for dense matrices, and (4) if the given matrix can 
fit into the shadow buffer. 
-                       shadowBuffer.moveFromDevice(instName);
+                       try {
+                               shadowBuffer.moveFromDevice(instName);
+                       } catch (IOException e) {
+                               throw new DMLRuntimeException("Error moving the 
data from the device to the shadow buffer", e);
+                       }
                        return;
                }
                else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == 
null) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
index a36e036..4c534a0 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java
@@ -20,41 +20,65 @@ package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.runtime.JCuda.cudaMemcpy;
 
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicLong;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.runtime.DMLRuntimeException;
 import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.PersistentLRUCache;
 
 import jcuda.Pointer;
-import jcuda.Sizeof;
 
+/**
+ * Shadow buffer is a temporary staging area used during eviction.
+ * It is eagerly deleted and backed using the local filesystem in case of 
Garbage Collection
+ * or if the staging memory size exceeds the user-specified size.
+ * This is needed to respect SystemML's memory estimates, while still allowing
+ * for caching in case of GPU plans.
+ */
 public class ShadowBuffer {
        private static final Log LOG = 
LogFactory.getLog(ShadowBuffer.class.getName());
+       private static PersistentLRUCache CACHE;
+       private static AtomicLong UNIQUE_ID = new AtomicLong();
+       private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; 
+       final GPUObject gpuObj;
+       boolean isBuffered = false;
+       String fileName;
        
-       GPUObject gpuObj;
-       float[] shadowPointer = null;
-       private static boolean _warnedAboutShadowBuffer = false;
-       private static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;
-       private static long EVICTION_SHADOW_BUFFER_MAX_BYTES;
-       static {
-               if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) {
-                       EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
-               }
-               else {
+       public static boolean isEnabled() {
+               if(CACHE == null && EVICTION_SHADOW_BUFFER_MAX_BYTES >= 0) {
                        double shadowBufferSize = 
ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-                       if(shadowBufferSize < 0 || shadowBufferSize > 1) 
-                               throw new RuntimeException("Incorrect value (" 
+ shadowBufferSize + ") for the configuration:" + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-                       EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) 
(((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
+                       if(shadowBufferSize <= 0) {
+                               EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor 
optimization to avoid unnecessary invoking configuration manager.
+                       }
+                       else {
+                               if(shadowBufferSize > 1) 
+                                       throw new RuntimeException("Incorrect 
value (" + shadowBufferSize + ") for the configuration:" + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+                               EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) 
(((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
+                               try {
+                                       CACHE = new 
PersistentLRUCache(EVICTION_SHADOW_BUFFER_MAX_BYTES);
+                               } catch(IOException e) {
+                                       LOG.warn("Unable to create a temporary 
directory for shadow buffering on the local filesystem; disabling shadow 
buffering:" + e.getMessage());
+                                       EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; 
// Minor optimization to avoid checking for file permission.
+                               }
+                       }
                }
+               return CACHE != null;
        }
        
        public ShadowBuffer(GPUObject gpuObj) {
+               if(isEnabled())
+                       fileName = "shadow_" + UNIQUE_ID.incrementAndGet();
                this.gpuObj = gpuObj;
+               
        }
        
        /**
@@ -63,19 +87,39 @@ public class ShadowBuffer {
         * @return true if the gpu object is shadow buffered
         */
        public boolean isBuffered() {
-               return shadowPointer != null;
+               return isBuffered;
+       }
+       
+       private static long getSizeOfDataType(long numElems) {
+               return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
        }
        
        /**
         * Move the data from GPU to shadow buffer 
         * @param instName name of the instruction
+        * @throws IOException if error 
+        * @throws FileNotFoundException  if error
         */
-       public void moveFromDevice(String instName) {
+       public void moveFromDevice(String instName) throws 
FileNotFoundException, IOException {
                long start = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
                int numElems = 
GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-               shadowPointer = new float[numElems];
-               EVICTION_SHADOW_BUFFER_CURR_BYTES += 
getSizeOfFloat(shadowPointer.length);
-               cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
+       
+               if(isDoublePrecision()) {
+                       double [] shadowPointer = new double[numElems];
+                       cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
+                       CACHE.put(fileName, shadowPointer);
+                       isBuffered = true;
+               }
+               else if(isSinglePrecision()) {
+                       float [] shadowPointer = new float[numElems];
+                       cudaMemcpy(Pointer.to(shadowPointer), 
gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost);
+                       CACHE.put(fileName, shadowPointer);
+                       isBuffered = true;
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported datatype");
+               }
+               
                gpuObj.getGPUContext().cudaFreeHelper(instName, 
gpuObj.jcudaDenseMatrixPtr, true);
                gpuObj.jcudaDenseMatrixPtr = null;
                if (ConfigurationManager.isStatistics()) {
@@ -87,24 +131,36 @@ public class ShadowBuffer {
                }
        }
        
-       private long getSizeOfFloat(long numElems) {
-               return numElems*Sizeof.FLOAT;
+
+       private static boolean isDoublePrecision() {
+               return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.DOUBLE;
        }
        
-       private long getSizeOfDataType(long numElems) {
-               return numElems*LibMatrixCUDA.sizeOfDataType;
+       private static boolean isSinglePrecision() {
+               return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT;
        }
        
        /**
         * Move the data from shadow buffer to Matrix object
+        * @throws IOException if error 
+        * @throws FileNotFoundException if error
         */
-       public void moveToHost() {
+       public void moveToHost() throws FileNotFoundException, IOException {
                long start = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
                MatrixBlock tmp = new 
MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), 
GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false);
                tmp.allocateDenseBlock();
                double [] tmpArr = tmp.getDenseBlockValues();
-               for(int i = 0; i < shadowPointer.length; i++) {
-                       tmpArr[i] = shadowPointer[i];
+               if(isDoublePrecision()) {
+                       System.arraycopy(CACHE.getAsDoubleArray(fileName), 0, 
tmpArr, 0, tmpArr.length);
+               }
+               else if(isSinglePrecision()) {
+                       float [] shadowPointer = 
CACHE.getAsFloatArray(fileName);
+                       for(int i = 0; i < shadowPointer.length; i++) {
+                               tmpArr[i] = shadowPointer[i];
+                       }
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported datatype");
                }
                gpuObj.mat.acquireModify(tmp);
                gpuObj.mat.release();
@@ -122,12 +178,28 @@ public class ShadowBuffer {
        
        /**
         * Move the data from shadow buffer to GPU
+        * @throws IOException if error
+        * @throws FileNotFoundException if error
         */
-       public void moveToDevice() {
+       public void moveToDevice() throws FileNotFoundException, IOException {
                long start = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
-               long numBytes = getSizeOfDataType(shadowPointer.length);
+               int length; Pointer shadowDevicePointer;
+               if(isDoublePrecision()) {
+                       double [] shadowPointer = 
CACHE.getAsDoubleArray(fileName);
+                       length = shadowPointer.length;
+                       shadowDevicePointer = Pointer.to(shadowPointer);
+               }
+               else if(isSinglePrecision()) {
+                       float [] shadowPointer = 
CACHE.getAsFloatArray(fileName);
+                       length = shadowPointer.length;
+                       shadowDevicePointer = Pointer.to(shadowPointer);
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported datatype");
+               }
+               long numBytes = getSizeOfDataType(length);
                gpuObj.jcudaDenseMatrixPtr = 
gpuObj.getGPUContext().allocate(null, numBytes);
-               cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, 
Pointer.to(shadowPointer), numBytes, 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
+               cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, shadowDevicePointer, 
numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice);
                clearShadowPointer();
                if (ConfigurationManager.isStatistics()) {
                        long totalTime = System.nanoTime() - start;
@@ -144,14 +216,14 @@ public class ShadowBuffer {
         * @return true if the given GPU object is eligible to be shadow 
buffered
         */
        public boolean isEligibleForBuffering(boolean isEviction, boolean 
eagerDelete) {
-               if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && 
isEviction && eagerDelete && !gpuObj.isDensePointerNull()) {
-                       long numBytes = 
getSizeOfFloat(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
-                       boolean ret = EVICTION_SHADOW_BUFFER_CURR_BYTES + 
numBytes <= EVICTION_SHADOW_BUFFER_MAX_BYTES;
-                       if(!ret && !_warnedAboutShadowBuffer) {
-                               LOG.warn("Shadow buffer is full, so using CP 
bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize.");
-                               _warnedAboutShadowBuffer = true;
+               if(isEnabled() && isEviction && eagerDelete && 
!gpuObj.isDensePointerNull()) {
+                       long numBytes = 
getSizeOfDataType(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns());
+                       if(EVICTION_SHADOW_BUFFER_MAX_BYTES <= numBytes) {
+                               return false; // Don't attempt to cache very 
large GPU objects.
+                       }
+                       else {
+                               return true; // Dense GPU objects is eligible 
for shadow buffering when called during eviction and is being eagerly deleted.
                        }
-                       return ret;
                }
                else {
                        return false;
@@ -162,9 +234,9 @@ public class ShadowBuffer {
         * Removes the content from shadow buffer
         */
        public void clearShadowPointer() {
-               if(shadowPointer != null) {
-                       EVICTION_SHADOW_BUFFER_CURR_BYTES -= 
getSizeOfFloat(shadowPointer.length);
+               if(CACHE.containsKey(fileName)) {
+                       CACHE.remove(fileName);
+                       isBuffered = false;
                }
-               shadowPointer = null;
        }
 }

systemml git commit: [SYSTEMML-445] Use PersistentLRUCache for shadow buffering

Reply via email to