Repository: systemml Updated Branches: refs/heads/master 69f2d377c -> 69624850e
[SYSTEMML-445] Use PersistentLRUCache for shadow buffering - Shadow buffer is cleared eagerly in case of garbage collection to avoid OOM and is backed by org.apache.sysml.utils.PersistentLRUCache. - Setting the configuration property sysml.gpu.eviction.shadow.bufferSize to zero disables shadow buffering. If you intend to train network larger than the GPU memory size, consider using large driver memory and setting sysml.gpu.eviction.shadow.bufferSize to a value greater than 0. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/69624850 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/69624850 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/69624850 Branch: refs/heads/master Commit: 69624850ea872841daef1f99251d793e103502f3 Parents: 69f2d37 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Thu Sep 20 11:22:37 2018 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Thu Sep 20 11:22:37 2018 -0700 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 8 +- .../java/org/apache/sysml/conf/DMLConfig.java | 2 +- .../instructions/gpu/context/GPUObject.java | 19 ++- .../instructions/gpu/context/ShadowBuffer.java | 152 ++++++++++++++----- 4 files changed, 134 insertions(+), 47 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index d773f79..3925c4e 100644 --- a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -105,9 +105,11 @@ <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) --> <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize> - <!-- Advanced optimization: fraction of driver memory to use for GPU shadow buffer. This optimization is ignored for double precision. - By default, it is disabled (hence set to 0.0). If you intend to train network larger than GPU memory size, consider using single precision and setting this to 0.1 --> - <sysml.gpu.eviction.shadow.bufferSize>0.0</sysml.gpu.eviction.shadow.bufferSize> + <!-- Advanced optimization: maximum fraction of driver memory to use for GPU shadow buffer. + Shadow buffer is cleared eagerly on garbage collection to avoid OOM and is backed by org.apache.sysml.utils.PersistentLRUCache. + Setting this to zero disables shadow buffering. If you intend to train network larger than GPU memory size, + consider using large driver memory and setting this to a value greater than 0. --> + <sysml.gpu.eviction.shadow.bufferSize>0.5</sysml.gpu.eviction.shadow.bufferSize> <!-- Fraction of available GPU memory to use. This is similar to TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 0.9) --> <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor> http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index 5b30609..7f0ecbc 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -138,7 +138,7 @@ public class DMLConfig _defaultVals.put(NATIVE_BLAS_DIR, "none" ); _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" ); _defaultVals.put(PRINT_GPU_MEMORY_INFO, "false" ); - _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE, "0.0" ); + _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE, "0.5" ); _defaultVals.put(STATS_MAX_WRAP_LEN, "30" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(GPU_MEMORY_ALLOCATOR, "cuda"); http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 552ee3b..43e2727 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -24,6 +24,7 @@ import static jcuda.runtime.JCuda.cudaMemset; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; +import java.io.IOException; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.LongAdder; @@ -110,7 +111,11 @@ public class GPUObject { */ public Pointer getDensePointer() { if(jcudaDenseMatrixPtr == null && shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) { - shadowBuffer.moveToDevice(); + try { + shadowBuffer.moveToDevice(); + } catch (IOException e) { + throw new DMLRuntimeException("Error moving the data from shadow buffer to the device", e); + } } return jcudaDenseMatrixPtr; } @@ -934,13 +939,21 @@ public class GPUObject { else { // If already copied to shadow buffer as part of previous eviction and this is not an eviction (i.e. bufferpool call for subsequent CP/Spark instruction), // then copy from shadow buffer to MatrixObject. - shadowBuffer.moveToHost(); + try { + shadowBuffer.moveToHost(); + } catch (IOException e) { + throw new DMLRuntimeException("Error moving the data from shadow buffer to the host memory", e); + } return; } } else if(shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) { // Perform shadow buffering if (1) single precision, (2) during eviction, (3) for dense matrices, and (4) if the given matrix can fit into the shadow buffer. - shadowBuffer.moveFromDevice(instName); + try { + shadowBuffer.moveFromDevice(instName); + } catch (IOException e) { + throw new DMLRuntimeException("Error moving the data from the device to the shadow buffer", e); + } return; } else if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { http://git-wip-us.apache.org/repos/asf/systemml/blob/69624850/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java index a36e036..4c534a0 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ShadowBuffer.java @@ -20,41 +20,65 @@ package org.apache.sysml.runtime.instructions.gpu.context; import static jcuda.runtime.JCuda.cudaMemcpy; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; +import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.utils.GPUStatistics; +import org.apache.sysml.utils.PersistentLRUCache; import jcuda.Pointer; -import jcuda.Sizeof; +/** + * Shadow buffer is a temporary staging area used during eviction. + * It is eagerly deleted and backed using the local filesystem in case of Garbage Collection + * or if the staging memory size exceeds the user-specified size. + * This is needed to respect SystemML's memory estimates, while still allowing + * for caching in case of GPU plans. + */ public class ShadowBuffer { private static final Log LOG = LogFactory.getLog(ShadowBuffer.class.getName()); + private static PersistentLRUCache CACHE; + private static AtomicLong UNIQUE_ID = new AtomicLong(); + private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; + final GPUObject gpuObj; + boolean isBuffered = false; + String fileName; - GPUObject gpuObj; - float[] shadowPointer = null; - private static boolean _warnedAboutShadowBuffer = false; - private static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0; - private static long EVICTION_SHADOW_BUFFER_MAX_BYTES; - static { - if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) { - EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; - } - else { + public static boolean isEnabled() { + if(CACHE == null && EVICTION_SHADOW_BUFFER_MAX_BYTES >= 0) { double shadowBufferSize = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE); - if(shadowBufferSize < 0 || shadowBufferSize > 1) - throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); - EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); + if(shadowBufferSize <= 0) { + EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid unnecessary invoking configuration manager. + } + else { + if(shadowBufferSize > 1) + throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); + EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); + try { + CACHE = new PersistentLRUCache(EVICTION_SHADOW_BUFFER_MAX_BYTES); + } catch(IOException e) { + LOG.warn("Unable to create a temporary directory for shadow buffering on the local filesystem; disabling shadow buffering:" + e.getMessage()); + EVICTION_SHADOW_BUFFER_MAX_BYTES = -1; // Minor optimization to avoid checking for file permission. + } + } } + return CACHE != null; } public ShadowBuffer(GPUObject gpuObj) { + if(isEnabled()) + fileName = "shadow_" + UNIQUE_ID.incrementAndGet(); this.gpuObj = gpuObj; + } /** @@ -63,19 +87,39 @@ public class ShadowBuffer { * @return true if the gpu object is shadow buffered */ public boolean isBuffered() { - return shadowPointer != null; + return isBuffered; + } + + private static long getSizeOfDataType(long numElems) { + return numElems * ((long) LibMatrixCUDA.sizeOfDataType); } /** * Move the data from GPU to shadow buffer * @param instName name of the instruction + * @throws IOException if error + * @throws FileNotFoundException if error */ - public void moveFromDevice(String instName) { + public void moveFromDevice(String instName) throws FileNotFoundException, IOException { long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; int numElems = GPUObject.toIntExact(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); - shadowPointer = new float[numElems]; - EVICTION_SHADOW_BUFFER_CURR_BYTES += getSizeOfFloat(shadowPointer.length); - cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); + + if(isDoublePrecision()) { + double [] shadowPointer = new double[numElems]; + cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); + CACHE.put(fileName, shadowPointer); + isBuffered = true; + } + else if(isSinglePrecision()) { + float [] shadowPointer = new float[numElems]; + cudaMemcpy(Pointer.to(shadowPointer), gpuObj.jcudaDenseMatrixPtr, getSizeOfDataType(numElems), jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost); + CACHE.put(fileName, shadowPointer); + isBuffered = true; + } + else { + throw new DMLRuntimeException("Unsupported datatype"); + } + gpuObj.getGPUContext().cudaFreeHelper(instName, gpuObj.jcudaDenseMatrixPtr, true); gpuObj.jcudaDenseMatrixPtr = null; if (ConfigurationManager.isStatistics()) { @@ -87,24 +131,36 @@ public class ShadowBuffer { } } - private long getSizeOfFloat(long numElems) { - return numElems*Sizeof.FLOAT; + + private static boolean isDoublePrecision() { + return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.DOUBLE; } - private long getSizeOfDataType(long numElems) { - return numElems*LibMatrixCUDA.sizeOfDataType; + private static boolean isSinglePrecision() { + return LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT; } /** * Move the data from shadow buffer to Matrix object + * @throws IOException if error + * @throws FileNotFoundException if error */ - public void moveToHost() { + public void moveToHost() throws FileNotFoundException, IOException { long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; MatrixBlock tmp = new MatrixBlock(GPUObject.toIntExact(gpuObj.mat.getNumRows()), GPUObject.toIntExact(gpuObj.mat.getNumColumns()), false); tmp.allocateDenseBlock(); double [] tmpArr = tmp.getDenseBlockValues(); - for(int i = 0; i < shadowPointer.length; i++) { - tmpArr[i] = shadowPointer[i]; + if(isDoublePrecision()) { + System.arraycopy(CACHE.getAsDoubleArray(fileName), 0, tmpArr, 0, tmpArr.length); + } + else if(isSinglePrecision()) { + float [] shadowPointer = CACHE.getAsFloatArray(fileName); + for(int i = 0; i < shadowPointer.length; i++) { + tmpArr[i] = shadowPointer[i]; + } + } + else { + throw new DMLRuntimeException("Unsupported datatype"); } gpuObj.mat.acquireModify(tmp); gpuObj.mat.release(); @@ -122,12 +178,28 @@ public class ShadowBuffer { /** * Move the data from shadow buffer to GPU + * @throws IOException if error + * @throws FileNotFoundException if error */ - public void moveToDevice() { + public void moveToDevice() throws FileNotFoundException, IOException { long start = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; - long numBytes = getSizeOfDataType(shadowPointer.length); + int length; Pointer shadowDevicePointer; + if(isDoublePrecision()) { + double [] shadowPointer = CACHE.getAsDoubleArray(fileName); + length = shadowPointer.length; + shadowDevicePointer = Pointer.to(shadowPointer); + } + else if(isSinglePrecision()) { + float [] shadowPointer = CACHE.getAsFloatArray(fileName); + length = shadowPointer.length; + shadowDevicePointer = Pointer.to(shadowPointer); + } + else { + throw new DMLRuntimeException("Unsupported datatype"); + } + long numBytes = getSizeOfDataType(length); gpuObj.jcudaDenseMatrixPtr = gpuObj.getGPUContext().allocate(null, numBytes); - cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, Pointer.to(shadowPointer), numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); + cudaMemcpy(gpuObj.jcudaDenseMatrixPtr, shadowDevicePointer, numBytes, jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice); clearShadowPointer(); if (ConfigurationManager.isStatistics()) { long totalTime = System.nanoTime() - start; @@ -144,14 +216,14 @@ public class ShadowBuffer { * @return true if the given GPU object is eligible to be shadow buffered */ public boolean isEligibleForBuffering(boolean isEviction, boolean eagerDelete) { - if(LibMatrixCUDA.sizeOfDataType == jcuda.Sizeof.FLOAT && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) { - long numBytes = getSizeOfFloat(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); - boolean ret = EVICTION_SHADOW_BUFFER_CURR_BYTES + numBytes <= EVICTION_SHADOW_BUFFER_MAX_BYTES; - if(!ret && !_warnedAboutShadowBuffer) { - LOG.warn("Shadow buffer is full, so using CP bufferpool instead. Consider increasing sysml.gpu.eviction.shadow.bufferSize."); - _warnedAboutShadowBuffer = true; + if(isEnabled() && isEviction && eagerDelete && !gpuObj.isDensePointerNull()) { + long numBytes = getSizeOfDataType(gpuObj.mat.getNumRows()*gpuObj.mat.getNumColumns()); + if(EVICTION_SHADOW_BUFFER_MAX_BYTES <= numBytes) { + return false; // Don't attempt to cache very large GPU objects. + } + else { + return true; // Dense GPU objects is eligible for shadow buffering when called during eviction and is being eagerly deleted. } - return ret; } else { return false; @@ -162,9 +234,9 @@ public class ShadowBuffer { * Removes the content from shadow buffer */ public void clearShadowPointer() { - if(shadowPointer != null) { - EVICTION_SHADOW_BUFFER_CURR_BYTES -= getSizeOfFloat(shadowPointer.length); + if(CACHE.containsKey(fileName)) { + CACHE.remove(fileName); + isBuffered = false; } - shadowPointer = null; } }