[SYSTEMML-1625] GPU Unit Tests (and GPU row/col variance bug fix) - Documented random matrix generation - GPU unit test using MLContext. Compares CPU output to GPU - Pseudo-unit tests for GPU implementations of unary ops, unary aggregate ops, transpose, elementwise ops, matrix multiplication ops, builtin ops & NN ops - Fixed crucial bug in col/row var - gpuTests profile for GPU tests (mvn verify -PgpuTests) - Updated intellij style for import order
Closes #513 Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/772fb588 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/772fb588 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/772fb588 Branch: refs/heads/master Commit: 772fb588324916e4225bb6e1970ca6a8f87eb414 Parents: ceeec4b Author: Nakul Jindal <naku...@gmail.com> Authored: Wed May 31 21:54:13 2017 -0700 Committer: Nakul Jindal <naku...@gmail.com> Committed: Wed May 31 21:54:13 2017 -0700 ---------------------------------------------------------------------- dev/code-style/systemml-style-intellij.xml | 18 + pom.xml | 10 + .../apache/sysml/api/ScriptExecutorUtils.java | 1 + .../context/ExecutionContext.java | 3 + .../instructions/GPUInstructionParser.java | 120 +++-- .../instructions/gpu/context/GPUContext.java | 118 +++-- .../instructions/gpu/context/GPUObject.java | 55 +- .../instructions/gpu/context/JCudaKernels.java | 3 +- .../runtime/matrix/data/LibMatrixCUDA.java | 112 ++-- .../runtime/matrix/data/LibMatrixDatagen.java | 78 +-- .../matrix/data/RandomMatrixGenerator.java | 123 ++++- .../sysml/test/gpu/AggregateUnaryOpTests.java | 133 +++++ .../apache/sysml/test/gpu/BinaryOpTests.java | 85 ++++ .../org/apache/sysml/test/gpu/GPUTests.java | 250 +++++++++ .../gpu/MatrixMatrixElementWiseOpTests.java | 271 ++++++++++ .../test/gpu/MatrixMultiplicationOpTest.java | 190 +++++++ .../sysml/test/gpu/NeuralNetworkOpTests.java | 508 +++++++++++++++++++ .../org/apache/sysml/test/gpu/ReorgOpTests.java | 70 +++ .../gpu/ScalarMatrixElementwiseOpTests.java | 131 +++++ .../org/apache/sysml/test/gpu/UnaryOpTests.java | 113 +++++ .../apache/sysml/test/gpu/UnaryOpTestsBase.java | 106 ++++ .../test/integration/gpu/ZPackageSuite.java | 46 ++ 22 files changed, 2308 insertions(+), 236 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/dev/code-style/systemml-style-intellij.xml ---------------------------------------------------------------------- diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml index 248c600..1ad3209 100644 --- a/dev/code-style/systemml-style-intellij.xml +++ b/dev/code-style/systemml-style-intellij.xml @@ -16,7 +16,25 @@ * specific language governing permissions and limitations * under the License. --> + <code_scheme name="SystemML Format"> + <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" /> + <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" /> + <option name="IMPORT_LAYOUT_TABLE"> + <value> + <package name="" withSubpackages="true" static="false" /> + <emptyLine /> + <package name="java" withSubpackages="true" static="false" /> + <emptyLine /> + <package name="javax" withSubpackages="true" static="false" /> + <emptyLine /> + <package name="org" withSubpackages="true" static="false" /> + <emptyLine /> + <package name="com" withSubpackages="true" static="false" /> + <emptyLine /> + <package name="" withSubpackages="true" static="true" /> + </value> + </option> <codeStyleSettings language="JAVA"> <option name="KEEP_LINE_BREAKS" value="false" /> <option name="KEEP_FIRST_COLUMN_COMMENT" value="false" /> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 5ce5576..99e2dec 100644 --- a/pom.xml +++ b/pom.xml @@ -436,6 +436,7 @@ <exclude>**/slowtest/**</exclude> <exclude>**/integration/**</exclude> <exclude>**/test/unit/**</exclude> + <exclude>**/test/gpu/**</exclude> </excludes> </configuration> @@ -478,6 +479,7 @@ -Djava.awt.headless=true</argLine> <includes> + <include>${gpuTestsPath}</include> <!-- Path for GPU integration tests, enabled for gpuTests profile --> 
<include>**/integration/applications/**/*Suite.java</include> <include>**/integration/conversion/*Suite.java</include> <include>**/integration/functions/data/*Suite.java</include> @@ -896,6 +898,14 @@ </build> </profile> + <!-- profile to enable running tests on the GPU --> + <profile> + <id>gpuTests</id> + <properties> + <gpuTestsPath>**/integration/gpu/**/*Suite.java</gpuTestsPath> + </properties> + </profile> + <profile> <!-- Can be used to ignore doclint javadoc issues --> <id>ignore-doclint</id> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index f582c36..674a011 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -94,6 +94,7 @@ public class ScriptExecutorUtils { rtprog.execute(ec); } finally { // ensure cleanup/shutdown if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) { + ec.getGPUContext().clearTemporaryMemory(); GPUContextPool.returnToPool(ec.getGPUContext()); } if (dmlconf.getBooleanValue(DMLConfig.CODEGEN)) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index 35b4cd1..735f394 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -271,6 +271,9 @@ public class ExecutionContext { 
MatrixObject mo = getMatrixObject(varName); if( mo.getGPUObject(getGPUContext()) == null ) { GPUObject newGObj = getGPUContext().createGPUObject(mo); + // The lock is added here for an output block + // so that any block currently in use is not deallocated by eviction on the GPU + newGObj.addLock(); mo.setGPUObject(getGPUContext(), newGObj); } return mo; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 443d0eb..e0bcd1b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -42,81 +42,79 @@ public class GPUInstructionParser extends InstructionParser // Neural Network Operators String2GPUInstructionType.put( "relu_backward", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d", GPUINSTRUCTION_TYPE.Convolution); - String2GPUInstructionType.put( "conv2d_bias_add", GPUINSTRUCTION_TYPE.Convolution); + String2GPUInstructionType.put( "conv2d_bias_add", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "conv2d_backward_data", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "maxpooling", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Convolution); - String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution); - String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution); + String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution); + 
String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution); // Matrix Multiply Operators - String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); - String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ); + String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); + String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ); // Reorg/Transpose - String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg); + String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg); // Binary Cellwise - String2GPUInstructionType.put( "+" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "-" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "/" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "%%" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "%/%" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "^" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "1-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case - String2GPUInstructionType.put( "^2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case - String2GPUInstructionType.put( "*2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case - String2GPUInstructionType.put( "-nz" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case - String2GPUInstructionType.put( "+*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); - String2GPUInstructionType.put( "-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "+", GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "-", GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "*", GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "/", GPUINSTRUCTION_TYPE.ArithmeticBinary); + 
//String2GPUInstructionType.put( "%%", GPUINSTRUCTION_TYPE.ArithmeticBinary); + //String2GPUInstructionType.put( "%/%", GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "^", GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "1-*", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case + String2GPUInstructionType.put( "^2", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case + String2GPUInstructionType.put( "*2", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case + String2GPUInstructionType.put( "-nz", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case + String2GPUInstructionType.put( "+*", GPUINSTRUCTION_TYPE.ArithmeticBinary); + String2GPUInstructionType.put( "-*", GPUINSTRUCTION_TYPE.ArithmeticBinary); - // Builtin functions - String2GPUInstructionType.put( "sel+" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "exp" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "log" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "abs" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "sqrt" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "round" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "floor" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "ceil" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "sin" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "cos" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "tan" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "asin" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "acos" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "atan" , GPUINSTRUCTION_TYPE.BuiltinUnary); - String2GPUInstructionType.put( "sign" , GPUINSTRUCTION_TYPE.BuiltinUnary); - - - - String2GPUInstructionType.put( "solve" , 
GPUINSTRUCTION_TYPE.BuiltinBinary); + // Unary Builtin functions + String2GPUInstructionType.put( "sel+", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "exp", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "log", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "abs", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "sqrt", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "round", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "floor", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "ceil", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "sin", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "cos", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "tan", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "asin", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "acos", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "atan", GPUINSTRUCTION_TYPE.BuiltinUnary); + String2GPUInstructionType.put( "sign", GPUINSTRUCTION_TYPE.BuiltinUnary); + // Binary Builtin functions + String2GPUInstructionType.put( "solve", GPUINSTRUCTION_TYPE.BuiltinBinary); // Aggregate Unary - String2GPUInstructionType.put( "ua+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum - String2GPUInstructionType.put( "uak+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum - String2GPUInstructionType.put( "uar+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum - String2GPUInstructionType.put( "uark+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum - String2GPUInstructionType.put( "uac+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum - String2GPUInstructionType.put( "uack+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum - String2GPUInstructionType.put( "ua*" , GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication - String2GPUInstructionType.put( "uamean" 
, GPUINSTRUCTION_TYPE.AggregateUnary); // Mean - String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean - String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean - String2GPUInstructionType.put( "uamax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Max - String2GPUInstructionType.put( "uarmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max - String2GPUInstructionType.put( "uacmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max - String2GPUInstructionType.put( "uamin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Min - String2GPUInstructionType.put( "uarmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min - String2GPUInstructionType.put( "uacmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min - String2GPUInstructionType.put( "uasqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares - String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares - String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares - String2GPUInstructionType.put( "uavar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Variance - String2GPUInstructionType.put( "uarvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance - String2GPUInstructionType.put( "uacvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance + String2GPUInstructionType.put( "ua+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum + String2GPUInstructionType.put( "uak+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum + String2GPUInstructionType.put( "uar+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum + String2GPUInstructionType.put( "uark+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum + String2GPUInstructionType.put( "uac+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum + String2GPUInstructionType.put( "uack+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum + String2GPUInstructionType.put( "ua*" , GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication 
+ String2GPUInstructionType.put( "uamean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Mean + String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean + String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean + String2GPUInstructionType.put( "uamax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Max + String2GPUInstructionType.put( "uarmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max + String2GPUInstructionType.put( "uacmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max + String2GPUInstructionType.put( "uamin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Min + String2GPUInstructionType.put( "uarmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min + String2GPUInstructionType.put( "uacmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min + String2GPUInstructionType.put( "uasqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares + String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares + String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares + String2GPUInstructionType.put( "uavar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Variance + String2GPUInstructionType.put( "uarvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance + String2GPUInstructionType.put( "uacvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance } public static GPUInstruction parseSingleInstruction (String str ) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 673601f..89a2b67 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -18,25 +18,14 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; -import static jcuda.jcublas.JCublas2.cublasCreate; -import static jcuda.jcublas.JCublas2.cublasDestroy; -import static jcuda.jcudnn.JCudnn.cudnnCreate; -import static jcuda.jcudnn.JCudnn.cudnnDestroy; -import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy; -import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy; -import static jcuda.jcusparse.JCusparse.cusparseCreate; -import static jcuda.jcusparse.JCusparse.cusparseDestroy; -import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate; -import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate; - -import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync; -import static jcuda.runtime.JCuda.cudaFree; -import static jcuda.runtime.JCuda.cudaGetDeviceCount; -import static jcuda.runtime.JCuda.cudaMalloc; -import static jcuda.runtime.JCuda.cudaMemGetInfo; -import static jcuda.runtime.JCuda.cudaMemset; -import static jcuda.runtime.JCuda.cudaSetDevice; -import static jcuda.runtime.JCuda.cudaSetDeviceFlags; +import jcuda.Pointer; +import jcuda.jcublas.cublasHandle; +import jcuda.jcudnn.cudnnHandle; +import jcuda.jcusolver.cusolverDnHandle; +import jcuda.jcusolver.cusolverSpHandle; +import jcuda.jcusparse.cusparseHandle; +import jcuda.runtime.JCuda; +import jcuda.runtime.cudaDeviceProp; import java.util.ArrayList; import java.util.Collections; @@ -56,14 +45,24 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.LRUCacheMap; -import jcuda.Pointer; -import jcuda.jcublas.cublasHandle; -import jcuda.jcudnn.cudnnHandle; -import jcuda.jcusolver.cusolverDnHandle; -import jcuda.jcusolver.cusolverSpHandle; -import jcuda.jcusparse.cusparseHandle; -import jcuda.runtime.JCuda; -import jcuda.runtime.cudaDeviceProp; +import static jcuda.jcublas.JCublas2.cublasCreate; 
+import static jcuda.jcublas.JCublas2.cublasDestroy; +import static jcuda.jcudnn.JCudnn.cudnnCreate; +import static jcuda.jcudnn.JCudnn.cudnnDestroy; +import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate; +import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy; +import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate; +import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy; +import static jcuda.jcusparse.JCusparse.cusparseCreate; +import static jcuda.jcusparse.JCusparse.cusparseDestroy; +import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync; +import static jcuda.runtime.JCuda.cudaFree; +import static jcuda.runtime.JCuda.cudaGetDeviceCount; +import static jcuda.runtime.JCuda.cudaMalloc; +import static jcuda.runtime.JCuda.cudaMemGetInfo; +import static jcuda.runtime.JCuda.cudaMemset; +import static jcuda.runtime.JCuda.cudaSetDevice; +import static jcuda.runtime.JCuda.cudaSetDeviceFlags; /** * Represents a context per GPU accessible through the same JVM @@ -159,6 +158,7 @@ } + @SuppressWarnings("unused") public int getDeviceNum() { return deviceNum; } @@ -174,6 +174,7 @@ cudaSetDevice(deviceNum); } + @SuppressWarnings("unused") public static int cudaGetDevice() { int[] device = new int[1]; JCuda.cudaGetDevice(device); @@ -288,6 +289,9 @@ * @param eager true if to be done eagerly */ public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) { + Pointer dummy = new Pointer(); + if (toFree == dummy) // NOTE(review): intended to skip freeing a null pointer, but '==' on a freshly constructed Pointer is reference equality and can never match toFree — confirm intent (toFree == null?) + return; long t0 = 0; assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; long size = cudaBlockSizeMap.get(toFree); @@ -382,14 +386,14 @@ return; if (allocatedGPUObjects.size() == 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); + throw new 
DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")"); } Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() { @Override public int compare(GPUObject p1, GPUObject p2) { - long p1Val = p1.readLocks.get(); - long p2Val = p2.readLocks.get(); + long p1Val = p1.locks.get(); + long p2Val = p2.locks.get(); if (p1Val > 0 && p2Val > 0) { // Both are locked, so don't sort @@ -426,8 +430,8 @@ public class GPUContext { while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) { GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); - if (toBeRemoved.readLocks.get() > 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); + if (toBeRemoved.locks.get() > 0) { + throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")"); } if (toBeRemoved.dirty) { toBeRemoved.copyFromDeviceToHost(); @@ -546,6 +550,7 @@ public class GPUContext { * @return the shared memory per block * @throws DMLRuntimeException ? 
*/ + @SuppressWarnings("unused") public long getMaxSharedMemory() throws DMLRuntimeException { cudaDeviceProp deviceProp = getGPUProperties(); return deviceProp.sharedMemPerBlock; } @@ -588,10 +593,10 @@ /** * Destroys this GPUContext object - * This method MUST BE called so that the GPU is available to be used again * * @throws DMLRuntimeException if error */ + @SuppressWarnings("unused") public void destroy() throws DMLRuntimeException { LOG.trace("GPU : this context was destroyed, this = " + this.toString()); clearMemory(); @@ -608,14 +613,51 @@ /** * Clears all memory used by this {@link GPUContext} - * Be careful to ensure that no memory is currently being used before invoking this + * Be careful to ensure that no temporary memory is currently in use before invoking this + * If memory is being used between MLContext invocations, it is pointed to by a {@link GPUObject} instance + * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will + * cause the memory associated with that block on the GPU to be freed up. * @throws DMLRuntimeException ? */ public void clearMemory() throws DMLRuntimeException { - while (allocatedGPUObjects.isEmpty()) { + clearTemporaryMemory(); + while (!allocatedGPUObjects.isEmpty()) { GPUObject o = allocatedGPUObjects.get(0); - o.clearData(); + if (o.isDirty()){ + LOG.warn("Attempted to free GPU Memory when a block[" + o + "] is still on GPU memory, copying it back to host."); + o.acquireHostRead(); + } + o.clearData(true); } + allocatedGPUObjects.clear(); + } + + /** + * Clears up the memory used to optimize cudaMalloc/cudaFree calls + */ + public void clearTemporaryMemory() { + // Record the cuda block sizes still needed by allocatedGPUObjects; all other entries are cleared up. 
+ HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>(); + for (GPUObject o : allocatedGPUObjects) { + if (o.isSparse()) { + CSRPointer p = o.getSparseMatrixCudaPointer(); + if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) { + tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr)); + } + if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) { + tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd)); + } + if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) { + tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val)); + } + + } else { + Pointer p = o.getJcudaDenseMatrixPtr(); + tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p)); + } + } + + // garbage collect all temporarily allocated spaces for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) { for (Pointer p : l) { cudaFreeHelper(p, true); @@ -623,7 +665,9 @@ public class GPUContext { } cudaBlockSizeMap.clear(); freeCUDASpaceMap.clear(); - allocatedGPUObjects.clear(); + + // Restore only those entries for which there are still blocks on the GPU + cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap); } @Override http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index be3cc09..0ed34c5 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -86,8 +86,8 @@ public class GPUObject { /** whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host */ protected boolean dirty = false; - /** number of read locks on this object */ - protected AtomicInteger 
readLocks = new AtomicInteger(0); + /** number of read/write locks on this object (this GPUObject is being used in a current instruction) */ + protected AtomicInteger locks = new AtomicInteger(0); /** Timestamp, needed by {@link GPUContext#evict(long)} */ AtomicLong timestamp = new AtomicLong(0); @@ -112,7 +112,7 @@ public class GPUObject { that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]); } that.dirty = me.dirty; - that.readLocks = new AtomicInteger(me.readLocks.get()); + that.locks = new AtomicInteger(me.locks.get()); that.timestamp = new AtomicLong(me.timestamp.get()); that.isSparse = me.isSparse; @@ -126,7 +126,7 @@ public class GPUObject { cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice); } - if (me.jcudaSparseMatrixPtr != null){ + if (me.getJcudaSparseMatrixPtr() != null){ long rows = mat.getNumRows(); that.jcudaSparseMatrixPtr = me.jcudaSparseMatrixPtr.clone((int)rows); } @@ -265,7 +265,6 @@ public class GPUObject { /** * Convenience method to directly set the sparse matrix on GPU - * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing. * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)} * @param sparseMatrixPtr CSR (compressed sparse row) pointer * @@ -278,11 +277,11 @@ public class GPUObject { cudaFreeHelper(getJcudaDenseMatrixPtr()); jcudaDenseMatrixPtr = null; } + getGPUContext().recordBlockUsage(this); } /** * Convenience method to directly set the dense matrix pointer on GPU - * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing. * * @param densePtr dense pointer * @throws DMLRuntimeException ? 
@@ -294,6 +293,7 @@ public class GPUObject { getJcudaSparseMatrixPtr().deallocate(); jcudaSparseMatrixPtr = null; } + getGPUContext().recordBlockUsage(this); } /** @@ -491,7 +491,6 @@ public class GPUObject { public void allocateSparseAndEmpty() throws DMLRuntimeException{ LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext()); setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows())); - addReadLock(); } /** @@ -508,7 +507,6 @@ public class GPUObject { int numElems = toIntExact(rows * cols); long size = getDoubleSizeOf(numElems); setDenseMatrixCudaPointer(allocate(size)); - addReadLock(); // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v" // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0 if (v != 0) @@ -535,9 +533,8 @@ public class GPUObject { LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext()); copyFromHostToDevice(); transferred = true; - } else { - addReadLock(); } + addLock(); if(!isAllocated()) throw new DMLRuntimeException("Expected device data to be allocated"); return transferred; @@ -552,7 +549,6 @@ public class GPUObject { // Dense block, size = numRows * numCols allocateDenseMatrixOnDevice(); allocated = true; - getGPUContext().recordBlockUsage(this); } dirty = true; if(!isAllocated()) @@ -569,8 +565,6 @@ public class GPUObject { mat.setDirty(true); allocateSparseMatrixOnDevice(); allocated = true; - getGPUContext().recordBlockUsage(this); - } dirty = true; if(!isAllocated()) @@ -578,8 +572,8 @@ public class GPUObject { return allocated; } - public void addReadLock() { - readLocks.addAndGet(1); + public void addLock() { + locks.addAndGet(1); } /** @@ -606,11 +600,13 @@ public class GPUObject { * Updates the locks depending on the eviction policy selected * @throws DMLRuntimeException 
if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ - private void updateReleaseLocks() throws DMLRuntimeException { - if (readLocks.addAndGet(-1) < 0) { - throw new CacheException("Redundant release of GPU object"); + private void updateReleaseLocks(int l) throws DMLRuntimeException { + int newLocks = locks.addAndGet(l); + if (newLocks < 0) { + throw new CacheException("Internal state error : Invalid number of locks on a GPUObject"); } - LOG.trace("GPU : updateReleaseLocks, new number of read locks is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext()); + + LOG.trace("GPU : updateReleaseLocks, new number of locks is " + locks.get() + ", on " + this + ", GPUContext=" + getGPUContext()); GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy; switch (evictionPolicy){ case LRU : timestamp.set(System.nanoTime()); break; @@ -625,7 +621,8 @@ public class GPUObject { * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ public void releaseInput() throws DMLRuntimeException { - updateReleaseLocks(); + // A read lock is a positive quantity, therefor when the lock is freed, a negative 1 is added + updateReleaseLocks(-1); if(!isAllocated()) throw new CacheException("Attempting to release an input before allocating it"); } @@ -635,7 +632,8 @@ public class GPUObject { * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ public void releaseOutput() throws DMLRuntimeException { - updateReleaseLocks(); + // A write lock is a negative quantity, therefore when the lock is freed, a positive number is added + updateReleaseLocks(1); dirty = true; if(!isAllocated()) throw new CacheException("Attempting to release an output before allocating it"); @@ -651,7 +649,6 @@ public class GPUObject { long size = getDoubleSizeOf(rows * cols); Pointer tmp = 
allocate(size); setDenseMatrixCudaPointer(tmp); - addReadLock(); } void allocateSparseMatrixOnDevice() throws DMLRuntimeException { @@ -660,10 +657,9 @@ public class GPUObject { long rows = mat.getNumRows(); long nnz = mat.getNnz(); assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix"; - assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix"; + assert nnz >= 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix"; CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows); setSparseMatrixCudaPointer(tmp); - addReadLock(); } void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException { @@ -680,7 +676,8 @@ public class GPUObject { cudnnDestroyTensorDescriptor(tensorDescriptor); tensorDescriptor = null; } - readLocks.set(0); + locks.set(0); + getGPUContext().removeRecordedUsage(this); } protected long getSizeOnDevice() throws DMLRuntimeException { @@ -754,8 +751,8 @@ public class GPUObject { colInd = csrBlock.indexes(); values = csrBlock.values(); } + allocateSparseMatrixOnDevice(); - getGPUContext().recordBlockUsage(this); if(copyToDevice) { CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values); @@ -773,7 +770,6 @@ public class GPUObject { // Copy dense block allocateDenseMatrixOnDevice(); - getGPUContext().recordBlockUsage(this); cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice); } @@ -862,9 +858,8 @@ public class GPUObject { * @throws CacheException ? 
*/ public void clearData(boolean eager) throws DMLRuntimeException { - getGPUContext().removeRecordedUsage(this); deallocateMemoryOnDevice(eager); - + getGPUContext().removeRecordedUsage(this); } /** @@ -896,7 +891,7 @@ public class GPUObject { final StringBuilder sb = new StringBuilder("GPUObject{"); sb.append(", tensorShape=").append(Arrays.toString(tensorShape)); sb.append(", dirty=").append(dirty); - sb.append(", readLocks=").append(readLocks); + sb.append(", locks=").append(locks); sb.append(", sparse? ").append(isSparse); sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]"); sb.append('}'); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java index ac11df9..4d06831 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java @@ -34,6 +34,7 @@ import jcuda.Pointer; import jcuda.driver.CUfunction; import jcuda.driver.CUmodule; import jcuda.driver.CUresult; +import jcuda.runtime.JCuda; /** * Utility class that allows LibMatrixCUDA as well as GPUObject to invoke custom CUDA kernels. 
@@ -111,7 +112,7 @@ public class JCudaKernels { config.gridDimX, config.gridDimY, config.gridDimZ, config.blockDimX, config.blockDimY, config.blockDimZ, config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null)); - //JCuda.cudaDeviceSynchronize(); + JCuda.cudaDeviceSynchronize(); } public static void checkResult(int cuResult) throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index b023159..7990fef 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -235,8 +235,8 @@ public class LibMatrixCUDA { private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; - - private static Pointer _one; + + private static Pointer _one; private static Pointer _zero; /** * Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation. 
@@ -258,7 +258,7 @@ public class LibMatrixCUDA { } return _zero; } - + /** * Convenience method to get tensor descriptor from underlying GPUObject * @param gCtx a valid {@link GPUContext} @@ -272,7 +272,7 @@ public class LibMatrixCUDA { */ private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException { if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) { - throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N + throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N + " || " + mat.getNumColumns() + " != " + (C*H*W)); } return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W); @@ -293,7 +293,7 @@ public class LibMatrixCUDA { cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W); return tensorDescriptor; } - + /** * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously. * @param gCtx a valid {@link GPUContext} @@ -308,7 +308,7 @@ public class LibMatrixCUDA { } return getDensePointer(gCtx, image, instName); } - + /** * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously. * @param gCtx a valid {@link GPUContext} @@ -339,15 +339,15 @@ public class LibMatrixCUDA { } return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); } - + /** * Convenience method for checking the status of CuDNN kernel. 
- * + * * @param status status returned by CuDNN * @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS */ private static void checkStatus(int status) throws DMLRuntimeException { - if(status != cudnnStatus.CUDNN_STATUS_SUCCESS) + if(status != cudnnStatus.CUDNN_STATUS_SUCCESS) throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status)); } @@ -383,7 +383,7 @@ public class LibMatrixCUDA { //cudaDeviceSynchronize; biasAdd(gCtx, instName, outputBlock, bias, outputBlock); } - + public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { @@ -530,7 +530,7 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } - + /** * Performs the operation corresponding to the DML script: * ones = matrix(1, rows=1, cols=Hout*Wout) @@ -635,7 +635,7 @@ public class LibMatrixCUDA { throw new DMLRuntimeException("Incorrect dimensions for running variance"); } } - + /** * Performs the forward BatchNormalization layer computation for inference * @param gCtx a valid {@link GPUContext} @@ -650,21 +650,21 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if error occurs */ public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image, - MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, + MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, double epsilon) throws DMLRuntimeException { LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx); int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; - + int N = (int) image.getNumRows(); 
int C = (int) scale.getNumColumns(); long CHW = image.getNumColumns(); validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C); - + // Allocate descriptors cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW, new MatrixObject[] {image}, new MatrixObject[] {ret}); cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1); - + // Get underlying dense pointer Pointer imagePtr = getDensePointer(gCtx, image, true, instName); Pointer retPtr = getDensePointer(gCtx, ret, true, instName); @@ -672,13 +672,13 @@ public class LibMatrixCUDA { Pointer scalePtr = getDensePointer(gCtx, scale, true, instName); Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName); Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName); - + checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, scaleTensorDesc, scalePtr, biasPtr, runningMeanPtr, runningVarPtr, epsilon)); } - + /** * Performs the forward BatchNormalization layer computation for training * @param gCtx a valid {@link GPUContext} @@ -696,21 +696,21 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if error occurs */ public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image, - MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, + MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException { LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx); int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; - + int N = (int) image.getNumRows(); int C = (int) scale.getNumColumns(); long CHW = image.getNumColumns(); 
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C); - + // Allocate descriptors cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW, new MatrixObject[] {image}, new MatrixObject[] {ret}); cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1); - + // Get underlying dense pointer Pointer imagePtr = getDensePointer(gCtx, image, true, instName); Pointer retPtr = getDensePointer(gCtx, ret, true, instName); @@ -718,20 +718,20 @@ public class LibMatrixCUDA { Pointer scalePtr = getDensePointer(gCtx, scale, true, instName); Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName); Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName); - + // To allow for copy-on-write Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName); Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName); cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); - + // ignoring resultSaveMean and resultSaveVariance as it requires state management checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor, retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer())); } - + /** * Convenient utility for batch normalization that returns a NCHW descriptor * @param gCtx a valid {@link GPUContext} @@ -776,8 +776,8 @@ public class LibMatrixCUDA { H = HW; W = 1; // If not known double potentialH = Math.sqrt(HW); if(potentialH == ((int) potentialH)) { - H = (int) potentialH; - W = H; + H = (int) potentialH; + W = H; } // We are not sure about H and W, hence don't allocate them. 
ret = new cudnnTensorDescriptor(); @@ -786,7 +786,7 @@ public class LibMatrixCUDA { } return ret; } - + /** * This method computes the backpropagation errors for image, scale and bias of batch normalization layer * @param gCtx a valid {@link GPUContext} @@ -805,16 +805,16 @@ public class LibMatrixCUDA { double epsilon) throws DMLRuntimeException { LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx); int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; - + int N = (int) image.getNumRows(); int C = (int) scale.getNumColumns(); long CHW = image.getNumColumns(); - + // Allocate descriptors cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW, new MatrixObject[] {image, dout}, new MatrixObject[] {ret}); cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1); - + // Get underlying dense pointer Pointer imagePtr = getDensePointer(gCtx, image, true, instName); Pointer doutPtr = getDensePointer(gCtx, dout, true, instName); @@ -822,7 +822,7 @@ public class LibMatrixCUDA { Pointer retPtr = getDensePointer(gCtx, ret, true, instName); Pointer retScalePtr = getDensePointer(gCtx, retScale, true, instName); Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, instName); - + // ignoring resultSaveMean and resultSaveVariance as it requires state management checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode, one(), zero(), one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr, @@ -915,7 +915,7 @@ public class LibMatrixCUDA { } } - private static long numDoublesIn2GB = 125000000; + private static long numDoublesIn2GB = 268435456; /** * This method computes the backpropogation errors for previous layer of convolution operation @@ -961,7 +961,7 @@ public class LibMatrixCUDA { Pointer w = getDensePointer(gCtx, filter, true, instName); Pointer dy = getDensePointer(gCtx, dout, true, instName); Pointer dx = getDensePointer(gCtx, output, true, 
instName); - + int padding [] = { pad_h, pad_w }; int strides [] = { stride_h, stride_w }; convDesc = allocateConvolutionDescriptor(padding, strides); @@ -999,7 +999,7 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } } - + /** * performs maxpooling on GPU by exploiting cudnnPoolingForward(...) * @param gCtx a valid {@link GPUContext} @@ -1029,7 +1029,7 @@ public class LibMatrixCUDA { cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W); performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } - + public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, @@ -1064,7 +1064,7 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } } - + /** * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...) 
* This method computes the backpropogation errors for previous layer of maxpooling operation @@ -1149,13 +1149,13 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4); } } - + private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException { long t0=0; try { LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx); cudnnTensorDescriptor dstTensorDesc = srcTensorDesc; - + Pointer srcData = getDensePointer(gCtx, in, true, instName); cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor(); cudnnCreateActivationDescriptor(activationDescriptor); @@ -1175,7 +1175,7 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1); } } - + /** * Performs the relu operation on the GPU. 
@@ -1580,7 +1580,6 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1); output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense); - output.getGPUObject(gCtx).addReadLock(); } /** @@ -1671,7 +1670,6 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0); output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C); - output.getGPUObject(gCtx).addReadLock(); if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k, @@ -2079,7 +2077,7 @@ public class LibMatrixCUDA { reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen); ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); - matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, clen, out, divideOp); + matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp); gCtx.cudaFreeHelper(instName, tmpRow); @@ -2097,7 +2095,7 @@ public class LibMatrixCUDA { reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen); ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); - matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, rlen, clen, out, divideOp); + matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp); gCtx.cudaFreeHelper(instName, tmpCol); @@ -2348,14 +2346,21 @@ public class LibMatrixCUDA { else if(op.fn instanceof Power) { setOutputToConstant(ec, gCtx, instName, 1.0, outputName); } - else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) { - setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName); - } - else if(op.fn instanceof Divide) { - //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN. 
- compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); - } - else { + // TODO: + // x/0.0 is either +Infinity or -Infinity according to Java. + // In the context of a matrix, different elements of the matrix + // could have different values. + // If the IEEE 754 standard defines otherwise, this logic needs + // to be re-enabled and the Java computation logic for divide by zero + // needs to be revisited + //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) { + // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName); + //} + //else if(op.fn instanceof Divide) { + // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN. + // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); + //} + else { // TODO: Potential to optimize matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op); } @@ -2790,7 +2795,6 @@ public class LibMatrixCUDA { CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); ec.allocateGPUMatrixObject(outputName); - out.getGPUObject(gCtx).addReadLock(); if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) { // Special case for transpose @@ -3160,7 +3164,7 @@ public class LibMatrixCUDA { MatrixObject out = ec.getMatrixObject(outputName); getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix Pointer C = getDensePointer(gCtx, out, instName); - + long t1=0, t2=0; if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) { LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java index fb62c41..bb0a9a8 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java @@ -45,9 +45,6 @@ public class LibMatrixDatagen { private static final Log LOG = LogFactory.getLog(LibMatrixDatagen.class.getName()); private static final long PAR_NUMCELL_THRESHOLD = 512*1024; //Min 500k elements - public static final String RAND_PDF_UNIFORM = "uniform"; - public static final String RAND_PDF_NORMAL = "normal"; - public static final String RAND_PDF_POISSON = "poisson"; private static IDSequence _seqRandInput = new IDSequence(); @@ -55,9 +52,9 @@ public class LibMatrixDatagen //prevent instantiation via private constructor } - public static boolean isShortcutRandOperation( double min, double max, double sparsity, String pdf ) + public static boolean isShortcutRandOperation( double min, double max, double sparsity, RandomMatrixGenerator.PDF pdf ) { - return pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) + return pdf == RandomMatrixGenerator.PDF.UNIFORM && ( ( min == 0.0 && max == 0.0 ) //all zeros ||( sparsity==1.0d && min == max )); //equal values } @@ -137,27 +134,31 @@ public class LibMatrixDatagen } } - public static RandomMatrixGenerator createRandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, String distParams) + public static RandomMatrixGenerator createRandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max, String distParams) throws DMLRuntimeException { - RandomMatrixGenerator rgen = null; - - if ( pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) - rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max); - else if ( pdf.equalsIgnoreCase(RAND_PDF_NORMAL)) - rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp); - 
else if ( pdf.equalsIgnoreCase(RAND_PDF_POISSON)) - { - double mean = Double.NaN; - try { - mean = Double.parseDouble(distParams); - } catch(NumberFormatException e) { - throw new DMLRuntimeException("Failed to parse Poisson distribution parameter: " + distParams); - } - rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max, mean); - } - else - throw new DMLRuntimeException("Unsupported probability distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", \"normal\", or \"poisson\""); + RandomMatrixGenerator.PDF pdf = RandomMatrixGenerator.PDF.valueOf(pdfStr.toUpperCase()); + RandomMatrixGenerator rgen = null; + switch (pdf) { + case UNIFORM: + rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max); + break; + case NORMAL: + rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp); + break; + case POISSON: + double mean = Double.NaN; + try { + mean = Double.parseDouble(distParams); + } catch (NumberFormatException e) { + throw new DMLRuntimeException("Failed to parse Poisson distribution parameter: " + distParams); + } + rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max, mean); + break; + default: + throw new DMLRuntimeException("Unsupported probability distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", \"normal\", or \"poisson\""); + + } return rgen; } @@ -202,11 +203,11 @@ public class LibMatrixDatagen * (max-min)*prng.nextDouble() is still valid. This is done primarily to * share the same code across different distributions. */ - double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0; - double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1; + double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0; + double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._max : 1; // Special case shortcuts for efficiency - if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) { + if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) { if ( min == 0.0 && max == 0.0 ) { //all zeros out.reset(rows, cols, true); return; @@ -288,8 +289,8 @@ public class LibMatrixDatagen * (max-min)*prng.nextDouble() is still valid. This is done primarily to * share the same code across different distributions. */ - double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0; - double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1; + double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0; + double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1; //determine the sparsity of output matrix (multi-threaded always invoked from CP): //estimated NNZ is for entire matrix (nnz=0, if 0 initialized) @@ -304,7 +305,7 @@ public class LibMatrixDatagen } //special case shortcuts for efficiency - if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) { + if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) { if ( min == 0.0 && max == 0.0 ) { //all zeros out.reset(rows, cols, false); return; @@ -497,8 +498,8 @@ public class LibMatrixDatagen int cpb = rgen._colsPerBlock; double sparsity = rgen._sparsity; PRNGenerator valuePRNG = rgen._valuePRNG; - double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0; - double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1; + double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0; + double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1; double range = max - min; int clen = out.clen; int estimatedNNzsPerRow = out.estimatedNNzsPerRow; @@ -510,14 +511,19 @@ public class LibMatrixDatagen // Setup Pseudo Random Number Generator for cell values based on 'pdf'. 
if (valuePRNG == null) { - if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) + switch (rgen._pdf) { + case UNIFORM: valuePRNG = new UniformPRNGenerator(); - else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_NORMAL)) + break; + case NORMAL: valuePRNG = new NormalPRNGenerator(); - else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_POISSON)) + break; + case POISSON: valuePRNG = new PoissonPRNGenerator(); - else + break; + default: throw new DMLRuntimeException("Unsupported distribution function for Rand: " + rgen._pdf); + } } // loop through row-block indices http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java index 1dc818c..b4c6c95 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java @@ -26,8 +26,15 @@ import org.apache.sysml.runtime.util.PoissonPRNGenerator; import org.apache.sysml.runtime.util.UniformPRNGenerator; public class RandomMatrixGenerator { - - String _pdf; + + /** + * Types of Probability density functions + */ + enum PDF { + NORMAL, UNIFORM, POISSON + } + + PDF _pdf; int _rows, _cols, _rowsPerBlock, _colsPerBlock; double _sparsity, _mean; double _min, _max; @@ -36,25 +43,76 @@ public class RandomMatrixGenerator { public RandomMatrixGenerator() { - _pdf = ""; + _pdf = PDF.UNIFORM; _rows = _cols = _rowsPerBlock = _colsPerBlock = -1; _sparsity = 0.0; _min = _max = Double.NaN; _valuePRNG = null; _mean = 1.0; } - - public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp) throws DMLRuntimeException + + /** + * Instantiates a Random number generator + * @param pdf probability density 
function + * @param r number of rows + * @param c number of columns + * @param rpb rows per block + * @param cpb columns per block + * @param sp sparsity (0 = completely sparse, 1 = completely dense) + * @throws DMLRuntimeException if error + */ + public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp) throws DMLRuntimeException { this(pdf, r, c, rpb, cpb, sp, Double.NaN, Double.NaN); } - - public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException + + /** + * Instantiates a Random number generator + * @param pdfStr probability density function + * @param r number of rows + * @param c number of columns + * @param rpb rows per block + * @param cpb columns per block + * @param sp sparsity (0 = completely sparse, 1 = completely dense) + * @param min minimum of range of random numbers + * @param max maximum of range of random numbers + * @throws DMLRuntimeException if error + */ + public RandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException + { + init(PDF.valueOf(pdfStr.toUpperCase()), r, c, rpb, cpb, sp, min, max); + } + + /** + * Instantiates a Random number generator + * @param pdf probability density function + * @param r number of rows + * @param c number of columns + * @param rpb rows per block + * @param cpb columns per block + * @param sp sparsity (0 = completely sparse, 1 = completely dense) + * @param min minimum of range of random numbers + * @param max maximum of range of random numbers + * @throws DMLRuntimeException if error + */ + public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException { init(pdf, r, c, rpb, cpb, sp, min, max); } - - public void init(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException + + /** + * Initializes internal data structures. 
Called by Constructor + * @param pdf probability density function + * @param r number of rows + * @param c number of columns + * @param rpb rows per block + * @param cpb columns per block + * @param sp sparsity (0 = completely sparse, 1 = completely dense) + * @param min minimum of range of random numbers + * @param max maximum of range of random numbers + * @throws DMLRuntimeException if error + */ + public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException { _pdf = pdf; _rows = r; @@ -67,13 +125,39 @@ public class RandomMatrixGenerator { setupValuePRNG(); } - - public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException + + /** + * Instantiates a Random number generator with a specific poisson mean + * @param pdf probability density function + * @param r number of rows + * @param c number of columns + * @param rpb rows per block + * @param cpb columns per block + * @param sp sparsity (0 = completely sparse, 1 = completely dense) + * @param min minimum of range of random numbers + * @param max maximum of range of random numbers + * @param mean the poisson mean + * @throws DMLRuntimeException if error + */ + public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException { init(pdf, r, c, rpb, cpb, sp, min, max, mean); } - - public void init(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException + + /** + * Instantiates a Random number generator with a specific poisson mean + * @param pdf probability density function + * @param r number of rows + * @param c number of columns + * @param rpb rows per block + * @param cpb columns per block + * @param sp sparsity (0 = completely sparse, 1 = completely dense) + * @param min minimum of range of random numbers + * @param max maximum 
of range of random numbers + * @param mean the poisson mean + * @throws DMLRuntimeException if error + */ + public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException { _pdf = pdf; _rows = r; @@ -89,15 +173,20 @@ public class RandomMatrixGenerator { protected void setupValuePRNG() throws DMLRuntimeException { - if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_NORMAL) ) + switch (_pdf) { + case NORMAL: _valuePRNG = new NormalPRNGenerator(); - else if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_UNIFORM) ) + break; + case UNIFORM: _valuePRNG = new UniformPRNGenerator(); - else if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_POISSON) ) - { + break; + case POISSON: if(_mean <= 0) throw new DMLRuntimeException("Invalid parameter (" + _mean + ") for Poisson distribution."); _valuePRNG = new PoissonPRNGenerator(_mean); + break; + default: + throw new DMLRuntimeException("Unsupported probability density function"); } } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java new file mode 100644 index 0000000..0b229f0 --- /dev/null +++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.test.gpu; + +import org.apache.sysml.test.utils.TestUtils; +import org.junit.Test; + +/** + * Tests Aggregate Unary ops + */ +public class AggregateUnaryOpTests extends UnaryOpTestsBase { + + private final static String TEST_NAME = "AggregateUnaryOpTests"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_DIR, TEST_NAME); + getAndLoadTestConfiguration(TEST_NAME); + } + + @Test + public void sum() { + testSimpleUnaryOpMatrixOutput("sum", "gpu_uak+"); + } + + @Test + public void colSums() { + testSimpleUnaryOpMatrixOutput("colSums", "gpu_uack+"); + } + + @Test + public void rowSums() { + testSimpleUnaryOpMatrixOutput("rowSums", "gpu_uark+"); + } + + @Test + public void mult() { + testSimpleUnaryOpMatrixOutput("prod", "gpu_ua*"); + } + + @Test + public void mean() { + testSimpleUnaryOpMatrixOutput("mean", "gpu_uamean"); + } + + @Test + public void colMeans() { + testSimpleUnaryOpMatrixOutput("colMeans", "gpu_uacmean"); + } + + @Test + public void rowMeans() { + testSimpleUnaryOpMatrixOutput("rowMeans", "gpu_uarmean"); + } + + @Test + public void max() { + testSimpleUnaryOpMatrixOutput("max", "gpu_uamax"); + } + + @Test + public void rowMaxs() { + testSimpleUnaryOpMatrixOutput("rowMaxs", "gpu_uarmax"); + } + + @Test + public void colMaxs() { + testSimpleUnaryOpMatrixOutput("colMaxs", "gpu_uacmax"); + } + + @Test + public void min() { + testSimpleUnaryOpMatrixOutput("min", "gpu_uamin"); + } + + @Test + public void rowMins() { + 
testSimpleUnaryOpMatrixOutput("rowMins", "gpu_uarmin"); + } + + @Test + public void colMins() { + testSimpleUnaryOpMatrixOutput("colMins", "gpu_uacmin"); + } + + @Test + public void var() { + testSimpleUnaryOpMatrixOutput("var", "gpu_uavar"); + } + + @Test + public void colVars() { + testSimpleUnaryOpMatrixOutput("colVars", "gpu_uacvar"); + } + + @Test + public void rowVars() { + testSimpleUnaryOpMatrixOutput("rowVars", "gpu_uarvar"); + } + + @Test + public void sumsq() { + testUnaryOpMatrixOutput("out = sum(in1*in1)", "gpu_uasqk+", "in1", "out"); + } + + @Test + public void rowSumsqs() { + testUnaryOpMatrixOutput("out = rowSums(in1*in1)", "gpu_uarsqk+", "in1", "out"); + } + + @Test + public void colSumsqs() { + testUnaryOpMatrixOutput("out = colSums(in1*in1)", "gpu_uacsqk+", "in1", "out"); + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java new file mode 100644 index 0000000..f3d2b21 --- /dev/null +++ b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.test.gpu; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +import org.apache.sysml.api.mlcontext.Matrix; +import org.apache.sysml.test.utils.TestUtils; +import org.junit.Test; + +/** + * Tests builtin binary ops on GPU + */ +public class BinaryOpTests extends GPUTests { + + private final static String TEST_NAME = "BinaryOpTests"; + private final int seed = 42; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_DIR, TEST_NAME); + getAndLoadTestConfiguration(TEST_NAME); + } + + @Test + public void testSolve() { + // Test Ax = b + // Dimensions of A = m * n + // Dimensions of x = n * 1 + // Dimensions of b = m * 1 + + double sparsity = 1.0; // Only dense matrices supported by "solve" + final int[] sides = { 32, 33, 128, 256, 513, 2049 }; + for (int i = 0; i < sides.length; i++) { + for (int j = i; j < sides.length; j++) { + int m = sides[j]; + int n = sides[i]; + runSolveTest(sparsity, m, n); + } + } + + } + + /** + * Runs the test for solve (Ax = b) for input with given dimensions and sparsities + * A can be overdetermined (rows in A > columns in A) + * + * @param sparsity sparsity for the block A and b + * @param m rows in A + * @param n columns in A + */ + protected void runSolveTest(double sparsity, int m, int n) { + String scriptStr = "x = solve(A, b)"; + System.out.println("In solve, A[" + m + ", " + n + "], b[" + m + ", 1]"); + Matrix A = generateInputMatrix(spark, m, n, sparsity, seed); + Matrix b = generateInputMatrix(spark, m, 1, sparsity, seed); + HashMap<String, Object> inputs = new HashMap<>(); + inputs.put("A", A); + inputs.put("b", b); + List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList("x")); + List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList("x")); + 
assertHeavyHitterPresent("gpu_solve"); + assertEqualObjects(outCPU.get(0), outGPU.get(0)); + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/GPUTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java new file mode 100644 index 0000000..8af9104 --- /dev/null +++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.test.gpu; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.spark.sql.SparkSession; +import org.apache.sysml.api.mlcontext.MLContext; +import org.apache.sysml.api.mlcontext.Matrix; +import org.apache.sysml.api.mlcontext.Script; +import org.apache.sysml.api.mlcontext.ScriptFactory; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.test.integration.AutomatedTestBase; +import org.apache.sysml.utils.Statistics; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; + +/** + * Parent class for all GPU tests + */ +public abstract class GPUTests extends AutomatedTestBase { + + protected final static String TEST_DIR = "org/apache/sysml/api/mlcontext"; + protected static SparkSession spark; + protected final double THRESHOLD = 1e-9; // for relative error + + @BeforeClass + public static void beforeClass() { + spark = createSystemMLSparkSession("GPUTests", "local"); + } + + @AfterClass + public static void afterClass() { + spark.close(); + } + + /** + * Gets threshold for relative error in tests + * + * @return a valid threshold + */ + protected double getTHRESHOLD() { + return THRESHOLD; + } + + @After + public void tearDown() { + clearGPUMemory(); + super.tearDown(); + } + + /** + * Clear out the memory on all GPUs + */ + protected void clearGPUMemory() { + try { + int count = GPUContextPool.getDeviceCount(); + int freeCount = GPUContextPool.getAvailableCount(); + Assert.assertTrue("All GPUContexts have not been returned to the GPUContextPool", count == freeCount); + ArrayList<GPUContext> gpuContexts = new ArrayList<>(); + for (int i = 0; i < count; i++) { + GPUContext gCtx = 
GPUContextPool.getFromPool(); + gCtx.initializeThread(); + gCtx.clearMemory(); + gpuContexts.add(gCtx); + } + for (GPUContext gCtx : gpuContexts) { + GPUContextPool.returnToPool(gCtx); + } + + } catch (DMLRuntimeException e) { + // Ignore + } + } + + /** + * Generates a random input matrix with a given size and sparsity + * + * @param spark valid instance of {@link SparkSession} + * @param m number of rows + * @param n number of columns + * @param sparsity sparsity (1 = completely dense, 0 = completely sparse) + * @return a random matrix with given size and sparsity + */ + protected Matrix generateInputMatrix(SparkSession spark, int m, int n, double sparsity, int seed) { + // Generate a random matrix of size m * n + MLContext genMLC = new MLContext(spark); + String scriptStr; + if (sparsity == 0.0) { + scriptStr = "in1 = matrix(0, rows=" + m + ", cols=" + n + ")"; + } else { + scriptStr = "in1 = rand(rows=" + m + ", cols=" + n + ", sparsity = " + sparsity + ", seed= " + seed + + ", min=-1.0, max=1.0)"; + } + Script generateScript = ScriptFactory.dmlFromString(scriptStr).out("in1"); + Matrix in1 = genMLC.execute(generateScript).getMatrix("in1"); + genMLC.close(); + return in1; + } + + /** + * Asserts that the values in two matrices are in {@link UnaryOpTests#THRESHOLD} of each other + * + * @param expected expected matrix + * @param actual actual matrix + */ + private void assertEqualMatrices(Matrix expected, Matrix actual) { + try { + MatrixBlock expectedMB = expected.toMatrixObject().acquireRead(); + MatrixBlock actualMB = actual.toMatrixObject().acquireRead(); + + long rows = expectedMB.getNumRows(); + long cols = expectedMB.getNumColumns(); + Assert.assertEquals(rows, actualMB.getNumRows()); + Assert.assertEquals(cols, actualMB.getNumColumns()); + + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + double expectedDouble = expectedMB.quickGetValue(i, j); + double actualDouble = actualMB.quickGetValue(i, j); + if (expectedDouble != 0.0 && 
!Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) { + double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble); + Assert.assertTrue("Comparing floating point numbers, relative error(" + relativeError + + ") is more than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD()); + } else { + Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD()); + } + } + } + expected.toMatrixObject().release(); + actual.toMatrixObject().release(); + } catch (DMLRuntimeException e) { + throw new RuntimeException(e); + } + } + + /** + * asserts that the expected op was executed + * + * @param heavyHitterOpCode opcode of the heavy hitter for the unary op + */ + protected void assertHeavyHitterPresent(String heavyHitterOpCode) { + Set<String> heavyHitterOpCodes = Statistics.getCPHeavyHitterOpCodes(); + Assert.assertTrue(heavyHitterOpCodes.contains(heavyHitterOpCode)); + } + + /** + * Runs a program on the CPU + * + * @param spark a valid {@link SparkSession} + * @param scriptStr the script to run (as a string) + * @param inputs map of input variables names in the scriptStr (of variable_name -> object) + * @param outStrs list of variable names needed as output from the scriptStr + * @return list of output objects in order of outStrs + */ + protected List<Object> runOnCPU(SparkSession spark, String scriptStr, Map<String, Object> inputs, + List<String> outStrs) { + MLContext cpuMLC = new MLContext(spark); + List<Object> outputs = new ArrayList<>(); + Script script = ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs); + for (String outStr : outStrs) { + Object output = cpuMLC.execute(script).get(outStr); + outputs.add(output); + } + cpuMLC.close(); + return outputs; + } + + /** + * Runs a program on the GPU + * + * @param spark a valid {@link SparkSession} + * @param scriptStr the script to run (as a string) + * @param inputs map of input variables names in the scriptStr (of variable_name -> object) + * @param 
outStrs list of variable names needed as output from the scriptStr + * @return list of output objects in order of outStrs + */ + protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs, + List<String> outStrs) { + MLContext gpuMLC = new MLContext(spark); + gpuMLC.setGPU(true); + gpuMLC.setForceGPU(true); + gpuMLC.setStatistics(true); + List<Object> outputs = new ArrayList<>(); + Script script = ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs); + for (String outStr : outStrs) { + Object output = gpuMLC.execute(script).get(outStr); + outputs.add(output); + } + gpuMLC.close(); + return outputs; + } + + /** + * Assert that the two objects are equal. Supported types are Boolean, Integer, String, Double and Matrix + * + * @param expected + * @param actual + */ + protected void assertEqualObjects(Object expected, Object actual) { + Assert.assertEquals(expected.getClass(), actual.getClass()); + + if (expected instanceof Boolean) { + Assert.assertEquals(((Boolean) expected).booleanValue(), ((Boolean) actual).booleanValue()); + } else if (expected instanceof Double) { + double expectedDouble = ((Double) expected).doubleValue(); + double actualDouble = ((Double) actual).doubleValue(); + if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) { + double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble); + Assert.assertTrue("Comparing floating point numbers, relative error(" + relativeError + + ") is more than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD()); + } else { + Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD()); + } + } else if (expected instanceof String) { + Assert.assertEquals(expected.toString(), actual.toString()); + } else if (expected instanceof Integer) { + Assert.assertEquals(((Integer) expected).intValue(), ((Integer) actual).intValue()); + } else if (expected instanceof Matrix) + 
assertEqualMatrices((Matrix) expected, (Matrix) actual); + else { + Assert.fail("Invalid types for comparison"); + } + } +}