This is an automated email from the ASF dual-hosted git repository.

niketanpansare pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
     new 7fba4b2  [SYSTEMML-540] Added ternary aggregate operators for GPU backend
7fba4b2 is described below

commit 7fba4b29d653747a9ed038d282954a44fea3031c
Author:     Niketan Pansare <npan...@us.ibm.com>
AuthorDate: Sun Mar 24 09:06:55 2019 -0700

    [SYSTEMML-540] Added ternary aggregate operators for GPU backend

    - Also added steps to upload SystemML's Python package to PyPI.
---
 docs/release-process.md                            |  25 +++-
 .../java/org/apache/sysml/hops/AggUnaryOp.java     |  11 +-
 .../runtime/instructions/GPUInstructionParser.java |   7 ++
 .../gpu/AggregateTernaryGPUInstruction.java        | 130 +++++++++++++++++++++
 .../runtime/instructions/gpu/GPUInstruction.java   |   1 +
 .../sysml/runtime/matrix/data/LibMatrixCUDA.java   |  13 ++-
 .../sysml/test/gpu/AggregateTernaryTests.java      |  57 +++++++++
 .../sysml/test/gpu/AggregateUnaryOpTests.java      |   1 +
 .../apache/sysml/test/gpu/UnaryOpTestsBase.java    |  18 +++
 9 files changed, 250 insertions(+), 13 deletions(-)

diff --git a/docs/release-process.md b/docs/release-process.md
index 2477cd0..c50a27e 100644
--- a/docs/release-process.md
+++ b/docs/release-process.md
@@ -388,7 +388,7 @@ file and remove all the `@Ignore` annotations from all the tests. Then run the N
 	# Run other GPU Unit Tests
 	rm result.txt
-	for t in AggregateUnaryOpTests BinaryOpTests MatrixMatrixElementWiseOpTests RightIndexingTests AppendTest MatrixMultiplicationOpTest ReorgOpTests ScalarMatrixElementwiseOpTests UnaryOpTests LstmTest LstmCPUTest
+	for t in AggregateUnaryOpTests AggregateTernaryTests BinaryOpTests MatrixMatrixElementWiseOpTests RightIndexingTests AppendTest MatrixMultiplicationOpTest ReorgOpTests ScalarMatrixElementwiseOpTests UnaryOpTests LstmTest LstmCPUTest
 	do
 		mvn -Dit.test="org.apache.sysml.test.gpu."$t verify -PgpuTests &> tmp.txt
 		SUCCESS=`grep "BUILD SUCCESS" tmp.txt`
@@ -503,8 +503,23 @@ The versioned project documentation is now deployed to the main website, and the
 
 ## Update Crawler configuration for the search indexing
 
-Create a PR or an issue to update the version number in the crawler configuration.
-Please see the `start_urls` tag in the file [https://github.com/algolia/docsearch-configs/blob/master/configs/apache_systemml.json](https://github.com/algolia/docsearch-configs/blob/master/configs/apache_systemml.json).
-If the Algolia team provides us an updated `apiKey` or `indexName` credentials, then please update the corresponding entries in the file
+- Create a PR or an issue to update the version number in the crawler configuration. Please see the `start_urls` tag in the file [https://github.com/algolia/docsearch-configs/blob/master/configs/apache_systemml.json](https://github.com/algolia/docsearch-configs/blob/master/configs/apache_systemml.json).
+- If the Algolia team provides us with updated `apiKey` or `indexName` credentials, then please update the corresponding entries in the file
 [https://github.com/apache/systemml/blob/master/docs/_layouts/global.html](https://github.com/apache/systemml/blob/master/docs/_layouts/global.html)
-(see for `Algolia search section` in the previously mentioned HTML file).
\ No newline at end of file
+(look for the `Algolia search section` in the previously mentioned HTML file).
+
+## Upload Python package to PyPI
+
+Download the released `systemml-*-python.tar.gz` and its signature `systemml-*-python.tar.gz.asc`.
+
+    $ wget https://dist.apache.org/repos/dist/release/systemml/1.0.0/systemml-1.0.0-python.tar.gz
+    $ wget https://dist.apache.org/repos/dist/release/systemml/1.0.0/systemml-1.0.0-python.tar.gz.asc
+
+Rename the files to remove the `-python` suffix.
+
+    $ mv systemml-1.0.0-python.tar.gz systemml-1.0.0.tar.gz
+    $ mv systemml-1.0.0-python.tar.gz.asc systemml-1.0.0.tar.gz.asc
+
+Upload the Python package to PyPI using [twine](https://pypi.org/project/twine/).
+
+    $ twine upload -u systemml systemml-1.0.0.tar.gz systemml-1.0.0.tar.gz.asc
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 48d18b7..92ec22c 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -93,9 +93,12 @@ public class AggUnaryOp extends MultiThreadedHop
             return false;
         try {
-            if( isTernaryAggregateRewriteApplicable() || isUnaryAggregateOuterCPRewriteApplicable() ) {
+            if(isUnaryAggregateOuterCPRewriteApplicable()) {
                 return false;
             }
+            else if(isTernaryAggregateRewriteApplicable()) {
+                return true;
+            }
             else if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
                 || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
                 || (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
@@ -498,10 +501,6 @@ public class AggUnaryOp extends MultiThreadedHop
     {
         boolean ret = false;
 
-        // TODO: Disable ternary aggregate rewrite on GPU backend.
-        if(!ConfigurationManager.isGPU())
-            return false;
-
         //currently we support only sum over binary multiply but potentially
         //it can be generalized to any RC aggregate over two common binary operations
         if( OptimizerUtils.ALLOW_SUM_PRODUCT_REWRITES && _op == AggOp.SUM &&
@@ -713,8 +712,6 @@ public class AggUnaryOp extends MultiThreadedHop
         // The execution type of a unary aggregate instruction should depend on the execution type of inputs to avoid OOM
         // Since we only support matrix-vector and not vector-matrix, checking the execution type of input1 should suffice.
         ExecType et_input = input1.optFindExecType();
-        // Because ternary aggregate are not supported on GPU
-        et_input = et_input == ExecType.GPU ? ExecType.CP : et_input;
         DirectionTypes dir = HopsDirection2Lops.get(_direction);
         return new TernaryAggregate(in1, in2, in3, Aggregate.OperationTypes.KahanSum,
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 20058de..aabb36f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -23,6 +23,7 @@ import java.util.HashMap;
 import org.apache.sysml.lops.RightIndex;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.AggregateTernaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ArithmeticBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.BuiltinBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction;
@@ -43,6 +44,9 @@ public class GPUInstructionParser extends InstructionParser
     static final HashMap<String, GPUINSTRUCTION_TYPE> String2GPUInstructionType;
     static {
         String2GPUInstructionType = new HashMap<>();
+
+        String2GPUInstructionType.put( "tak+*" , GPUINSTRUCTION_TYPE.AggregateTernary);
+        String2GPUInstructionType.put( "tack+*" , GPUINSTRUCTION_TYPE.AggregateTernary);
 
         // Neural Network Operators
         String2GPUInstructionType.put( "relu_backward", GPUINSTRUCTION_TYPE.Dnn);
@@ -179,6 +183,9 @@ public class GPUInstructionParser extends InstructionParser
         switch(gputype) {
             case AggregateUnary:
                 return AggregateUnaryGPUInstruction.parseInstruction(str);
+
+            case AggregateTernary:
+                return AggregateTernaryGPUInstruction.parseInstruction(str);
 
             case AggregateBinary:
                 return AggregateBinaryGPUInstruction.parseInstruction(str);
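For context, the two opcodes mapped above compute `sum(in1*in2*in3)` (`tak+*`, scalar output) and `colSums(in1*in2*in3)` (`tack+*`, row-vector output) over element-wise products, as the tests later in this commit show. The following is a minimal CPU-side reference sketch of that semantics in plain Java — a hypothetical helper that is not part of this commit, assuming dense row-major arrays and plain accumulation rather than the Kahan-compensated sums the real operators use.

    // Reference semantics for the new GPU opcodes (illustrative only).
    final class TernaryAggregateReference {
        // tak+* : scalar sum over the element-wise product of three matrices
        static double tak(double[] x, double[] y, double[] z) {
            double acc = 0;
            for (int i = 0; i < x.length; i++)
                acc += x[i] * y[i] * z[i];
            return acc;
        }
        // tack+* : column sums over the element-wise product of three
        // rlen x clen matrices, returning a 1 x clen row vector
        static double[] tack(double[] x, double[] y, double[] z, int rlen, int clen) {
            double[] colSums = new double[clen];
            for (int i = 0; i < rlen; i++)
                for (int j = 0; j < clen; j++)
                    colSums[j] += x[i * clen + j] * y[i * clen + j] * z[i * clen + j];
            return colSums;
        }
    }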
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateTernaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateTernaryGPUInstruction.java
new file mode 100644
index 0000000..53eab47
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateTernaryGPUInstruction.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.DoubleObject;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.AggregateTernaryOperator;
+import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+
+public class AggregateTernaryGPUInstruction extends GPUInstruction {
+
+    private CPOperand _input1 = null;
+    private CPOperand _input2 = null;
+    private CPOperand _input3 = null;
+    private CPOperand _output = null;
+
+    private AggregateTernaryGPUInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out,
+            String opcode, String istr) {
+        super(op, opcode, istr);
+        _gputype = GPUINSTRUCTION_TYPE.AggregateTernary;
+        _input1 = in1;
+        _input2 = in2;
+        _input3 = in3;
+        _output = out;
+    }
+
+    public static AggregateTernaryGPUInstruction parseInstruction( String str ) {
+        String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+        String opcode = parts[0];
+
+        if ( opcode.equalsIgnoreCase("tak+*") || opcode.equalsIgnoreCase("tack+*") ) {
+            InstructionUtils.checkNumFields( parts, 4 );
+
+            CPOperand in1 = new CPOperand(parts[1]);
+            CPOperand in2 = new CPOperand(parts[2]);
+            CPOperand in3 = new CPOperand(parts[3]);
+            CPOperand out = new CPOperand(parts[4]);
+
+            AggregateTernaryOperator op = InstructionUtils.parseAggregateTernaryOperator(opcode, 1);
+            return new AggregateTernaryGPUInstruction(op, in1, in2, in3, out, opcode, str);
+        }
+        else {
+            throw new DMLRuntimeException("AggregateTernaryGPUInstruction.parseInstruction():: Unknown opcode " + opcode);
+        }
+    }
+
+    @Override
+    public void processInstruction(ExecutionContext ec) {
+        GPUStatistics.incrementNoOfExecutedGPUInst();
+        GPUContext gCtx = ec.getGPUContext(0);
+        String instName = getExtendedOpcode();
+        AggregateTernaryOperator ab_op = (AggregateTernaryOperator) _optr;
+        MatrixObject in1 = getMatrixInputForGPUInstruction(ec, _input1.getName());
+        MatrixObject in2 = getMatrixInputForGPUInstruction(ec, _input2.getName());
+
+        BinaryOperator bop = new BinaryOperator(Multiply.getMultiplyFnObject());
+
+        int rlenA = LibMatrixCUDA.toInt(in1.getNumRows());
+        int rlenB = LibMatrixCUDA.toInt(in2.getNumRows());
+        int clenA = LibMatrixCUDA.toInt(in1.getNumColumns());
+        int clenB = LibMatrixCUDA.toInt(in2.getNumColumns());
+        int rlenOut = Math.max(rlenA, rlenB);
+        int clenOut = Math.max(clenA, clenB);
+        int sizeOfOutput = rlenOut*clenOut;
+        Pointer out = gCtx.allocate(instName, sizeOfOutput*LibMatrixCUDA.sizeOfDataType);
+
+        // out = in1 * in2
+        Pointer A = LibMatrixCUDA.getDensePointer(gCtx, in1, instName);
+        Pointer B = LibMatrixCUDA.getDensePointer(gCtx, in2, instName);
+        LibMatrixCUDA.denseMatrixMatrixOp(gCtx, instName, A, B, rlenA, clenA, rlenB, clenB, out, bop);
+        ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+        ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+
+        if(!_input3.isLiteral()) {
+            // out = out * in3
+            MatrixObject in3 = getMatrixInputForGPUInstruction(ec, _input3.getName());
+            rlenB = LibMatrixCUDA.toInt(in3.getNumRows());
+            clenB = LibMatrixCUDA.toInt(in3.getNumColumns());
+            if(rlenB*clenB > sizeOfOutput) {
+                throw new DMLRuntimeException("Matrix-vector AggregateTernaryGPUInstruction is not supported.");
+            }
+            B = LibMatrixCUDA.getDensePointer(gCtx, in3, instName);
+            LibMatrixCUDA.denseMatrixMatrixOp(gCtx, instName, out, B, rlenA, clenA, rlenB, clenB, out, bop);
+            ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+        }
+
+        if( _output.getDataType().isScalar() ) {
+            // sum( in1*in2*in3 )
+            double result = LibMatrixCUDA.reduceAll(gCtx, instName, "reduce_sum", out, sizeOfOutput);
+            ec.setScalarOutput(_output.getName(), new DoubleObject(result));
+        }
+        else {
+            // colSum( in1*in2*in3 )
+            Pointer out1 = LibMatrixCUDA.getDensePointer(gCtx,
+                LibMatrixCUDA.getDenseMatrixOutputForGPUInstruction(ec, instName, _output.getName(), 1, clenOut), instName);
+            LibMatrixCUDA.reduceCol(gCtx, instName, "reduce_col_sum", out, out1, rlenOut, clenOut);
+            ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+        }
+
+        gCtx.cudaFreeHelper(instName, out, gCtx.EAGER_CUDA_FREE);
+    }
+}
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 7f3b017..8b703e6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -36,6 +36,7 @@ import org.apache.sysml.utils.Statistics;
 public abstract class GPUInstruction extends Instruction {
     public enum GPUINSTRUCTION_TYPE {
         AggregateUnary,
+        AggregateTernary,
         AggregateBinary,
         RelationalBinary,
         Dnn,
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index fd06578..657143a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -984,7 +984,7 @@ public class LibMatrixCUDA {
      * @param n size of array
      * @return the reduced value
      */
-    private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) {
+    public static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) {
         if(LOG.isTraceEnabled()) {
             LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
         }
@@ -1530,6 +1530,17 @@ public class LibMatrixCUDA {
                 a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
         if (ConfigurationManager.isFinegrainedStatistics()) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0);
     }
+
+    public static void denseMatrixMatrixOp(GPUContext gCtx, String instName,
+            Pointer A, Pointer B,
+            int rlenA, int clenA, int rlenB, int clenB,
+            Pointer C, BinaryOperator op) {
+        int vecStatusA = LibMatrixCUDA.getVectorStatus(rlenA, clenA).code();
+        int vecStatusB = LibMatrixCUDA.getVectorStatus(rlenB, clenB).code();
+        int maxRlen = Math.max(rlenA, rlenB);
+        int maxClen = Math.max(clenA, clenB);
+        matrixMatrixOp(gCtx, instName, A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op);
+    }
 
     /**
      * This enum declares the different vector shapes
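The new `denseMatrixMatrixOp` overload only computes the vector-status codes and the broadcast output shape before delegating to the existing `matrixMatrixOp` kernel dispatch. Assuming the cell-wise kernel replicates row/column vectors to the larger operand's shape (as the CP backend does), a rough host-side Java sketch of the multiply case follows; the helper is hypothetical and for illustration only, since the real work happens in the CUDA kernel.

    // Illustrative broadcast multiply mirroring the shape logic of
    // denseMatrixMatrixOp: vector operands are replicated to the max shape.
    static double[] broadcastMultiply(double[] a, int rlenA, int clenA,
                                      double[] b, int rlenB, int clenB) {
        int maxRlen = Math.max(rlenA, rlenB);
        int maxClen = Math.max(clenA, clenB);
        double[] c = new double[maxRlen * maxClen];
        for (int i = 0; i < maxRlen; i++)
            for (int j = 0; j < maxClen; j++) {
                // a 1 x clen row vector or rlen x 1 column vector reuses
                // its single row/column for every output cell
                double av = a[(rlenA == 1 ? 0 : i) * clenA + (clenA == 1 ? 0 : j)];
                double bv = b[(rlenB == 1 ? 0 : i) * clenB + (clenB == 1 ? 0 : j)];
                c[i * maxClen + j] = av * bv;
            }
        return c;
    }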
diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateTernaryTests.java b/src/test/java/org/apache/sysml/test/gpu/AggregateTernaryTests.java
new file mode 100644
index 0000000..578eb26
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/AggregateTernaryTests.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests Ternary Aggregate ops
+ */
+public class AggregateTernaryTests extends UnaryOpTestsBase {
+
+    private final static String TEST_NAME = "AggregateTernaryTests";
+
+    @Override
+    public void setUp() {
+        super.setUp();
+        TestUtils.clearAssertionInformation();
+        addTestConfiguration(TEST_DIR, TEST_NAME);
+        getAndLoadTestConfiguration(TEST_NAME);
+    }
+
+    @Test
+    public void ternaryAgg1() {
+        testTernaryUnaryOpMatrixOutput("out = sum(in1*in2*in3)", "gpu_tak+*", "in1", "in2", "in3", "out", 30, 40, 0.9);
+    }
+
+    @Test
+    public void ternaryAgg2() {
+        testTernaryUnaryOpMatrixOutput("out = colSums(in1*in2*in3)", "gpu_tack+*", "in1", "in2", "in3", "out", 30, 40, 0.9);
+    }
+
+    @Test
+    public void ternaryAgg3() {
+        testTernaryUnaryOpMatrixOutput("out = sum(in1*in2*in3)", "gpu_tak+*", "in1", "in2", "in3", "out", 30, 40, 0.2);
+    }
+
+    @Test
+    public void ternaryAgg4() {
+        testTernaryUnaryOpMatrixOutput("out = colSums(in1*in2*in3)", "gpu_tack+*", "in1", "in2", "in3", "out", 30, 40, 0.2);
+    }
+}
diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
index 78a7c1b..ee6af94 100644
--- a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
@@ -162,4 +162,5 @@ public class AggregateUnaryOpTests extends UnaryOpTestsBase {
     public void colSumsqs() {
         testUnaryOpMatrixOutput("out = colSums(in1*in1)", "gpu_uacsqk+", "in1", "out");
     }
+
 }
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
index 0f6b59c..1726ca7 100644
--- a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
@@ -102,5 +102,23 @@ public abstract class UnaryOpTestsBase extends GPUTests {
         //assertHeavyHitterPresent(heavyHitterOpCode);
         assertEqualObjects(outCPU.get(0), outGPU.get(0));
     }
+
+    public void testTernaryUnaryOpMatrixOutput(String scriptStr, String heavyHitterOpCode,
+            String inStr1, String inStr2, String inStr3,
+            String outStr,
+            int row, int column, double sparsity) {
+        int seed = 99;
+        Matrix in1 = generateInputMatrix(spark, row, column, sparsity, seed);
+        Matrix in2 = generateInputMatrix(spark, row, column, sparsity, seed);
+        Matrix in3 = generateInputMatrix(spark, row, column, sparsity, seed);
+        HashMap<String, Object> inputs = new HashMap<>();
+        inputs.put(inStr1, in1);
+        inputs.put(inStr2, in2);
+        inputs.put(inStr3, in3);
+        List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList(outStr));
+        List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList(outStr));
+        assertHeavyHitterPresent(heavyHitterOpCode);
+        assertEqualObjects(outCPU.get(0), outGPU.get(0));
+    }
 }