[systemml] branch master updated: [MINOR] Additional lineage parfor remote tests, and cleanups
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new c6d7a52 [MINOR] Additional lineage parfor remote tests, and cleanups c6d7a52 is described below commit c6d7a52e2e4259fa62ba8e0b15cdfe1397baac0f Author: Matthias Boehm AuthorDate: Tue Jun 23 22:46:05 2020 +0200 [MINOR] Additional lineage parfor remote tests, and cleanups This patch adds msvm w/ remote_spark parfor workers to the test suite and fixes missing support for tak+ operators in the recompute-by-lineage utility. --- scripts/builtin/l2svm.dml | 2 +- .../sysds/hops/ipa/FunctionCallSizeInfo.java | 9 ++-- .../sysds/runtime/lineage/LineageItemUtils.java| 25 ++--- .../functions/lineage/LineageTraceParforTest.java | 7 +++ .../functions/lineage/LineageTraceParforMSVM.dml | 61 ++ 5 files changed, 90 insertions(+), 14 deletions(-) diff --git a/scripts/builtin/l2svm.dml b/scripts/builtin/l2svm.dml index 3e251ae..f411fb9 100644 --- a/scripts/builtin/l2svm.dml +++ b/scripts/builtin/l2svm.dml @@ -72,7 +72,7 @@ m_l2svm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE # TODO make this a stop condition for l2svm instead of just printing. 
if(num_min + num_max != nrow(Y)) -print("L2SVM: WARNING invalid number of labels in Y") +print("L2SVM: WARNING invalid number of labels in Y: "+num_min+" "+num_max) # Scale inputs to -1 for negative, and 1 for positive classification if(check_min != -1 | check_max != +1) diff --git a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java index b349a5f..551ce98 100644 --- a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java +++ b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java @@ -233,14 +233,11 @@ public class FunctionCallSizeInfo && h1.getDim1()==h2.getDim1() && h1.getDim2()==h2.getDim2() && h1.getNnz()==h2.getNnz() ); - //check literal values (equi value) - if( h1 instanceof LiteralOp ) { - consistent &= (h2 instanceof LiteralOp + //check literal values (both needs to be literals and same value) + if( h1 instanceof LiteralOp || h2 instanceof LiteralOp ) { + consistent &= (h1 instanceof LiteralOp && h2 instanceof LiteralOp && HopRewriteUtils.isEqualValue((LiteralOp)h1, (LiteralOp)h2)); } - else if(h2 instanceof LiteralOp) { - consistent = false; //h2 literal, but h1 not - } } } if( consistent ) diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java index 467bbc9..e659025 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java @@ -278,6 +278,24 @@ public class LineageItemUtils { operands.put(item.getId(), aggunary); break; } + case AggregateBinary: { + Hop input1 = operands.get(item.getInputs()[0].getId()); + Hop input2 = operands.get(item.getInputs()[1].getId()); + Hop aggbinary = HopRewriteUtils.createMatrixMultiply(input1, input2); + operands.put(item.getId(), aggbinary); + break; + } + case AggregateTernary: { +
[systemml] branch master updated: [SYSTEMDS-421] Fix IPA scalar propagation (inconsistent literals/vars)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new ff32c05 [SYSTEMDS-421] Fix IPA scalar propagation (inconsistent literals/vars) ff32c05 is described below commit ff32c05373af72da825713c74de3ddc4c46a2159 Author: Matthias Boehm AuthorDate: Tue Jun 23 21:41:32 2020 +0200 [SYSTEMDS-421] Fix IPA scalar propagation (inconsistent literals/vars) This patch fixes the logic of IPA scalar propagation into functions with multiple function calls. Similar to sizes, we check if literal function arguments have consistent values and propagate valid ones. However, this check had a logic problem of only checking if the first call was a literal. This missed cases where the first call had a scalar variable but the second call a valid scalar literal that could had been propagated individually. --- dev/Tasks.txt | 3 + .../sysds/hops/ipa/FunctionCallSizeInfo.java | 5 +- .../recompile/IPAConstantPropagationFunTest.java | 71 ++ .../functions/recompile/IPAFunctionArgs.dml| 109 + 4 files changed, 187 insertions(+), 1 deletion(-) diff --git a/dev/Tasks.txt b/dev/Tasks.txt index 689949c..c84a523 100644 --- a/dev/Tasks.txt +++ b/dev/Tasks.txt @@ -341,6 +341,9 @@ SYSTEMDS-410 Lineage Tracing, Reuse and Integration II * 413 Cache and reuse MultiReturnBuiltin instructionsOK * 414 New rewrite for PCA --> lmDS pipeline OK +SYSTEMDS-420 Compiler Improvements + * 421 Fix invalid IPA scalar propagation into functions OK + SYSTEMDS-500 Documentation Webpage Reintroduction * 501 Make Documentation webpage framework OK diff --git a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java index 7199d17..b349a5f 100644 --- a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java +++ 
b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java @@ -234,10 +234,13 @@ public class FunctionCallSizeInfo && h1.getDim2()==h2.getDim2() && h1.getNnz()==h2.getNnz() ); //check literal values (equi value) - if( h1 instanceof LiteralOp ){ + if( h1 instanceof LiteralOp ) { consistent &= (h2 instanceof LiteralOp && HopRewriteUtils.isEqualValue((LiteralOp)h1, (LiteralOp)h2)); } + else if(h2 instanceof LiteralOp) { + consistent = false; //h2 literal, but h1 not + } } } if( consistent ) diff --git a/src/test/java/org/apache/sysds/test/functions/recompile/IPAConstantPropagationFunTest.java b/src/test/java/org/apache/sysds/test/functions/recompile/IPAConstantPropagationFunTest.java new file mode 100644 index 000..efd0397 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/recompile/IPAConstantPropagationFunTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.recompile; + +import java.util.HashMap; + +import org.junit.Test; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +impo
[systemml] branch master updated: [SYSTEMDS-412] Fix lineage-based reuse for update-inplace indexing
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 3d876d3 [SYSTEMDS-412] Fix lineage-based reuse for update-inplace indexing 3d876d3 is described below commit 3d876d33ad019fe026799df540fc20a86ec4 Author: Matthias Boehm AuthorDate: Thu Jun 11 15:57:20 2020 +0200 [SYSTEMDS-412] Fix lineage-based reuse for update-inplace indexing This patch disabled lineage-based reuse for update-inplace left indexing operations as reuse would create incorrect results due to later in-place updates the change the cached data object. Furthermore, this patch also aims to make the codegen tests for robust wrt the surefire github action integration (less explain output). --- .../runtime/controlprogram/context/SparkExecutionContext.java | 8 ++-- .../java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java | 5 - .../org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java | 2 +- .../apache/sysds/test/functions/codegen/DAGCellwiseTmplTest.java | 2 +- .../org/apache/sysds/test/functions/codegen/MiscPatternTest.java | 2 +- .../org/apache/sysds/test/functions/codegen/MultiAggTmplTest.java | 2 +- .../apache/sysds/test/functions/codegen/OuterProdTmplTest.java| 2 +- .../org/apache/sysds/test/functions/codegen/RowAggTmplTest.java | 2 +- .../sysds/test/functions/codegen/RowConv2DOperationsTest.java | 2 +- .../sysds/test/functions/codegen/RowVectorComparisonTest.java | 2 +- .../apache/sysds/test/functions/codegen/SparseSideInputTest.java | 2 +- .../apache/sysds/test/functions/codegen/SumProductChainTest.java | 2 +- 12 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java index a1e2b92..11a4e93 100644 --- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java @@ -171,18 +171,14 @@ public class SparkExecutionContext extends ExecutionContext _spctx = null; } - public void close() - { + public void close() { synchronized( SparkExecutionContext.class ) { - if( _spctx != null ) - { + if( _spctx != null ) { //stop the spark context if existing _spctx.stop(); - //make sure stopped context is never used again _spctx = null; } - } } diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java index 48a512a..22964ba 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java @@ -25,6 +25,7 @@ import org.apache.sysds.runtime.controlprogram.context.ExecutionContext; import org.apache.sysds.runtime.instructions.Instruction; import org.apache.sysds.runtime.instructions.cp.ComputationCPInstruction; import org.apache.sysds.runtime.instructions.cp.ListIndexingCPInstruction; +import org.apache.sysds.runtime.instructions.cp.MatrixIndexingCPInstruction; import java.util.Comparator; @@ -151,7 +152,9 @@ public class LineageCacheConfig && !(inst instanceof ListIndexingCPInstruction); boolean rightop = (ArrayUtils.contains(REUSE_OPCODES, inst.getOpcode()) || (inst.getOpcode().equals("append") && isVectorAppend(inst, ec))); - return insttype && rightop; + boolean updateInplace = (inst instanceof MatrixIndexingCPInstruction) + && ec.getMatrixObject(((ComputationCPInstruction)inst).input1).getUpdateType().isInPlace(); + return insttype && rightop && !updateInplace; } private static boolean isVectorAppend(Instruction inst, ExecutionContext ec) { diff --git a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java 
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java index bd369c0..b1d184c 100644 --- a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java +++ b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java @@ -477,7 +477,7 @@ public class CellwiseTmplTest extends AutomatedTestBase
[systemml] branch master updated: [SYSTEMDS-412] Fix robustness lineage DAGs, parfor integration
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new e8c0a28 [SYSTEMDS-412] Fix robustness lineage DAGs, parfor integration e8c0a28 is described below commit e8c0a28c95b9a22f2a023715a3717c36528bd3ab Author: Matthias Boehm AuthorDate: Thu Jun 11 14:08:13 2020 +0200 [SYSTEMDS-412] Fix robustness lineage DAGs, parfor integration This patch makes further robustness improvements to the handling of large lineage DAGs via non-recursive primitives. In this context, explain needed special treatment to preserve the previous output in DFS order w/ post-append. Furthermore, this also fixes a number of issues of the parfor integration such as (1) invalid cached hashes after sub-DAG replacement, (2) introduced cycles during parfor lineage merge, (3) steplm script improvements (disabled parfor dependency analysis was hiding the issue that introduced the cycles), and (4) some debugging functionality to reliably detect cycles in lineage DAGs. 
--- scripts/builtin/steplm.dml | 20 .../instructions/cp/DataGenCPInstruction.java | 2 +- .../apache/sysds/runtime/lineage/LineageItem.java | 5 ++ .../sysds/runtime/lineage/LineageItemUtils.java| 55 -- src/main/java/org/apache/sysds/utils/Explain.java | 49 +++ .../test/functions/lineage/LineageReuseAlg.java| 37 ++- .../functions/lineage/LineageTraceParforSteplm.dml | 4 +- 7 files changed, 134 insertions(+), 38 deletions(-) diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml index 01f35ba..800c2ca 100644 --- a/scripts/builtin/steplm.dml +++ b/scripts/builtin/steplm.dml @@ -98,7 +98,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, # First pass to examine single features AICs = matrix(0, 1, m_orig); - parfor (i in 1:m_orig, check = 0) { + parfor (i in 1:m_orig) { [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose); AICs[1, i] = AIC_1; beta_out_all[1:nrow(beta_out_i), i] = beta_out_i; @@ -129,25 +129,25 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, while (continue) { # Subsequent passes over the features beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1); - AICs = matrix(0, 1, m_orig); # full overwrite - parfor (i in 1:m_orig, check = 0) { + AICs_2 = matrix(0, 1, m_orig); # full overwrite + parfor (i in 1:m_orig) { if (as.scalar(columns_fixed[1, i]) == 0) { # Construct the feature matrix - X = cbind(X_global, X_orig[, i]); - [AIC_2, beta_out_i] = linear_regression(X, y, icpt, reg, tol, maxi, verbose); - AICs[1, i] = AIC_2; - beta_out_all_2[1:nrow(beta_out_i), i] = beta_out_i; + Xi = cbind(X_global, X_orig[, i]); + [AIC_2, beta_out_i2] = linear_regression(Xi, y, icpt, reg, tol, maxi, verbose); + AICs_2[1, i] = AIC_2; + beta_out_all_2[1:nrow(beta_out_i2), i] = beta_out_i2; } else { - AICs[1,i] = Inf; + AICs_2[1,i] = Inf; } } # Determine the best AIC AIC_best_orig = AIC_best; - AIC_best = min(min(AICs), AIC_best_orig); + AIC_best = min(min(AICs_2), AIC_best_orig); 
AIC_check = checkAIC(AIC_best, AIC_best_orig, thr); - column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), column_best); + column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs_2)), column_best); # have the best beta store in the matrix beta_best = beta_out_all_2[, column_best]; diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java index baacca6..8d688b8 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java @@ -402,7 +402,7 @@ public class DataGenCPInstruction extends UnaryCPInstruction { tmpInstStr, position, String.valueOf(runtimeSeed)) : tmpInstStr; } //replace output variable name with a placeholder - //tmpInstStr = InstructionUtils.replaceOperandName(tmpInstStr); + tmpInstStr = InstructionUtils.replaceOperandName(tmpInstStr); tmpInstStr = replaceNonLiteral(tmpInstStr, rows, 2, ec); tmpInstStr = replaceNonLiteral(tmpInstStr, cols, 3, ec); break; diff --git
[systemml] branch master updated: [SYSTEMDS-412] Robustness lineage DAG ops (non-recursive resetVisit)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 86f7b1f [SYSTEMDS-412] Robustness lineage DAG ops (non-recursive resetVisit) 86f7b1f is described below commit 86f7b1f47bc1f1f4291e97adc3c5d996b0dc67ba Author: Matthias Boehm AuthorDate: Sun Jun 7 22:29:56 2020 +0200 [SYSTEMDS-412] Robustness lineage DAG ops (non-recursive resetVisit) This patch is a first step toward making the lineage DAG more robust with regard to stack overflow errors, which occur for example in default JVM configuration when writing out lineage DAGs of a depth >10,000s of nodes. We use simple non-recursive stacks to perform these operations, but explain and similar operations require some additional queueing to preserve the previous output format (no need to break backwards compatibility to previous releases). --- .../apache/sysds/runtime/lineage/LineageCache.java | 2 +- .../apache/sysds/runtime/lineage/LineageItem.java | 43 +++--- .../sysds/runtime/lineage/LineageItemUtils.java| 14 +++ src/main/java/org/apache/sysds/utils/Explain.java | 12 +++--- 4 files changed, 52 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java index 9f53395..cb2d13b 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java @@ -255,7 +255,7 @@ public class LineageCache String boundVarName = outputs.get(i).getName(); LineageItem boundLI = ec.getLineage().get(boundVarName); if (boundLI != null) - boundLI.resetVisitStatus(); + boundLI.resetVisitStatusNR(); if (boundLI == null || !LineageCache.probe(li) || !LineageCache.probe(boundLI)) { AllOutputsCacheable = false; } diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java index e5345e8..38a4cb9 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java @@ -19,6 +19,8 @@ package org.apache.sysds.runtime.lineage; +import java.util.Stack; + import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence; import org.apache.sysds.runtime.util.UtilFunctions; @@ -128,9 +130,9 @@ public class LineageItem { if (!(o instanceof LineageItem)) return false; - resetVisitStatus(); + resetVisitStatusNR(); boolean ret = equalsLI((LineageItem) o); - resetVisitStatus(); + resetVisitStatusNR(); return ret; } @@ -180,16 +182,47 @@ public class LineageItem { return !_opcode.isEmpty(); } - public LineageItem resetVisitStatus() { + /** +* Non-recursive equivalent of {@link #resetVisitStatus()} +* for robustness with regard to stack overflow errors. +*/ + public void resetVisitStatusNR() { + Stack q = new Stack<>(); + q.push(this); + while( !q.empty() ) { + LineageItem tmp = q.pop(); + if( !tmp.isVisited() ) + continue; + if (tmp.getInputs() != null) + for (LineageItem li : tmp.getInputs()) + q.push(li); + tmp.setVisited(false); + } + } + + /** +* Non-recursive equivalent of {@link #resetVisitStatus(LineageItem[])} +* for robustness with regard to stack overflow errors. +* +* @param lis root lineage items +*/ + public static void resetVisitStatusNR(LineageItem[] lis) { + if (lis != null) + for (LineageItem liRoot : lis) + liRoot.resetVisitStatusNR(); + } + + @Deprecated + public void resetVisitStatus() { if (!isVisited()) - return this; + return; if (_inputs != null) for (LineageItem li : getInputs()) li.resetVisitStatus(); setVisited(false); - return this; } + @Deprecated public st
[systemml] branch master updated: [MINOR] Fix reading dml scripts from dist fs / object store
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 9d5999d [MINOR] Fix reading dml scripts from dist fs / object store 9d5999d is described below commit 9d5999dc91df37beaddbdd48fe3c7487188f52a7 Author: Matthias Boehm AuthorDate: Fri Jun 5 17:45:36 2020 +0200 [MINOR] Fix reading dml scripts from dist fs / object store --- src/main/java/org/apache/sysds/api/DMLScript.java | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java b/src/main/java/org/apache/sysds/api/DMLScript.java index a069182..6854ebb 100644 --- a/src/main/java/org/apache/sysds/api/DMLScript.java +++ b/src/main/java/org/apache/sysds/api/DMLScript.java @@ -112,8 +112,6 @@ public class DMLScript public static String _uuid = IDHandler.createDistributedUniqueID(); private static final Log LOG = LogFactory.getLog(DMLScript.class.getName()); - private static FileSystem fs = null; - /// // public external interface @@ -283,7 +281,7 @@ public class DMLScript || IOUtilFunctions.isObjectStoreFileScheme(new Path(fileName)) ) { Path scriptPath = new Path(fileName); - fs = IOUtilFunctions.getFileSystem(scriptPath); + FileSystem fs = IOUtilFunctions.getFileSystem(scriptPath); in = new BufferedReader(new InputStreamReader(fs.open(scriptPath))); } // from local file system @@ -303,8 +301,6 @@ public class DMLScript throw ex; } finally { - if(fs != null) - fs.close(); IOUtilFunctions.closeSilently(in); }
[systemml] branch master updated: [MINOR] Fix BuiltinFunctionExpression
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new d8c9495 [MINOR] Fix BuiltinFunctionExpression d8c9495 is described below commit d8c9495ae6d8e0507d113718b23dfd8fa5035e6d Author: Sebastian AuthorDate: Wed Jun 3 18:57:30 2020 +0200 [MINOR] Fix BuiltinFunctionExpression Closes #937. Closes #944. --- .../java/org/apache/sysds/parser/BuiltinFunctionExpression.java | 8 1 file changed, 8 insertions(+) diff --git a/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java b/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java index 96e2ebc..b358396 100644 --- a/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java +++ b/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java @@ -912,6 +912,14 @@ public class BuiltinFunctionExpression extends DataIdentifier case NROW: case NCOL: case LENGTH: + checkNumParameters(1); + checkDataTypeParam(getFirstExpr(), + DataType.FRAME, DataType.LIST, DataType.MATRIX); + output.setDataType(DataType.SCALAR); + output.setDimensions(0, 0); + output.setBlocksize(0); + output.setValueType(ValueType.INT64); + break; case COUNT_DISTINCT: case COUNT_DISTINCT_APPROX: checkNumParameters(1);
[systemml] branch master updated: [SYSTEMDS-397] Neural Collaborative Filtering (NCF) algorithm script
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 5e0d5c4 [SYSTEMDS-397] Neural Collaborative Filtering (NCF) algorithm script 5e0d5c4 is described below commit 5e0d5c45162c7f26e6003659d58f091a3a794f11 Author: Patrick Deutschmann AuthorDate: Tue Jun 2 23:29:57 2020 +0200 [SYSTEMDS-397] Neural Collaborative Filtering (NCF) algorithm script AMLS project SS2020. Closes #925. --- dev/docs/Tasks.txt | 1 + .../Example - Neural Collaborative Filtering.ipynb | 347 + scripts/nn/examples/README.md | 7 + scripts/nn/examples/ncf-dummy-data.dml | 57 scripts/nn/examples/ncf-real-data.dml | 65 scripts/staging/NCF.dml| 330 6 files changed, 807 insertions(+) diff --git a/dev/docs/Tasks.txt b/dev/docs/Tasks.txt index f3d4acd..8c6b306 100644 --- a/dev/docs/Tasks.txt +++ b/dev/docs/Tasks.txt @@ -311,6 +311,7 @@ SYSTEMDS-390 New Builtin Functions IV * 394 Builtin for one-hot encoding of matrix (not frame), see table OK * 395 SVM rework and utils (confusionMatrix, msvmPredict)OK * 396 Builtin for counting number of distinct values OK + * 397 Algorithm for neural collaborative filtering (NCF) OK SYSTEMDS-400 Spark Backend Improvements * 401 Fix output block indexes of rdiag (diagM2V)OK diff --git a/scripts/nn/examples/Example - Neural Collaborative Filtering.ipynb b/scripts/nn/examples/Example - Neural Collaborative Filtering.ipynb new file mode 100644 index 000..5c047fd --- /dev/null +++ b/scripts/nn/examples/Example - Neural Collaborative Filtering.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Neural Collaborative Filtering (NCF)\n", +"\n", +"This examples trains a neural network on the MovieLens data set using the concept of [Neural Collaborative Filtering (NCF)](https://dl.acm.org/doi/abs/10.1145/3038912.3052569) that is aimed at 
approaching recommendation problems using deep neural networks as opposed to common matrix factorization approaches." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"## Setup and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ +"import numpy as np\n", +"import pandas as pd\n", +"import matplotlib.pyplot as plt\n", +"from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"## Download Data - MovieLens" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"The MovieLens data set is provided by the Unniversity of Minnesota and the GroupLens Research Group:\n", +"\n", +"> This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org/), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.\n", +"Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.\n", +"The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv. More details about the contents and use of all these files follows.\n", +"This is a development dataset. As such, it may change over time and is not an appropriate dataset for shared research results. 
See available benchmark datasets if that is your intent.\n", +"This and other GroupLens data sets are publicly available for download at http://grouplens.org/datasets/.; + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: ml-latest-small.zip\n", + " creating: ml-latest-small/\n",
[systemml] branch master updated: [SYSTEMDS-396] Distinct values count/estimation functions
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 02e5c6d [SYSTEMDS-396] Distinct values count/estimation functions 02e5c6d is described below commit 02e5c6db0dd5d02416e45874253c59db04151605 Author: Sebastian AuthorDate: Tue Jun 2 22:07:34 2020 +0200 [SYSTEMDS-396] Distinct values count/estimation functions New function for counting the number of distinct values in a MatrixBlock. It is using the builtin AggregateInstructions to parse through hop lop. It can be called to execute with different types of estimators: - count : The default implementation that counts by adding to an hashmap. Not memory efficient, but returns exact counts. - KMV : An estimation algorithm K Minimum Values - HLL : An estimation algorithm Hyper Log Log (Not finished) Closes #909. --- .github/workflows/functionsTests.yml | 1 + dev/docs/Tasks.txt | 1 + .../java/org/apache/sysds/common/Builtins.java | 2 + src/main/java/org/apache/sysds/common/Types.java | 4 +- .../org/apache/sysds/lops/PartialAggregate.java| 21 +- .../sysds/parser/BuiltinFunctionExpression.java| 5 +- .../org/apache/sysds/parser/DMLTranslator.java | 2 + .../sysds/runtime/functionobjects/Builtin.java | 3 +- .../runtime/instructions/CPInstructionParser.java | 2 + .../cp/AggregateUnaryCPInstruction.java| 24 +- .../matrix/data/LibMatrixCountDistinct.java| 277 + .../matrix/operators/CountDistinctOperator.java| 64 + .../apache/sysds/runtime/util/DataConverter.java | 20 ++ src/main/java/org/apache/sysds/utils/Hash.java | 133 ++ src/test/java/org/apache/sysds/test/TestUtils.java | 38 ++- .../test/component/matrix/CountDistinctTest.java | 195 +++ .../apache/sysds/test/component/misc/UtilHash.java | 106 .../builtin/BuiltinFactorizationTest.java | 2 +- .../functions/countDistinct/CountDistinct.java | 49 .../countDistinct/CountDistinctApprox.java | 
56 + .../functions/countDistinct/CountDistinctBase.java | 109 .../functions/countDistinct/countDistinct.dml | 24 ++ .../countDistinct/countDistinctApprox.dml | 24 ++ 23 files changed, 1150 insertions(+), 12 deletions(-) diff --git a/.github/workflows/functionsTests.yml b/.github/workflows/functionsTests.yml index b983018..fb1a5bb 100644 --- a/.github/workflows/functionsTests.yml +++ b/.github/workflows/functionsTests.yml @@ -49,6 +49,7 @@ jobs: codegen, codegenalg.partone, codegenalg.parttwo, + countDistinct, data.misc, data.rand, data.tensor, diff --git a/dev/docs/Tasks.txt b/dev/docs/Tasks.txt index 9a51eb5..f3d4acd 100644 --- a/dev/docs/Tasks.txt +++ b/dev/docs/Tasks.txt @@ -310,6 +310,7 @@ SYSTEMDS-390 New Builtin Functions IV * 393 Builtin to find Connected Components of a graphOK * 394 Builtin for one-hot encoding of matrix (not frame), see table OK * 395 SVM rework and utils (confusionMatrix, msvmPredict)OK + * 396 Builtin for counting number of distinct values OK SYSTEMDS-400 Spark Backend Improvements * 401 Fix output block indexes of rdiag (diagM2V)OK diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 7345077..5ee7a79 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -178,6 +178,8 @@ public enum Builtins { TRACE("trace", false), TO_ONE_HOT("toOneHot", true), TYPEOF("typeOf", false), + COUNT_DISTINCT("countDistinct",false), + COUNT_DISTINCT_APPROX("countDistinctApprox",false), VAR("var", false), XOR("xor", false), WINSORIZE("winsorize", true, false), //TODO parameterize w/ prob, min/max val diff --git a/src/main/java/org/apache/sysds/common/Types.java b/src/main/java/org/apache/sysds/common/Types.java index 2d66e81..996132f 100644 --- a/src/main/java/org/apache/sysds/common/Types.java +++ b/src/main/java/org/apache/sysds/common/Types.java @@ -175,7 +175,9 @@ public class Types PROD, SUM_PROD, MIN, MAX, TRACE, 
MEAN, VAR, - MAXINDEX, MININDEX; + MAXINDEX, MININDEX, + COUNT_DISTINCT, + COUNT_DISTINCT_APPROX; @Override public String toString() { diff --git a/src/main/java/org/apache/sysds/lops/PartialAggregate.java b/src/mai
[systemml] branch master updated: [MINOR] Fix multi-threaded federated MV multiply, and test issues
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 0118a3e [MINOR] Fix multi-threaded federated MV multiply, and test issues 0118a3e is described below commit 0118a3eef317826cf79bf01471f07a67631cee64 Author: Matthias Boehm AuthorDate: Sun May 31 22:41:59 2020 +0200 [MINOR] Fix multi-threaded federated MV multiply, and test issues So far, the federated matrix-vector multiplications were always executed in a single-threaded manner, now we execute them according to the local parallelism configuration at the federated worker. Also, it seems I introduced a bug of privacy handling during the merge, which this patch also fixes (e.g., on scalar casts of non-cacheable data objects). --- .../federated/FederatedWorkerHandler.java | 8 +++- .../cp/AggregateBinaryCPInstruction.java | 23 ++ .../instructions/cp/VariableCPInstruction.java | 3 --- .../gpu/AggregateBinaryGPUInstruction.java | 4 +--- .../instructions/spark/CpmmSPInstruction.java | 12 +++ .../instructions/spark/MapmmSPInstruction.java | 21 ++-- .../instructions/spark/PMapmmSPInstruction.java| 17 .../instructions/spark/PmmSPInstruction.java | 6 +- .../instructions/spark/ZipmmSPInstruction.java | 3 +-- .../sysds/runtime/privacy/PrivacyMonitor.java | 2 +- .../compress/ParCompressedMatrixTest.java | 10 +++--- 11 files changed, 29 insertions(+), 80 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java b/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java index bba731c..6fe814a 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java @@ -38,15 +38,13 @@ import 
org.apache.sysds.runtime.controlprogram.caching.FrameObject; import org.apache.sysds.runtime.controlprogram.caching.MatrixObject; import org.apache.sysds.runtime.controlprogram.caching.TensorObject; import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence; -import org.apache.sysds.runtime.functionobjects.Multiply; -import org.apache.sysds.runtime.functionobjects.Plus; +import org.apache.sysds.runtime.instructions.InstructionUtils; import org.apache.sysds.runtime.instructions.cp.Data; import org.apache.sysds.runtime.instructions.cp.ListObject; import org.apache.sysds.runtime.io.IOUtilFunctions; import org.apache.sysds.runtime.matrix.data.LibMatrixAgg; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator; -import org.apache.sysds.runtime.matrix.operators.AggregateOperator; import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator; import org.apache.sysds.runtime.matrix.operators.ScalarOperator; import org.apache.sysds.runtime.meta.MatrixCharacteristics; @@ -187,8 +185,8 @@ public class FederatedWorkerHandler extends ChannelInboundHandlerAdapter { matTo = PrivacyMonitor.handlePrivacy(matTo); MatrixBlock matBlock1 = matTo.acquireReadAndRelease(); // TODO other datatypes - AggregateBinaryOperator ab_op = new AggregateBinaryOperator( - Multiply.getMultiplyFnObject(), new AggregateOperator(0, Plus.getPlusFnObject())); + AggregateBinaryOperator ab_op = InstructionUtils + .getMatMultOperator(OptimizerUtils.getConstrainedNumThreads(0)); MatrixBlock result = isMatVecMult ? 
matBlock1.aggregateBinaryOperations(matBlock1, vector, new MatrixBlock(), ab_op) : vector.aggregateBinaryOperations(vector, matBlock1, new MatrixBlock(), ab_op); diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java index 1e3186d..0df8108 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java @@ -19,17 +19,12 @@ package org.apache.sysds.runtime.instructions.cp; -import org.apache.sysds.common.Types.DataType; -import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.controlprogram.context.ExecutionContext; -import org.apache.sysds.runtime.functionobjects.Multiply; -import
[systemml] branch master updated: [SYSTEMDS-362] Federated runtime propagation of privacy constraints
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 98cb93d [SYSTEMDS-362] Federated runtime propagation of privacy constraints 98cb93d is described below commit 98cb93d0dbbc2f1fabc3796bbc21aca1874eed5f Author: sebwrede AuthorDate: Sun May 31 19:41:45 2020 +0200 [SYSTEMDS-362] Federated runtime propagation of privacy constraints * Runtime propagation of privacy constraints * Privacy level as Enum with three levels: Private, PrivateAggregate, and None * Privacy handling in FederatedWorkerHandler preventing private data from being included in federated response * Test of privacy handling of different federated request types * Test of different privacy levels and combinations for Federated L2SVM Closes #919. --- src/main/java/org/apache/sysds/hops/Hop.java | 2 +- .../org/apache/sysds/parser/BinaryExpression.java | 2 +- .../org/apache/sysds/parser/DataExpression.java| 11 +- .../java/org/apache/sysds/parser/Identifier.java | 5 +- .../federated/FederatedWorkerHandler.java | 7 + .../sysds/runtime/instructions/Instruction.java| 4 + .../instructions/cp/BuiltinNaryCPInstruction.java | 8 + .../runtime/instructions/cp/CPInstruction.java | 3 + .../instructions/cp/QuaternaryCPInstruction.java | 3 + .../instructions/cp/VariableCPInstruction.java | 518 - .../runtime/instructions/fed/FEDInstruction.java | 5 +- .../instructions/spark/ReblockSPInstruction.java | 2 +- ...acyConstraint.java => DMLPrivacyException.java} | 38 +- .../sysds/runtime/privacy/PrivacyConstraint.java | 30 +- .../sysds/runtime/privacy/PrivacyMonitor.java | 96 .../sysds/runtime/privacy/PrivacyPropagator.java | 315 - .../org/apache/sysds/runtime/util/HDFSTool.java| 7 +- .../test/functions/privacy/FederatedL2SVMTest.java | 384 +++ .../privacy/FederatedWorkerHandlerTest.java| 339 ++ 
.../MatrixMultiplicationPropagationTest.java | 53 ++- .../privacy/MatrixRuntimePropagationTest.java | 123 + .../privacy/MatrixRuntimePropagationTest.dml | 28 ++ 22 files changed, 1695 insertions(+), 288 deletions(-) diff --git a/src/main/java/org/apache/sysds/hops/Hop.java b/src/main/java/org/apache/sysds/hops/Hop.java index f0ef363..24aade1 100644 --- a/src/main/java/org/apache/sysds/hops/Hop.java +++ b/src/main/java/org/apache/sysds/hops/Hop.java @@ -73,7 +73,7 @@ public abstract class Hop implements ParseInfo protected ValueType _valueType; protected boolean _visited = false; protected DataCharacteristics _dc = new MatrixCharacteristics(); - protected PrivacyConstraint _privacyConstraint = new PrivacyConstraint(); + protected PrivacyConstraint _privacyConstraint = null; protected UpdateType _updateType = UpdateType.COPY; protected ArrayList _parent = new ArrayList<>(); diff --git a/src/main/java/org/apache/sysds/parser/BinaryExpression.java b/src/main/java/org/apache/sysds/parser/BinaryExpression.java index 6c177e2..acccb66 100644 --- a/src/main/java/org/apache/sysds/parser/BinaryExpression.java +++ b/src/main/java/org/apache/sysds/parser/BinaryExpression.java @@ -146,7 +146,7 @@ public class BinaryExpression extends Expression } // Set privacy of output - output.setPrivacy(PrivacyPropagator.MergeBinary( + output.setPrivacy(PrivacyPropagator.mergeBinary( getLeft().getOutput().getPrivacy(), getRight().getOutput().getPrivacy())); this.setOutput(output); diff --git a/src/main/java/org/apache/sysds/parser/DataExpression.java b/src/main/java/org/apache/sysds/parser/DataExpression.java index c94532d..779f788 100644 --- a/src/main/java/org/apache/sysds/parser/DataExpression.java +++ b/src/main/java/org/apache/sysds/parser/DataExpression.java @@ -37,6 +37,7 @@ import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysds.runtime.io.FileFormatPropertiesMM; import 
org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.privacy.PrivacyConstraint.PrivacyLevel; import org.apache.sysds.runtime.util.HDFSTool; import org.apache.sysds.runtime.util.UtilFunctions; import org.apache.sysds.utils.JSONHelper; @@ -1097,10 +1098,8 @@ public class DataExpression extends DataIdentifier // set privacy Expression eprivacy = getVarParam("privacy"); - boolean privacy = false; - if
[systemml] branch master updated: [SYSTEMDS-274] Fix compressed colMins/colMaxs w/ shared dictionary
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new ea2d971 [SYSTEMDS-274] Fix compressed colMins/colMaxs w/ shared dictionary ea2d971 is described below commit ea2d971ec4ad3a0cf93fe78224a6f14176a6235b Author: Matthias Boehm AuthorDate: Sun May 31 18:26:55 2020 +0200 [SYSTEMDS-274] Fix compressed colMins/colMaxs w/ shared dictionary This patch fixes remaining issues of incorrect results for colMins and colMaxs over compressed matrix blocks with shared DDC1 dictionaries. Specifically, if the individual column groups have only partial overlap, the shared dictionary contains a superset of column group distinct values. Since aggregation functions like min and max are executed only over the dictionary (without touching the compressed data), it led to incorrect results as we find extreme values that do not actually exist in the column group. Three alternative approaches could solve this: (1) drop shared dictionaries, (2) execute colMins and colMax over the compressed data, or (3) refactor the double array dictionary into a proper class hierarchy and maintain additional meta data for shared dictionaries. We decided for (3) in order to keep predictable performance, irrespective of shared dictionaries and because this class hierarchy allows for further improvements of shared dictionaries between any subsets of column groups. Additionally, this fix also cleans up incorrect estimates of the individual column groups (because getValueSize was used in the estimates as a number of values, although it gave the size in bytes) as well as some of the Class-layout size estimation tests. Closes #927.
--- dev/docs/Tasks.txt | 2 +- .../runtime/compress/CompressedMatrixBlock.java| 5 +- .../compress/CompressedMatrixBlockFactory.java | 59 ++ .../sysds/runtime/compress/colgroup/ColGroup.java | 7 -- .../runtime/compress/colgroup/ColGroupDDC.java | 7 +- .../runtime/compress/colgroup/ColGroupDDC1.java| 34 +++--- .../runtime/compress/colgroup/ColGroupDDC2.java| 39 --- .../runtime/compress/colgroup/ColGroupOLE.java | 33 -- .../runtime/compress/colgroup/ColGroupOffset.java | 43 --- .../runtime/compress/colgroup/ColGroupRLE.java | 30 +++-- .../runtime/compress/colgroup/ColGroupSizes.java | 3 +- .../runtime/compress/colgroup/ColGroupValue.java | 126 - .../runtime/compress/colgroup/Dictionary.java | 96 .../compress/colgroup/DictionaryShared.java| 79 + .../component/compress/CompressedMatrixTest.java | 10 +- .../component/compress/CompressedTestBase.java | 7 +- .../compress/colgroup/JolEstimateTest.java | 8 +- .../compress/colgroup/JolEstimateTestEmpty.java| 6 +- 18 files changed, 398 insertions(+), 196 deletions(-) diff --git a/dev/docs/Tasks.txt b/dev/docs/Tasks.txt index cd90a66..6b5dbb0 100644 --- a/dev/docs/Tasks.txt +++ b/dev/docs/Tasks.txt @@ -223,7 +223,7 @@ SYSTEMDS-270 Compressed Matrix Blocks * 272 Simplify and speedup compression tests OK * 273 Refactor compressed Matrix Block to simplify responsibilities OK * 273a Redesign allocation of ColGroups in ColGroupFactory - * 274 Make the DDC Compression dictionary share correctly + * 274 Make the DDC Compression dictionary share correctlyOK * 275 Include compressionSettings in DMLConfiguration * 276 Allow Uncompressed Columns to be in sparse formats * 277 Sampling based estimators fix diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java index 3ad65c5..1085afc 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java +++ 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java @@ -243,9 +243,10 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock { if(_sharedDDC1Dict) { boolean seenDDC1 = false; for(ColGroup grp : _colGroups) - if(grp.getNumCols() == 1 && grp.getCompType() == CompressionType.DDC) { + if(grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) { + ColGroupDDC1 grpDDC = (ColGroupDDC1) grp; if(seenDDC1) - total -= grp.getValuesSize(); + total -= grpDDC.getDictionarySize();
[systemml] branch master updated: [MINOR] Fix invalid consistency checks of spark append_aligned rbind
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 405462e [MINOR] Fix invalid consistency checks of spark append_aligned rbind 405462e is described below commit 405462e84ad1192e447bb09c03fe20d112bf6afb Author: Matthias Boehm AuthorDate: Sat May 30 00:39:07 2020 +0200 [MINOR] Fix invalid consistency checks of spark append_aligned rbind --- .../apache/sysds/runtime/instructions/spark/BinarySPInstruction.java| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java index dc4e09b..3c3e5b6 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java @@ -459,7 +459,7 @@ public abstract class BinarySPInstruction extends ComputationSPInstruction { } if( checkAligned ) { - if( mc1.getCols() % mc1.getBlocksize() != 0 ) + if( (cbind ? mc1.getCols() : mc1.getRows()) % mc1.getBlocksize() != 0 ) throw new DMLRuntimeException("Input matrices are not aligned to blocksize boundaries. Wrong append selected"); } }
[systemml] branch master updated: [SYSTEMDS-393] Performance distributed connected components
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new be7191c [SYSTEMDS-393] Performance distributed connected components be7191c is described below commit be7191c2502ad7f5445891ceb671f335e88e51c9 Author: Matthias Boehm AuthorDate: Sat May 30 00:15:11 2020 +0200 [SYSTEMDS-393] Performance distributed connected components This patch makes a few tweaks to significantly improve the performance of the new connected components builtin function where the graph G does not fit in the driver memory and thus, spawns distributed spark operations. The test case was a 1M x 1M graph with 1G edges, ran with driver memory of 10GB and 9 executors 80GB each. The baseline runtime of 10 calls to connected components (each requiring 4 iterations until convergence) was pretty bad with 1,512s due to excessive shuffle and GC overhead. 1) Modified Script: Removed the unnecessary removal of self-edges as the chosen update rule is robust enough to handle both cases. This removed the excessive shuffling overhead for matrix-matrix binary operations without existing hash partitioning. This change alone reduced the total runtime of 10 calls to 760s. 2) Handling of approximately known sparsity: The large GC overhead was due to not converting the MCSR representation into read-optimized CSR during checkpointing (spark caching). We now compute these conditions with the upper bound information that is available in cases where the exact nnz is unknown. This further reduced the total runtime to 131s. With codegen the runtime is further slightly improved to 120s (including spark context creation, and matrix creation) as we avoid materializing G * t(c) in memory by fusing it with rowMaxs(G * t(c)). For 40 update rule computations (and thus scans of the graph), this is fairly reasonable.
--- scripts/builtin/components.dml | 9 - src/main/java/org/apache/sysds/hops/OptimizerUtils.java | 8 ++-- .../sysds/runtime/instructions/spark/RandSPInstruction.java | 3 +++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml index 51d96db..5f37c07 100644 --- a/scripts/builtin/components.dml +++ b/scripts/builtin/components.dml @@ -27,11 +27,10 @@ m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose = TRUE) return (Matrix[Double] C) { - # ensure there are no self-edges in the graph - if( trace(G) != 0 ) { -G = G - diag(diag(G)); -if(verbose) - print("Connected Components: warning - removed self-edges from input graph"); + # best effort check for symmetry (not exact but fast) + if( sum(rowSums(G) != t(colSums(G))) > 0 ) { +stop("Connected Components: input graph needs to be " + + "symmetric but rowSums and colSums don't match up."); } # initialize state with vertex ids diff --git a/src/main/java/org/apache/sysds/hops/OptimizerUtils.java b/src/main/java/org/apache/sysds/hops/OptimizerUtils.java index ef2b5ff..213041f 100644 --- a/src/main/java/org/apache/sysds/hops/OptimizerUtils.java +++ b/src/main/java/org/apache/sysds/hops/OptimizerUtils.java @@ -477,8 +477,12 @@ public class OptimizerUtils } public static boolean checkSparseBlockCSRConversion( DataCharacteristics dcIn ) { - return Checkpoint.CHECKPOINT_SPARSE_CSR - && OptimizerUtils.getSparsity(dcIn) < MatrixBlock.SPARSITY_TURN_POINT; + //we use the non-zero bound to make the important csr decision in + //an best effort manner (the precise non-zeros is irrelevant here) + double sp = OptimizerUtils.getSparsity( + dcIn.getRows(), dcIn.getCols(), dcIn.getNonZerosBound()); + return Checkpoint.CHECKPOINT_SPARSE_CSR + && sp < MatrixBlock.SPARSITY_TURN_POINT; } /** diff --git a/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java 
b/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java index ef40773..17315f0 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java @@ -403,8 +403,11 @@ public class RandSPInstruction extends UnarySPInstruction { if(!mcOut.dimsKnown(true)) { //note: we cannot compute the nnz from sparsity because this would not reflect the //actual number of non-zeros, except for extreme values of sp
[systemml] branch master updated: [SYSTEMDS-335] Updated weighted eviction scheme for lineage cache
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 822b492 [SYSTEMDS-335] Updated weighted eviction scheme for lineage cache 822b492 is described below commit 822b4922b938ece3a23204823f818545d471bae4 Author: arnabp AuthorDate: Tue May 26 21:01:28 2020 +0200 [SYSTEMDS-335] Updated weighted eviction scheme for lineage cache This patch updates the weighted scheme by adding an elaborate scoring function. The function has two components, a ratio of compute time, in-memory size, and a last used timestamp. The components are associated with weights, which can tune the eviction policies (e.g. weights 0 and 1 for time/size and timestamp respectively translate to LRU scheme). This patch also replaces the earlier PriorityQueue by a TreeSet. New eviction test, refactor LineageCacheConfig, eviction logic tuning. This commit contains, 1) Few updates in eviction logic. Thanks Matthias for catching an unneeded enqueue/dequeue. 2) Refactoring of LineageCacheConfig class. 3) A new test to compare the order of evicted items based on the specified policies. Closes #915.
--- docs/Tasks.txt | 2 +- .../sysds/runtime/lineage/LineageCacheConfig.java | 154 + .../sysds/runtime/lineage/LineageCacheEntry.java | 11 +- .../runtime/lineage/LineageCacheEviction.java | 103 +- .../runtime/lineage/LineageCacheStatistics.java| 5 +- .../test/functions/dnn/Conv2DBackwardDataTest.java | 3 +- .../test/functions/dnn/Conv2DBackwardTest.java | 2 +- .../sysds/test/functions/dnn/Conv2DTest.java | 2 +- .../sysds/test/functions/dnn/PoolBackwardTest.java | 2 +- .../apache/sysds/test/functions/dnn/PoolTest.java | 2 +- .../sysds/test/functions/dnn/ReluBackwardTest.java | 44 ++ .../test/functions/lineage/CacheEvictionTest.java | 141 +++ .../scripts/functions/lineage/CacheEviction1.dml | 55 .../scripts/functions/lineage/LineageReuseAlg3.dml | 2 +- 14 files changed, 357 insertions(+), 171 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 081a44b..91c966d 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -270,7 +270,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration * 332 Parfor integration with multi-level reuse OK * 333 Improve cache eviction with actual compute timeOK * 334 Cache scalars only with atleast one matrix inputs - * 335 Weighted eviction policy (function of size & computetime) OK + * 335 Weighted eviction policy (function(size,computetime,LRU time)) OK * 336 Better use of cache status to handle multithreading * 337 Adjust disk I/O speed by recording actual time taken OK * 338 Extended lineage tracing (rmEmpty, lists), partial rewritesOK diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java index 888d27d..2a3c426 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java @@ -26,12 +26,14 @@ import org.apache.sysds.runtime.instructions.Instruction; import org.apache.sysds.runtime.instructions.cp.ComputationCPInstruction; import 
org.apache.sysds.runtime.instructions.cp.ListIndexingCPInstruction; -import java.util.ArrayList; +import java.util.Comparator; + +public class LineageCacheConfig +{ + //-CACHING LOGIC RELATED CONFIGURATIONS--// -public class LineageCacheConfig { - private static final String[] REUSE_OPCODES = new String[] { - "tsmm", "ba+*", "*", "/", "+", "nrow", "ncol", + "tsmm", "ba+*", "*", "/", "+", "nrow", "ncol", "round", "exp", "log", "rightIndex", "leftIndex", "groupedagg", "r'", "solve", "spoof" }; @@ -55,63 +57,81 @@ public class LineageCacheConfig { || DMLScript.LINEAGE_REUSE == NONE; } } + + private static ReuseCacheType _cacheType = null; + private static CachedItemHead _itemH = null; + private static CachedItemTail _itemT = null; + private static boolean _compilerAssistedRW = false; + + //-DISK SPILLING RELATED CONFIGURATIONS--// + + private static boolean _allowSpill = false; + // Minimum reliable spilling estimate
[systemml] branch master updated: [SYSTEMDS-209] Performance sparse matrix-colvector cell-wise multiply
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 75648fe [SYSTEMDS-209] Performance sparse matrix-colvector cell-wise multiply 75648fe is described below commit 75648fe8a3817a4971480b09cce3ae0d694c7d06 Author: Matthias Boehm AuthorDate: Tue May 26 20:15:24 2020 +0200 [SYSTEMDS-209] Performance sparse matrix-colvector cell-wise multiply While working on the new builtin function for connected components and ultra-sparse graphs, we found that 'rowMaxs(G * t(c))' performed orders of magnitude better than the semantically equivalent 't(colMaxs(G * c))'. The reason was a missing handling of strict sparse-safe operations for matrix-colvector operations, while this was already handled for matrix-rowvector operations. In detail, we performed unnecessary operations in the number of cells instead of in the number of non-zeros leading to worse asymptotic behavior. With the simple fix of this patch, now we have very similar performance. For example, on a scenario of performing 100 times G*c where X is a 10Kx10K, sparsity=0.0001 matrix, total execution time (for 100 operations) improved from 4.2s to 167ms. 
--- docs/Tasks.txt | 1 + .../apache/sysds/runtime/matrix/data/LibMatrixBincell.java | 12 +--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 3c9782f..081a44b 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -171,6 +171,7 @@ SYSTEMDS-200 Various Fixes * 206 Fix codegen outer template compilation (tsmm) OK * 207 Fix builtin function call hoisting from expressionsOK * 208 Fix bufferpool leak (live var analysis and createvar) OK + * 209 Fix performance sparse M-CV elementwise multiply OK SYSTEMDS-210 Extended lists Operations * 211 Cbind and Rbind over lists of matrices OK diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java index 44d6f6a..34464a6 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java @@ -393,10 +393,9 @@ public class LibMatrixBincell int alen = a.size(i); int[] aix = a.indexes(i); double[] avals = a.values(i); - for( int j=apos; j
[systemml] branch master updated: [SYSTEMDS-393] Fix convergence condition of connected components builtin
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new b66a3c0 [SYSTEMDS-393] Fix convergence condition of connected components builtin b66a3c0 is described below commit b66a3c006ce6a3a888653e2d1accec479cc756fd Author: Matthias Boehm AuthorDate: Mon May 25 21:11:52 2020 +0200 [SYSTEMDS-393] Fix convergence condition of connected components builtin --- scripts/builtin/components.dml | 2 +- .../test/functions/builtin/BuiltinComponentsTest.java| 16 +++- .../scripts/functions/builtin/ConnectedComponents.dml| 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml index f760a49..51d96db 100644 --- a/scripts/builtin/components.dml +++ b/scripts/builtin/components.dml @@ -40,7 +40,7 @@ m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose = TR iter = 1; # iterative computation of connected components - while( diff > 0 & (maxi==0 | maxi<=iter) ) { + while( diff > 0 & (maxi==0 | iter<=maxi) ) { u = max(rowMaxs(G * t(c)), c); diff = sum(u != c) c = u; # update assignment diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java index e541f9d..8c1b05b 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java @@ -45,20 +45,25 @@ public class BuiltinComponentsTest extends AutomatedTestBase { @Test public void testConnectedComponents11CP() { - runConnectedComponentsTest(11, LopProperties.ExecType.CP); + runConnectedComponentsTest(11, 0, LopProperties.ExecType.CP); } @Test public void testConnectedComponents201CP() { - runConnectedComponentsTest(201, 
LopProperties.ExecType.CP); + runConnectedComponentsTest(201, 0, LopProperties.ExecType.CP); } @Test public void testConnectedComponents2001CP() { - runConnectedComponentsTest(2001, LopProperties.ExecType.CP); + runConnectedComponentsTest(2001, 0, LopProperties.ExecType.CP); + } + + @Test + public void testConnectedComponents11Maxi100CP() { + runConnectedComponentsTest(11, 100, LopProperties.ExecType.CP); } - private void runConnectedComponentsTest(int numVertices, ExecType instType) + private void runConnectedComponentsTest(int numVertices, int maxi, ExecType instType) { Types.ExecMode platformOld = setExecMode(instType); @@ -68,7 +73,8 @@ public class BuiltinComponentsTest extends AutomatedTestBase { String HOME = SCRIPT_DIR + TEST_DIR; fullDMLScriptName = HOME + TEST_NAME + ".dml"; - programArgs = new String[]{ "-args", input("X"), output("R")}; + programArgs = new String[]{ "-args", + input("X"), String.valueOf(maxi), output("R")}; //generate actual dataset (3 components) double[][] X = new double[numVertices-3][2]; diff --git a/src/test/scripts/functions/builtin/ConnectedComponents.dml b/src/test/scripts/functions/builtin/ConnectedComponents.dml index 0c6fbe7..56403a8 100644 --- a/src/test/scripts/functions/builtin/ConnectedComponents.dml +++ b/src/test/scripts/functions/builtin/ConnectedComponents.dml @@ -23,6 +23,6 @@ X = read($1) n = max(X); G = table(X[,1], X[, 2], n, n) G = G + t(G); #symmetry -C = components(G=G, verbose=FALSE) +C = components(G=G, maxi=$2, verbose=FALSE) -write(C, $2) +write(C, $3)
[systemml] branch master updated: [MINOR] Integration of steplm builtin (avoid excessive test output)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new c61e7ac [MINOR] Integration of steplm builtin (avoid excessive test output) c61e7ac is described below commit c61e7ac71a83df3d525b6131ca76cf8252c6802f Author: Matthias Boehm AuthorDate: Sun May 24 17:19:39 2020 +0200 [MINOR] Integration of steplm builtin (avoid excessive test output) --- scripts/algorithms/StepLinearRegDS.dml | 324 + scripts/builtin/steplm.dml | 19 +- 2 files changed, 16 insertions(+), 327 deletions(-) diff --git a/scripts/algorithms/StepLinearRegDS.dml b/scripts/algorithms/StepLinearRegDS.dml index 20b1777..a8740f5 100644 --- a/scripts/algorithms/StepLinearRegDS.dml +++ b/scripts/algorithms/StepLinearRegDS.dml @@ -79,331 +79,15 @@ fileX = $X; fileY = $Y; fileB = $B; fileS = $S; - write_beta = ifdef($write_beta, TRUE); - -# currently only the forward selection strategy in supported: start from one feature and iteratively add -# features until AIC improves -dir = "forward"; - fmt = ifdef ($fmt, "text"); -intercept_status = ifdef ($icpt, 1); +intercept = ifdef ($icpt, 1); thr = ifdef ($thr, 0.001); -print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT"); -print ("Reading X and Y..."); X_orig = read (fileX); y = read (fileY); -n = nrow (X_orig); -m_orig = ncol (X_orig); - -# BEGIN STEPWISE LINEAR REGRESSION - -if (dir == "forward") { - continue = TRUE; - columns_fixed = matrix (0, rows = 1, cols = m_orig); - columns_fixed_ordered = matrix (0, rows = 1, cols = 1); - - # X_global stores the best model found at each step - X_global = matrix (0, rows = n, cols = 1); - - if (intercept_status == 1 | intercept_status == 2) { -beta = mean (y); -AIC_best = 2 + n * log(sum((beta - y)^2) / n); - } else { -beta = 0.0; -AIC_best = n * log(sum(y^2) / n); - } - - AICs = matrix (AIC_best, rows = 1, cols = m_orig); - print ("Best 
AIC without any features: " + AIC_best); - - boa_ncol = ncol(X_orig) - if (intercept_status != 0) { -boa_ncol = boa_ncol + 1 - } - - beta_out_all = matrix(0, rows = boa_ncol, cols = m_orig * 1); - - y_ncol = 1; - - # First pass to examine single features - parfor (i in 1:m_orig, check = 0) { -columns_fixed_ordered_1 = matrix(i, rows=1, cols=1); - -[AIC_1, beta_out_i] = linear_regression (X_orig[, i], y, m_orig, columns_fixed_ordered_1, - write_beta, 0); - -AICs[1, i] = AIC_1; - -beta_out_all[1:nrow(beta_out_i), (i - 1) * y_ncol + 1 : i * y_ncol] = beta_out_i[, 1:1]; - - } - - # Determine the best AIC - column_best = 0; - for (k in 1:m_orig) { -AIC_cur = as.scalar (AICs[1, k]); -if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) { - column_best = k; - AIC_best = as.scalar(AICs[1, k]); -} - } - - # beta best so far - beta_best = beta_out_all[, (column_best-1) * y_ncol + 1: column_best * y_ncol]; - - if (column_best == 0) { -print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + - (thr * 100) + "% decrease in AIC!"); -Selected = matrix (0, rows = 1, cols = 1); -if (intercept_status == 0) { - B = matrix (beta, rows = m_orig, cols = 1); -} else { - B_tmp = matrix (0, rows = m_orig + 1, cols = 1); - B_tmp[m_orig + 1, ] = beta; - B = B_tmp; -} - -beta_out = B; - -write(Selected, fileS, format=fmt); -write(beta_out, fileB, format=fmt); - -stop (""); - } - print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); - columns_fixed[1, column_best] = 1; - columns_fixed_ordered[1, 1] = column_best; - X_global = X_orig[, column_best]; - -while (continue) { -# Subsequent passes over the features -beta_out_all_2 = matrix(0, rows = boa_ncol, cols = m_orig * 1); - -parfor (i in 1:m_orig, check = 0) { - if (as.scalar(columns_fixed[1, i]) == 0) { - -# Construct the feature matrix -X = cbind (X_global, X_orig[, i]); - -tmp = matrix(0, rows=1, cols=1); -tmp[1, 1] = i; -columns_fixed_ordered_2 = 
append(columns_fixed_ordered, tmp ) -[AIC_2, beta_out_i] = linear_regression (X, y, m_orig, columns_fixed_ordered_2, write_beta, 0); -beta_out_all_2[1:nrow(beta_out_i), (i - 1) * y_ncol + 1 : i * y_ncol] = beta_out_i[,1:1]; - -AICs[1, i] = AIC_2; - } -} - -# Determine the best AIC -for (k in 1:m_orig) { - AIC_cur = as.scalar (AICs[1, k]); - if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & -(as.scalar(columns_fixed[1, k]
[systemml] branch master updated: [MINOR] Fix l2svm algorithm and cleanup codegen/builtin tests
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 407f736 [MINOR] Fix l2svm algorithm and cleanup codegen/builtin tests 407f736 is described below commit 407f736720edc865d9547792c46f982675e292db Author: Matthias Boehm AuthorDate: Sun May 24 16:52:06 2020 +0200 [MINOR] Fix l2svm algorithm and cleanup codegen/builtin tests This patch makes some minor fixes to the algorithm integration of the builtin l2svm function (which failed the codegen tests), and cleans up the codegen and builtin tests via result caching, removed explain output, and slightly smaller sizes to avoid spurious test failures (for which reason failures are often ignored without double checking). --- scripts/algorithms/l2-svm.dml | 8 -- .../apache/sysds/test/applications/ArimaTest.java | 2 +- .../test/applications/MDABivariateStatsTest.java | 17 ++--- .../test/functions/builtin/BuiltinCVLmTest.java| 2 -- .../functions/builtin/BuiltinComponentsTest.java | 2 +- .../builtin/BuiltinFactorizationTest.java | 6 ++--- .../test/functions/builtin/BuiltinGLMTest.java | 18 +++--- .../functions/builtin/BuiltinGridSearchTest.java | 2 +- .../builtin/BuiltinImageBrightnessTest.java| 2 +- .../functions/builtin/BuiltinImageCropTest.java| 2 +- .../functions/builtin/BuiltinImageMirrorTest.java | 2 +- .../test/functions/builtin/BuiltinKmeansTest.java | 2 +- .../test/functions/builtin/BuiltinL2SVMTest.java | 4 +-- .../functions/builtin/BuiltinLmPredictTest.java| 2 +- .../test/functions/builtin/BuiltinLmTest.java | 2 +- .../builtin/BuiltinMultiLogRegPredictTest.java | 2 +- .../builtin/BuiltinMulticlassSVMTest.java | 2 +- .../functions/builtin/BuiltinNaiveBayesTest.java | 1 - .../functions/builtin/BuiltinNormalizeTest.java| 2 +- .../test/functions/builtin/BuiltinOutlierTest.java | 2 +- 
.../test/functions/builtin/BuiltinSTEPLmTest.java | 2 +- .../test/functions/builtin/BuiltinScaleTest.java | 29 +++--- .../test/functions/builtin/BuiltinSigmoidTest.java | 2 +- .../functions/builtin/BuiltinSliceFinderTest.java | 2 +- .../functions/builtin/BuiltinToOneHotTest.java | 2 +- .../functions/builtin/BuiltinWinsorizeTest.java| 2 +- .../functions/builtin/MultipleBuiltinsTest.java| 2 +- .../codegenalg/partone/AlgorithmKMeans.java| 2 +- .../codegenalg/partone/AlgorithmL2SVM.java | 14 +-- .../codegenalg/parttwo/AlgorithmDatagen.java | 2 +- .../functions/codegenalg/parttwo/AlgorithmGLM.java | 2 +- .../parttwo/AlgorithmStepwiseRegression.java | 2 +- .../scripts/functions/codegenalg/Algorithm_L2SVM.R | 8 +++--- 33 files changed, 87 insertions(+), 66 deletions(-) diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml index 04d6524..1c2fb9d 100644 --- a/scripts/algorithms/l2-svm.dml +++ b/scripts/algorithms/l2-svm.dml @@ -57,15 +57,19 @@ verbose = ifdef($verbose, FALSE) X = read($X) Y = read($Y) +positive_label = max(Y) +negative_label = min(Y) +dimensions = ncol(X) + w = l2svm(X=X, Y=Y, intercept=intercept, - epsilon=epsilon, lambda=labmda, + epsilon=epsilon, lambda=lambda, maxIterations=maxIterations, verbose=verbose) extra_model_params = matrix(0, 4, 1) extra_model_params[1,1] = positive_label extra_model_params[2,1] = negative_label -extra_model_params[3,1] = intercept +extra_model_params[3,1] = as.double(intercept) extra_model_params[4,1] = dimensions w = rbind(w, extra_model_params) diff --git a/src/test/java/org/apache/sysds/test/applications/ArimaTest.java b/src/test/java/org/apache/sysds/test/applications/ArimaTest.java index c9ab019..020ffb6 100644 --- a/src/test/java/org/apache/sysds/test/applications/ArimaTest.java +++ b/src/test/java/org/apache/sysds/test/applications/ArimaTest.java @@ -130,7 +130,7 @@ public class ArimaTest extends AutomatedTestBase { rCmd = getRCmd(inputDir(), Integer.toString(max_func_invoc), Integer.toString(p), 
Integer.toString(d), Integer.toString(q), Integer.toString(P), Integer.toString(D), Integer.toString(Q), Integer.toString(s), Integer.toString(include_mean), Integer.toString(useJacobi), expectedDir()); - int timeSeriesLength = 5000; + int timeSeriesLength = 3000; double[][] timeSeries = getRandomMatrix(timeSeriesLength, 1, 1, 5, 0.9, System.currentTimeMillis()); MatrixCharacteristics mc = new MatrixCharacteristics(timeSeriesLength,1,-1,-1); diff --git a/src/test/java/org/apache/sysds/test/applications/MDABivariateStatsTest.java b/src/test/java/org/apache/sysds/test/applications
[systemml] branch master updated: [SYSTEMDS-395] Cleanup SVM scripts, new confusionMatrix, msvmPredict
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 8e4f7d8 [SYSTEMDS-395] Cleanup SVM scripts, new confusionMatrix, msvmPredict 8e4f7d8 is described below commit 8e4f7d82e0df9ce5c0634d7516de10fb262603ed Author: Sebastian AuthorDate: Sun May 24 00:02:36 2020 +0200 [SYSTEMDS-395] Cleanup SVM scripts, new confusionMatrix, msvmPredict - ConfusionMatrix - msvmPredict Makes confusion matrices based on predictions and labels. It returns two matrices: - A count matrix, containing counts of each case in the matrix. - An avg matrix, returning the accuracy of each class, and thereby the percentage distribution across labels, aka the percentage confusion. msvmPredict applies the trained msvm model and returns - Y_hat, the output from the model, that is the raw output from the model - Y, the row max of the raw output, which are the highest-value predictions. Furthermore, some consistency changes in L2SVM and MSVM. Closes #910.
--- docs/Tasks.txt | 1 + scripts/algorithms/l2-svm.dml | 114 ++-- scripts/builtin/confusionMatrix.dml| 62 +++ scripts/builtin/kmeans.dml | 22 +-- scripts/builtin/l2svm.dml | 97 +- scripts/builtin/msvm.dml | 42 ++--- scripts/builtin/msvmPredict.dml| 53 ++ scripts/builtin/multiLogRegPredict.dml | 17 +- .../java/org/apache/sysds/common/Builtins.java | 2 + .../builtin/BuiltinConfusionMatrixTest.java| 195 + .../builtin/BuiltinMulticlassSVMPredictTest.java | 186 .../builtin/BuiltinMulticlassSVMTest.java | 54 +++--- .../builtin/{l2svm.dml => confusionMatrix.dml} | 8 +- src/test/scripts/functions/builtin/l2svm.dml | 2 +- src/test/scripts/functions/builtin/multisvm.R | 10 +- src/test/scripts/functions/builtin/multisvm.dml| 4 +- .../builtin/{l2svm.dml => multisvmPredict.dml} | 7 +- .../functions/federated/FederatedL2SVMTest.dml | 2 +- .../federated/FederatedL2SVMTestReference.dml | 2 +- 19 files changed, 653 insertions(+), 227 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 1196566..3c9782f 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -304,6 +304,7 @@ SYSTEMDS-390 New Builtin Functions IV * 392 Builtin function for missing value imputation via FDs OK * 393 Builtin to find Connected Components of a graphOK * 394 Builtin for one-hot encoding of matrix (not frame), see table OK + * 395 SVM rework and utils (confusionMatrix, msvmPredict)OK Others: * Break append instruction to cbind and rbind diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml index 4cbcdb5..04d6524 100644 --- a/scripts/algorithms/l2-svm.dml +++ b/scripts/algorithms/l2-svm.dml @@ -21,10 +21,6 @@ # Implements binary-class SVM with squared slack variables # -# Example Usage: -# Assume L2SVM_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume epsilon = 0.001, lambda = 1, maxiterations = 100 # # INPUT PARAMETERS: # - @@ -40,111 +36,31 @@ # maxiter Int 100 Maximum number of conjugate gradient 
iterations # model String --- Location to write model # fmt String "text" The output format of the output, such as "text" or "csv" -# Log String --- [OPTIONAL] Location to write the log file # - -# hadoop jar SystemDS.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y \ -# icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# +# Example Execution: +# systemds -f $SYSTEMDS_ROOT/scripts/algorithms/l2-svm.dml \ +# -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y \ +# icpt=FALSE tol=0.001 reg=1 maxiter=100 \ +# model=$OUPUT_DIR/w fmt="text" + # Note about inputs: # Assumes that labels (entries in Y) are set to either -1 or +1 or non-negative integers fmt = ifdef($fmt, "text") -intercept = ifdef($icpt, 0) +intercept = ifdef($icpt, FALSE) epsilon = ifdef($tol, 0.001) lambda = ifdef($reg, 1.0) -maxiterations = ifdef($maxiter, 100) +maxIterations = ifdef($maxiter, 100) +verbose = ifdef($verbose, FALSE) X = read($X) Y = read($Y) -#check input parameter assertions -if(nrow(X) < 2) -
[systemml] branch master updated: [SYSTEMDS-254] Fixes distributed slice finding implementation
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new e78962b [SYSTEMDS-254] Fixes distributed slice finding implementation e78962b is described below commit e78962b8db5b1c90fdb37ae6d9c6284f744cdbfc Author: gilgenbergg AuthorDate: Sat May 23 23:24:30 2020 +0200 [SYSTEMDS-254] Fixes distributed slice finding implementation Closes #908. --- docs/Tasks.txt | 1 + scripts/staging/slicing/base/Bucket.py | 2 + scripts/staging/slicing/base/SparkNode.py | 70 +--- scripts/staging/slicing/base/__init__.py | 4 +- scripts/staging/slicing/base/node.py | 74 -- scripts/staging/slicing/base/slicer.py | 12 ++-- scripts/staging/slicing/base/union_slicer.py | 19 ++ .../slicing/spark_modules/join_data_parallel.py| 24 +++ .../staging/slicing/spark_modules/spark_slicer.py | 9 +-- .../slicing/spark_modules/spark_union_slicer.py| 9 +-- .../staging/slicing/spark_modules/spark_utils.py | 15 ++--- .../slicing/spark_modules/union_data_parallel.py | 22 --- .../slicing/tests/classification/__init__.py | 4 +- .../staging/slicing/tests/regression/__init__.py | 4 +- 14 files changed, 117 insertions(+), 152 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 7b64145..1196566 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -209,6 +209,7 @@ SYSTEMDS-250 Extended Slice Finding * 251 Alternative slice enumeration approach OK * 252 Initial data slicing implementation Python OK * 253 Distributed slicing algorithms (task/data parallel)OK + * 254 Consolidation and fixes distributed slice finding OK SYSTEMDS-260 Misc Tools * 261 Stable marriage algorithm OK diff --git a/scripts/staging/slicing/base/Bucket.py b/scripts/staging/slicing/base/Bucket.py index 0277f6d..dc8402e 100644 --- a/scripts/staging/slicing/base/Bucket.py +++ b/scripts/staging/slicing/base/Bucket.py @@ -45,6 +45,8 @@ class Bucket: 
self.parents = [] self.sum_error = 0 self.size = 0 +self.s_upper = 0 +self.s_lower = 0 self.score = 0 self.error = 0 self.max_tuple_error = 0 diff --git a/scripts/staging/slicing/base/SparkNode.py b/scripts/staging/slicing/base/SparkNode.py index fbaa0bd..a123624 100644 --- a/scripts/staging/slicing/base/SparkNode.py +++ b/scripts/staging/slicing/base/SparkNode.py @@ -65,25 +65,16 @@ class SparkNode: print(mask) if loss_type == 0: self.calc_l2(mask) -if loss_type == 1: +elif loss_type == 1: self.calc_class(mask) def calc_class(self, mask): self.e_max = 1 -size = 0 -mistakes = 0 -for row in self.preds: -flag = True -for attr in mask: -if attr not in row[0].indices: -flag = False -if flag: -size = size + 1 -if row[1] == 0: -mistakes += 1 -self.size = size -if size != 0: -self.loss = mistakes / size +filtered = self.filter_by_mask(mask) +self.size = len(filtered) +mistakes = len(list(filter(lambda row: row[1] == 0, filtered))) +if self.size != 0: +self.loss = mistakes / self.size else: self.loss = 0 self.e_upper = self.loss @@ -92,25 +83,22 @@ class SparkNode: max_tuple_error = 0 sum_error = 0 size = 0 -for row in self.preds: -flag = True -for attr in mask: -if attr not in row[0].indices: -flag = False -if flag: -size = size + 1 -if row[1] > max_tuple_error: -max_tuple_error = row[1] -sum_error = sum_error + row[1] +filtered = self.filter_by_mask(mask) +self.size = len(filtered) +for row in filtered: +if row[1] > max_tuple_error: +max_tuple_error = row[1] +sum_error += row[1] self.e_max = max_tuple_error self.e_upper = max_tuple_error self.e_max_upper = max_tuple_error -if size != 0: -self.loss = sum_error/size +if self.size != 0: +self.loss = sum_error/self.size else: self.loss = 0 -self.size = size -self.s_upper = size + +def filter_by_mask(self, mask): +return list(filter(lambda row: all(attr in row[0].indices for attr in mask), self.preds)) def calc_s_upper(self, c
[systemml] branch master updated: [SYSTEMDS-394] New builtin function toOneHot (one hot encoding)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 86fd7b3 [SYSTEMDS-394] New builtin function toOneHot (one hot encoding) 86fd7b3 is described below commit 86fd7b3d4aae5dbca8090e2638e0abc4da696655 Author: Patrick Deutschmann AuthorDate: Sat May 23 22:57:17 2020 +0200 [SYSTEMDS-394] New builtin function toOneHot (one hot encoding) Adds a builtin function toOneHot which transforms a vector containing integers into a one-hot-encoded matrix (note transform works over frames and reassigns the integer codes) Closes #916. --- docs/Tasks.txt | 1 + docs/dml-language-reference.md | 2 +- scripts/builtin/toOneHot.dml | 43 .../java/org/apache/sysds/common/Builtins.java | 1 + .../functions/builtin/BuiltinToOneHotTest.java | 113 + src/test/scripts/functions/builtin/toOneHot.dml| 25 + 6 files changed, 184 insertions(+), 1 deletion(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 6539ff8..7b64145 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -302,6 +302,7 @@ SYSTEMDS-390 New Builtin Functions IV * 391 New GLM builtin function (from algorithms) OK * 392 Builtin function for missing value imputation via FDs OK * 393 Builtin to find Connected Components of a graphOK + * 394 Builtin for one-hot encoding of matrix (not frame), see table OK Others: * Break append instruction to cbind and rbind diff --git a/docs/dml-language-reference.md b/docs/dml-language-reference.md index 652a451..76f2656 100644 --- a/docs/dml-language-reference.md +++ b/docs/dml-language-reference.md @@ -699,7 +699,7 @@ cummin() | Column prefix-min (For row-prefix min, use cummin(t(X)) | Input: matr cummax() | Column prefix-max (For row-prefix min, use cummax(t(X)) | Input: matrix Output: matrix of the same dimensions | A = matrix("3 4 1 6 5 2", rows=3, cols=2) B = cummax(A) The output matrix B = [[3, 4], [3, 6], 
[5, 6]] sample(range, size, replacement, seed) | Sample returns a column vector of length size, containing uniform random numbers from [1, range] | Input: range: integer size: integer replacement: boolean (Optional, default: FALSE) seed: integer (Optional) Output: Matrix dimensions are size x 1 | sample(100, 5) sample(100, 5, TRUE) sample(100, 120, TRUE) sample(100, 5, 1234) # 1234 is the seed sample(100, 5, TRUE, 1234) outer(vector1, vector2, "op") | Applies element wise binary operation "op" (for example: "", "==", "=", "*", "min") on the all combination of vector. Note: Using "*", we get outer product of two vectors. | Input: vectors of same size d, string Output: matrix of size d X d | A = matrix("1 4", rows = 2, cols = 1) B = matrix("3 6", rows = 1, cols = 2) C = outer(A, B, "") D = outer(A, B, "*") The output matrix C = [[1, 1], [0, 1]] The out [...] - +toOneHot(X, num_classes)| Converts a vector containing integers to a one-hot-encoded matrix | Input: vector with N integer entries between 1 and num_classes, number of columns (must be >= largest value in X)Output: one-hot-encoded matrix with shape (N, num_classes) | X = round(rand(rows=10, cols=1, min=2, max=10)); num_classes = ​12; Y = toOneHot(X, num_classes); Alternative forms of table() diff --git a/scripts/builtin/toOneHot.dml b/scripts/builtin/toOneHot.dml new file mode 100644 index 000..8134f5c --- /dev/null +++ b/scripts/builtin/toOneHot.dml @@ -0,0 +1,43 @@ +#- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#- + +# One-hot encodes a vector + +# INPUT PARAMETERS: +# +# NAME
[systemml] branch master updated: [MINOR] Update Dockerfile (fixes, new R dependency)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new b742443 [MINOR] Update Dockerfile (fixes, new R dependency) b742443 is described below commit b7424431f3f6131e3245c03ba8d40aa81d2cb245 Author: Sebastian AuthorDate: Sat May 23 22:40:54 2020 +0200 [MINOR] Update Dockerfile (fixes, new R dependency) Closes #918. --- .github/action/Dockerfile | 2 +- .github/workflows/componentTests.yml | 17 - docker/build.sh| 6 +++--- docker/entrypoint.sh | 2 +- docker/sysds.Dockerfile| 4 ++-- src/test/scripts/installDependencies.R | 1 + 6 files changed, 24 insertions(+), 8 deletions(-) diff --git a/.github/action/Dockerfile b/.github/action/Dockerfile index 5a23a8e..36420da 100644 --- a/.github/action/Dockerfile +++ b/.github/action/Dockerfile @@ -19,4 +19,4 @@ # #- -FROM sebaba/testingsysds:0.2 +FROM sebaba/testingsysds:2.0 diff --git a/.github/workflows/componentTests.yml b/.github/workflows/componentTests.yml index 0cc934c..195fb5c 100644 --- a/.github/workflows/componentTests.yml +++ b/.github/workflows/componentTests.yml @@ -52,4 +52,19 @@ jobs: run: mvn clean compile test-compile - name: Component Tests - run: mvn surefire:test -DskipTests=false -Dtest=org.apache.sysds.test.component.*.** + run: | +log="/tmp/sysdstest.log" +echo "Starting Tests" +mvn surefire:test -DskipTests=false -Dtest=org.apache.sysds.test.component.*.** 2>&1 > $log +grep_args="SUCCESS" +grepvals="$( tail -n 100 $log | grep $grep_args)" +if [[ $grepvals == *"SUCCESS"* ]]; then + echo "- last 100 lines from test " + tail -n 100 $log + echo "-- last 100 lines from test end ---" + exit 0 +else + echo "\n $(cat $log)" + exit 1 +fi + diff --git a/docker/build.sh b/docker/build.sh index 73add3c..643813e 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -23,13 +23,13 @@ # Build the docker containers # The first build 
is for running systemds through docker. -docker image build -f docker/sysds.Dockerfile -t sebaba/sysds:0.2 . +docker image build -f docker/sysds.Dockerfile -t sebaba/sysds:2.0 . # The second build is for testing systemds. This image installs the R dependencies needed to run the tests. -docker image build -f docker/testsysds.Dockerfile -t sebaba/testingsysds:0.2 . +docker image build -f docker/testsysds.Dockerfile -t sebaba/testingsysds:2.0 . # The third build is python docker for systemds. -docker image build -f docker/pythonsysds.Dockerfile -t sebaba/pythonsysds:0.2 . +docker image build -f docker/pythonsysds.Dockerfile -t sebaba/pythonsysds:2.0 . # You might want to prune the docker system afterwards using # docker system prune \ No newline at end of file diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 713e948..fd80cbe 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -24,7 +24,7 @@ cd /github/workspace -build="$(mvn -T 2 clean compile test-compile surefire:test | grep 'BUILD')" +build="$(mvn -T 2 clean compile test-compile | grep 'BUILD')" if [[ $build == *"SUCCESS"* ]]; then echo "Successfull build" diff --git a/docker/sysds.Dockerfile b/docker/sysds.Dockerfile index 01a0094..cf788a1 100644 --- a/docker/sysds.Dockerfile +++ b/docker/sysds.Dockerfile @@ -39,11 +39,11 @@ RUN wget http://archive.apache.org/dist/maven/maven-3/$MAVEN_VERSION/binaries/ap # Install Extras RUN apk add --no-cache git bash -RUN git clone https://github.com/apache/systemml.git +RUN git clone https://github.com/apache/systemml.git systemds WORKDIR /usr/src/systemds/ -RUN mvn package +RUN mvn clean package -P distribution # Remove Maven since it is not needed for running the system RUN rm -r /usr/lib/mvn diff --git a/src/test/scripts/installDependencies.R b/src/test/scripts/installDependencies.R index 9689f63..4696361 100644 --- a/src/test/scripts/installDependencies.R +++ b/src/test/scripts/installDependencies.R @@ -55,6 +55,7 @@ custom_install("caret"); 
custom_install("sigmoid"); custom_install("DescTools"); custom_install("mice"); +custom_install("mclust"); print("Installation Done")
[systemml] branch master updated: [SYSTEMDS-393] Builtin function for connected components, tests
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 50fac03 [SYSTEMDS-393] Builtin function for connected components, tests 50fac03 is described below commit 50fac03eed584c2019c95a4702367ddc442e7bf2 Author: Matthias Boehm AuthorDate: Sat May 23 22:09:04 2020 +0200 [SYSTEMDS-393] Builtin function for connected components, tests This patch adds a new built-in function for finding the connected components in a undirected graph, represented as a symmetric 0/1 matrix. On a scenario of finding the connected components of the DBLP co-author graph (for selected DB venues and >=2011 -> 35632x35632, 310582 non-zeros), the algorithm terminated in 12 iterations, found 2443 connected components (w/ 837 single-author components), and took only 4.1s including I/O, transform recoding, and graph construction. --- docs/Tasks.txt | 1 + scripts/builtin/components.dml | 53 .../java/org/apache/sysds/common/Builtins.java | 1 + .../functions/builtin/BuiltinComponentsTest.java | 94 ++ .../functions/builtin/ConnectedComponents.dml | 28 +++ 5 files changed, 177 insertions(+) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 66d0901..6539ff8 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -301,6 +301,7 @@ SYSTEMDS-380 Memory Footprint SYSTEMDS-390 New Builtin Functions IV * 391 New GLM builtin function (from algorithms) OK * 392 Builtin function for missing value imputation via FDs OK + * 393 Builtin to find Connected Components of a graphOK Others: * Break append instruction to cbind and rbind diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml new file mode 100644 index 000..f760a49 --- /dev/null +++ b/scripts/builtin/components.dml @@ -0,0 +1,53 @@ +#- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#- + +# Computes the connected components of a graph and returns a +# vector indicating the assignment of vertices to components, +# where each component is identified by the maximum vertex ID +# (i.e., row/column position of the input graph) + +m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose = TRUE) + return (Matrix[Double] C) +{ + # ensure there are no self-edges in the graph + if( trace(G) != 0 ) { +G = G - diag(diag(G)); +if(verbose) + print("Connected Components: warning - removed self-edges from input graph"); + } + + # initialize state with vertex ids + c = seq(1,nrow(G)); + diff = Inf; + iter = 1; + + # iterative computation of connected components + while( diff > 0 & (maxi==0 | maxi<=iter) ) { +u = max(rowMaxs(G * t(c)), c); +diff = sum(u != c) +c = u; # update assignment +if( verbose ) + print("Connected components: iter = "+iter+", #diff = "+diff); +iter = iter + 1; + } + + C = c; +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index b738d40..6c53692 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -68,6 +68,7 @@ public enum Builtins { COLSD("colSds", false), COLSUM("colSums", false), 
COLVAR("colVars", false), + COMPONENTS("components", true), CONV2D("conv2d", false), CONV2D_BACKWARD_FILTER("conv2d_backward_filter", false), CONV2D_BACKWARD_DATA("conv2d_backward_data", false), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java new file mode 100644 index 000..ca54528
[systemml] branch master updated: [MINOR] Performance lineage tracing of literal operands
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 6f44461 [MINOR] Performance lineage tracing of literal operands 6f44461 is described below commit 6f44461692b5e3eea2a1554425ce6228ae4ddea2 Author: Matthias Boehm AuthorDate: Sat May 23 00:02:00 2020 +0200 [MINOR] Performance lineage tracing of literal operands This patch makes a minor performance improvement: it reuses thread-local string builders (as done for instructions) for the construction of lineage literals as well. On the following reduced example script, this patch improved the total execution time from 56s to 50.3s due to partially removed garbage collection overhead: X = rand(rows=10, cols=10, seed=1); for(i in 1:1e6) { tmp1 = ((X + 1) * 2) / 3 tmp2 = (tmp1 - 1) * tmp1 X = tmp2; if( i%%1e5==0 ) print("Iteration "+i); } print(sum(X)); Notice that this script creates over one million lineage items for literals to cover the 1e6 distinct values of the loop variable i.
--- .../runtime/controlprogram/IfProgramBlock.java | 12 --- .../runtime/instructions/InstructionUtils.java | 19 +- .../sysds/runtime/instructions/cp/CPOperand.java | 23 +++--- 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java b/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java index 9d0d58e..94ec8e1 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java @@ -95,10 +95,8 @@ public class IfProgramBlock extends ProgramBlock ec.getLineagePath().setBranchPredicateValue(predResult.getBooleanValue()); //execute if statement - if(predResult.getBooleanValue()) - { - try - { + if(predResult.getBooleanValue()) { + try { for (int i=0 ; i < _childBlocksIfBody.size() ; i++) { _childBlocksIfBody.get(i).execute(ec); } @@ -106,13 +104,11 @@ public class IfProgramBlock extends ProgramBlock catch(DMLScriptException e) { throw e; } - catch(Exception e) - { + catch(Exception e) { throw new DMLRuntimeException(this.printBlockErrorLocation() + "Error evaluating if statement body ", e); } } - else - { + else { try { for (int i=0 ; i < _childBlocksElseBody.size() ; i++) { _childBlocksElseBody.get(i).execute(ec); diff --git a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java index 740d821..1401bfa 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java @@ -955,24 +955,25 @@ public class InstructionUtils if( operand >= parts.length ) throw new DMLRuntimeException("Operand position " + operand + " exceeds the length of the instruction."); - //replace and reconstruct string parts[operand] = newValue; - StringBuilder sb = new StringBuilder(instStr.length()); - 
sb.append(parts[0]); - for( int i=1; i
[systemml] branch master updated: [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part II
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new f29ae42 [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part II f29ae42 is described below commit f29ae426be1722fba9468609976709068e6e5d7d Author: Matthias Boehm AuthorDate: Fri May 22 22:38:54 2020 +0200 [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part II This patch fixes many additional issues in lineage tracing and parsing in order to support the round-trip for steplm and kmeans. 1) Lineage tracing with default arguments of function call parameters (so far missing arguments were traced as literal variable names) 2) Lineage Tracing: rshape with parameters, ctable w/ dimensions, rand/seq w/ variable rows/cols, from/to/incr inputs 3) Lineage Parsing: rshape, rdiag, nrow, ncol, all cast ops, ifelse with scalar/matrix inputs (so far block size wrong), ctable w/ dimensions, gappend spark ops 4) New lineage parfor algorithm tests: steplm, kmeans --- scripts/builtin/kmeans.dml | 6 +-- src/main/java/org/apache/sysds/common/Types.java | 35 ++--- .../apache/sysds/hops/recompile/Recompiler.java| 8 +-- .../apache/sysds/hops/rewrite/HopRewriteUtils.java | 37 -- .../RewriteAlgebraicSimplificationDynamic.java | 6 +-- .../runtime/instructions/InstructionUtils.java | 41 ++- .../instructions/cp/CtableCPInstruction.java | 27 ++ .../instructions/cp/DataGenCPInstruction.java | 56 ++-- .../instructions/cp/FunctionCallCPInstruction.java | 7 ++- .../instructions/cp/ReshapeCPInstruction.java | 9 .../instructions/spark/RandSPInstruction.java | 10 +++- .../sysds/runtime/lineage/LineageItemUtils.java| 59 +- .../functions/lineage/LineageTraceParforTest.java | 34 - ...aceParfor4.dml => LineageTraceParforKmeans.dml} | 3 +- ...aceParfor4.dml => LineageTraceParforSteplm.dml} | 0 15 files changed, 211 insertions(+), 127 deletions(-)
diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml index 23482da..96591c6 100644 --- a/scripts/builtin/kmeans.dml +++ b/scripts/builtin/kmeans.dml @@ -60,8 +60,8 @@ m_kmeans = function(Matrix[Double] X, Integer k = 0, Integer runs = 10, Integer print ("Taking data samples for initialization..."); - [sample_maps, samples_vs_runs_map, sample_block_size] = - get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid); + [sample_maps, samples_vs_runs_map, sample_block_size] = get_sample_maps( +num_records, num_runs, num_centroids * avg_sample_size_per_centroid); is_row_in_samples = rowSums (sample_maps); X_samples = sample_maps %*% X; @@ -230,7 +230,7 @@ get_sample_maps = function (int num_records, int num_samples, int approx_sample_ # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1": is_sample_rec_id_within_range = (sample_rec_ids <= num_records); sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range -+ (num_records + 1) * (1 - is_sample_rec_id_within_range); + + (num_records + 1) * (1 - is_sample_rec_id_within_range); # Rearrange all samples (and their out-of-range indicators) into one column-vector: sample_rec_ids = matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE); diff --git a/src/main/java/org/apache/sysds/common/Types.java b/src/main/java/org/apache/sysds/common/Types.java index d693b7f..2d66e81 100644 --- a/src/main/java/org/apache/sysds/common/Types.java +++ b/src/main/java/org/apache/sysds/common/Types.java @@ -206,6 +206,15 @@ public class Types MULT2, MINUS1_MULT, MINUS_RIGHT, POW2, SUBTRACT_NZ; + + public boolean isScalarOutput() { + return this == CAST_AS_SCALAR + || this == NROW || this == NCOL + || this == LENGTH || this == EXISTS + || this == IQM || this == LINEAGE + || this == MEDIAN; + } + @Override public String toString() { switch(this) { @@ -244,7 +253,7 @@ public class Types case "ucumk+": return CUMSUM; case "ucumk+*": return 
CUMSUMPROD; case "*2": return MULT2; - case "!": return OpOp1.NOT; + case "!": return NOT; cas
[systemml] branch master updated: [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part I
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 498aef6 [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part I 498aef6 is described below commit 498aef6c5915c8256da44a041049cc4a59a84d41 Author: Matthias Boehm AuthorDate: Wed May 20 23:37:12 2020 +0200 [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part I This patch adds steplm as a new parfor lineage tracing/parsing test case and fixes many related lineage tracing/parsing issues: 1) Parfor Lineage Merge: robustness against empty first worker (no lineage for result variable, e.g., due to conditional control flow) 2) Lineage Tracing: support for replace, rexpand 3) Lineage Parsing: support for seq, ifelse, log, log_nz, groupedagg, rmempty, replace, rexpand 4) Steplm: Improved initialization of parfor result variables to avoid cycles and stackoverflow errors in overwrite scenarios. 5) Minor: parsing of ternary operator codes However, additional fixes are required for lineage tracing wrt default handling in functions, literal replacement during dynamic recompilation, and better overwrite support in matrix indexing (currently this creates lineage cycles). 
--- docs/Tasks.txt | 1 + scripts/builtin/steplm.dml | 1 + src/main/java/org/apache/sysds/common/Types.java | 2 +- .../apache/sysds/hops/rewrite/HopRewriteUtils.java | 9 ++- .../runtime/controlprogram/ParForProgramBlock.java | 6 +- .../instructions/cp/DataGenCPInstruction.java | 12 .../cp/ParameterizedBuiltinCPInstruction.java | 57 +--- .../sysds/runtime/lineage/LineageItemUtils.java| 75 ++ .../functions/lineage/LineageTraceParforTest.java | 19 +- .../functions/lineage/LineageTraceParfor1.dml | 4 +- .../functions/lineage/LineageTraceParfor2.dml | 4 +- .../functions/lineage/LineageTraceParfor3.dml | 4 +- ...ageTraceParfor3.dml => LineageTraceParfor4.dml} | 13 ++-- 13 files changed, 161 insertions(+), 46 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 6d5ff80..66d0901 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -272,6 +272,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration * 336 Better use of cache status to handle multithreading * 337 Adjust disk I/O speed by recording actual time taken OK * 338 Extended lineage tracing (rmEmpty, lists), partial rewritesOK + * 339 Lineage tracing robustness (indexed updates, algorithms) SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse * 341 Finalize unmarking of loop dependent operations diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml index fd0018d..28208c8 100644 --- a/scripts/builtin/steplm.dml +++ b/scripts/builtin/steplm.dml @@ -126,6 +126,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, while (continue) { # Subsequent passes over the features beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1); + AICs = matrix(0, 1, m_orig); # full overwrite parfor (i in 1:m_orig, check = 0) { if (as.scalar(columns_fixed[1, i]) == 0) { # Construct the feature matrix diff --git a/src/main/java/org/apache/sysds/common/Types.java b/src/main/java/org/apache/sysds/common/Types.java index 11e597e..d693b7f 100644 --- a/src/main/java/org/apache/sysds/common/Types.java 
+++ b/src/main/java/org/apache/sysds/common/Types.java @@ -359,7 +359,7 @@ public class Types case "cm": return OpOp3.MOMENT; case "+*": return OpOp3.PLUS_MULT; case "-*": return OpOp3.MINUS_MULT; - default: return OpOp3.valueOf(code); + default: return OpOp3.valueOf(code.toUpperCase()); } } } diff --git a/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java index 9d86b4d..9e73fcc 100644 --- a/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java +++ b/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java @@ -779,8 +779,13 @@ public class HopRewriteUtils } public static TernaryOp createTernaryOp(Hop mleft, Hop smid, Hop mright, OpOp3 op) { - TernaryOp ternOp = new TernaryOp("tmp", DataType.MATRIX, ValueType.FP64, op, mleft, smid, mright); - ternOp.setBlocksize(mleft.getBlocksize()); + //NOTE: for ifelse it's sufficient to check mright as smid==mright + System.out.print
[systemml] branch master updated: [SYSTEMDS-344] New IPA pass for marking deterministic functions/SBs
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new f1bffeb [SYSTEMDS-344] New IPA pass for marking deterministic functions/SBs f1bffeb is described below commit f1bffeb299eec6a57d5290fd12c81ba92c9f03e2 Author: arnabp AuthorDate: Sun May 17 22:24:22 2020 +0200 [SYSTEMDS-344] New IPA pass for marking deterministic functions/SBs This patch moves the fragile and less efficient non-determinism check in runtime to compile time. This adds a new IPA rewrite to unmark the functions and StatementBlocks containing direct or transitive nondeterministic calls (e.g. rand with UNSPECIFIED_SEED) for lineage caching. AMLS project SS 2020. Closes #911. --- docs/Tasks.txt | 2 +- src/main/java/org/apache/sysds/hops/DataGenOp.java | 8 + .../sysds/hops/ipa/IPAPassFlagNonDeterminism.java | 201 + .../sysds/hops/ipa/InterProceduralAnalysis.java| 2 + .../apache/sysds/hops/rewrite/HopRewriteUtils.java | 7 + .../org/apache/sysds/parser/DMLTranslator.java | 1 + .../sysds/parser/FunctionStatementBlock.java | 9 + .../org/apache/sysds/parser/StatementBlock.java| 10 + .../runtime/controlprogram/BasicProgramBlock.java | 4 +- .../controlprogram/FunctionProgramBlock.java | 9 + .../instructions/cp/FunctionCallCPInstruction.java | 6 +- .../apache/sysds/runtime/lineage/LineageCache.java | 8 +- .../sysds/runtime/util/ProgramConverter.java | 4 + .../functions/lineage/FunctionFullReuseTest.java | 7 +- .../functions/lineage/FunctionFullReuse8.dml | 57 ++ .../scripts/functions/lineage/LineageReuseAlg2.dml | 4 +- 16 files changed, 322 insertions(+), 17 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 8163e9f..6d5ff80 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -277,7 +277,7 @@ SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse * 341 Finalize unmarking of loop dependent operations * 342 
Mark functions as last-use to enable early eviction * 343 Identify equal last level HOPs to ensure SB-level reuse - * 344 Unmark functions/SBs containing non-determinism for caching + * 344 Unmark functions/SBs containing non-determinism for cachingOK * 345 Compiler assisted cache configuration SYSTEMDS-350 Data Cleaning Framework diff --git a/src/main/java/org/apache/sysds/hops/DataGenOp.java b/src/main/java/org/apache/sysds/hops/DataGenOp.java index edcb448..8fdf98d 100644 --- a/src/main/java/org/apache/sysds/hops/DataGenOp.java +++ b/src/main/java/org/apache/sysds/hops/DataGenOp.java @@ -468,6 +468,14 @@ public class DataGenOp extends MultiThreadedHop return ret; } + public boolean hasUnspecifiedSeed() { + if (_op == OpOpDG.RAND || _op == OpOpDG.SINIT) { + Hop seed = getInput().get(_paramIndexMap.get(DataExpression.RAND_SEED)); + return seed.getName().equals(String.valueOf(DataGenOp.UNSPECIFIED_SEED)); + } + return false; + } + public Hop getConstantValue() { return getInput().get(_paramIndexMap.get(DataExpression.RAND_MIN)); } diff --git a/src/main/java/org/apache/sysds/hops/ipa/IPAPassFlagNonDeterminism.java b/src/main/java/org/apache/sysds/hops/ipa/IPAPassFlagNonDeterminism.java new file mode 100644 index 000..a96 --- /dev/null +++ b/src/main/java/org/apache/sysds/hops/ipa/IPAPassFlagNonDeterminism.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.hops.ipa; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; + +import org.apache.sysds.hops.FunctionOp; +import org.apache.sysds.hops.Hop; +import org.apache.sysds.hops.HopsException; +import org.apache.sysds.hops.rewrite.HopRewriteUtils; +import org.apache.sysds.parser.DMLProgram; +import org.apache.sysds.parser.F
[systemml] branch master updated: [SYSTEMDS-74] Cleanup lineage tracing (unnecessary variable names)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new bd0b319 [SYSTEMDS-74] Cleanup lineage tracing (unnecessary variable names) bd0b319 is described below commit bd0b319df52215b359c04590ed4091ad136ea4f9 Author: Matthias Boehm AuthorDate: Sat May 16 17:00:59 2020 +0200 [SYSTEMDS-74] Cleanup lineage tracing (unnecessary variable names) This patch removes unnecessary attributes from lineage items in order to reduce the size (and GC overhead) for long lineage traces. So far, each lineage item kept the variable name to which it was bound. As lineage information should be independent of such properties, this information was already ignored for lineage hashing and comparisons. In a few places, however, we use it to rewire placeholders, which is now cleaned up. --- .../cp/AggregateUnaryCPInstruction.java| 9 +-- .../instructions/cp/ComputationCPInstruction.java | 8 ++- .../instructions/cp/DataGenCPInstruction.java | 6 +- .../instructions/cp/ListIndexingCPInstruction.java | 7 +- .../instructions/cp/MatrixAppendCPInstruction.java | 7 +- .../cp/MatrixBuiltinNaryCPInstruction.java | 7 +- .../cp/MatrixIndexingCPInstruction.java| 7 +- .../cp/MultiReturnBuiltinCPInstruction.java| 15 ++-- .../cp/ParameterizedBuiltinCPInstruction.java | 16 ++--- .../cp/ScalarBuiltinNaryCPInstruction.java | 8 +-- .../instructions/cp/SpoofCPInstruction.java| 9 +-- .../instructions/cp/VariableCPInstruction.java | 41 +-- .../fed/ComputationFEDInstruction.java | 7 +- .../spark/BuiltinNarySPInstruction.java| 7 +- .../spark/ComputationSPInstruction.java| 7 +- .../spark/MatrixIndexingSPInstruction.java | 7 +- .../instructions/spark/RandSPInstruction.java | 5 +- .../instructions/spark/WriteSPInstruction.java | 5 +- .../apache/sysds/runtime/lineage/LineageCache.java | 6 +- .../apache/sysds/runtime/lineage/LineageItem.java 
| 81 -- .../sysds/runtime/lineage/LineageItemUtils.java| 37 ++ .../apache/sysds/runtime/lineage/LineageMap.java | 71 ++- .../sysds/runtime/lineage/LineageParser.java | 15 ++-- .../sysds/runtime/lineage/LineageRewriteReuse.java | 58 .../sysds/runtime/lineage/LineageTraceable.java| 32 - .../test/functions/lineage/LineageReadTest.java| 2 +- .../test/functions/lineage/LineageRewriteTest.java | 4 +- 27 files changed, 251 insertions(+), 233 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java index 7c52737..5f053e9 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java @@ -147,12 +147,9 @@ public class AggregateUnaryCPInstruction extends UnaryCPInstruction throw new DMLRuntimeException("Lineage trace " + "for variable "+input1.getName()+" unavailable."); - LineageItem li = DMLScript.LINEAGE_DEDUP ? - LineageItemUtils.rDecompress(ec.getLineageItem(input1)) : - ec.getLineageItem(input1); - - ec.setScalarOutput(output_name, new StringObject( - Explain.explain(li))); + LineageItem li = !DMLScript.LINEAGE_DEDUP ? 
ec.getLineageItem(input1): + LineageItemUtils.rDecompress(ec.getLineageItem(input1)); + ec.setScalarOutput(output_name, new StringObject(Explain.explain(li))); break; } default: { diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java index 3eecb80..a1c3568 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java @@ -19,6 +19,7 @@ package org.apache.sysds.runtime.instructions.cp; +import org.apache.commons.lang3.tuple.Pair; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; import org.ap
[systemml] branch master updated: [MINOR] Cache Python pip and apt dependencies
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 9bc5232 [MINOR] Cache Python pip and apt dependencies 9bc5232 is described below commit 9bc52328f5535492d50cca811a67bd81829220ce Author: Sebastian AuthorDate: Fri May 15 22:53:58 2020 +0200 [MINOR] Cache Python pip and apt dependencies Closes #913. --- .github/workflows/python.yml | 26 ++ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 156d843..c0002b4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -53,9 +53,19 @@ jobs: with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} -restore-keys: | - ${{ runner.os }}-maven- - + +- name: Cache Pip Dependencies + uses: actions/cache@v1 + with: +path: ~/.cache/pip +key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('src/main/python/setup.py') }} + +- name: Cache Deb Dependencies + uses: actions/cache@v1 + with: +path: /var/cache/apt/archives +key: ${{ runner.os }}-${{ hashFiles('.github/workflows/python.yml') }} + - name: Maven clean & package run: mvn clean package -P distribution @@ -65,15 +75,7 @@ jobs: python-version: ${{ matrix.python-version }} architecture: 'x64' -- name: Cache Pip Dependencies - uses: actions/cache@v1 - with: -path: ~/.cache/pip -key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('src/main/python/setup.py') }} -restore-keys: | - ${{ runner.os }}-pip-${{ matrix.python-version }}- - -- name: Install protobuf +- name: Install Protobuf run: sudo apt-get install protobuf-compiler libprotoc-dev - name: Install pip Dependencies
[systemml] branch master updated: [MINOR] Fix missing licenses and build rat check
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 99a7271 [MINOR] Fix missing licenses and build rat check 99a7271 is described below commit 99a7271a12a8d30e9f604cb46db5c6c6ee20d241 Author: Sebastian AuthorDate: Fri May 15 22:49:56 2020 +0200 [MINOR] Fix missing licenses and build rat check ignore __pycache__ folders for rat add license missing in compression tests add license missing in scripts - __init__.py files rat check for build workflow Closes #912. --- .github/workflows/build.yml | 2 +- pom.xml | 1 + scripts/staging/slicing/__init__.py | 20 scripts/staging/slicing/base/__init__.py | 20 scripts/staging/slicing/tests/__init__.py| 20 .../staging/slicing/tests/classification/__init__.py | 20 scripts/staging/slicing/tests/regression/__init__.py | 20 .../compress/colgroup/JolEstimateDDCTest.java| 19 +++ .../component/compress/colgroup/JolEstimateTest.java | 19 +++ 9 files changed, 140 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0ae7f82..ac762cd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,4 +48,4 @@ jobs: ${{ runner.os }}-maven- - name: Build - run: mvn package + run: mvn package -P rat diff --git a/pom.xml b/pom.xml index 6d6ab8d..bbc3088 100644 --- a/pom.xml +++ b/pom.xml @@ -509,6 +509,7 @@ **/*.mtx **/*.mtd **/*.out + **/__pycache__/** **/part-* **/*.keep **/target/** diff --git a/scripts/staging/slicing/__init__.py b/scripts/staging/slicing/__init__.py index e69de29..e66abb4 100644 --- a/scripts/staging/slicing/__init__.py +++ b/scripts/staging/slicing/__init__.py @@ -0,0 +1,20 @@ +# - +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# - diff --git a/scripts/staging/slicing/base/__init__.py b/scripts/staging/slicing/base/__init__.py index e69de29..e66abb4 100644 --- a/scripts/staging/slicing/base/__init__.py +++ b/scripts/staging/slicing/base/__init__.py @@ -0,0 +1,20 @@ +# - +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# - diff --git a/scripts/staging/slicing/tests/__init__.py b/scripts/staging/slicing/tests/__init__.py index e69de29..e66abb4 100644 --- a/scripts/staging/slicing/tests/__init__.py +++ b/scripts/staging
[systemml] branch master updated: [SYSTEMDS-263] ONNX graph importer (Python API, docs, tests)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 0ac0c25 [SYSTEMDS-263] ONNX graph importer (Python API, docs, tests) 0ac0c25 is described below commit 0ac0c2571b39e96f7a117fd317d73443632f6f26 Author: Lukas Timpl AuthorDate: Thu May 14 23:39:04 2020 +0200 [SYSTEMDS-263] ONNX graph importer (Python API, docs, tests) This PR implements a first poc-implementation for an ONNX importer. It adds support for the following operators: Add, Sub, MatMul, Neg, Xor, Or, And, Relu, Tanh, Sigmoid, Softmax, Dropout, MaxPool, Conv, If; as well as the logic for nested sub-graphs. AMLS project SS 2020 Closes #904. --- .github/workflows/python.yml | 17 +- .gitignore | 3 + docs/Tasks.txt | 3 +- docs/onnx-systemds-design.md | 46 -- pom.xml| 1 + .../python/docs/source/assets/sample_graph.png | Bin 0 -> 35508 bytes src/main/python/docs/source/index.rst | 8 + src/main/python/docs/source/onnx_systemds.rst | 59 +++ .../python/docs/source/onnx_systemds_design.rst| 217 ++ src/main/python/systemds/__init__.py | 2 +- src/main/python/systemds/onnx_systemds/README.md | 22 + src/main/python/systemds/onnx_systemds/__init__.py | 14 + src/main/python/systemds/onnx_systemds/convert.py | 53 +++ .../python/systemds/onnx_systemds/onnx_helper.py | 218 ++ .../python/systemds/onnx_systemds/operator_gen.py | 465 + src/main/python/systemds/onnx_systemds/render.py | 215 ++ .../templates/graph_function.dml.jinja | 54 +++ .../onnx_systemds/templates/graph_header.dml.jinja | 22 + .../onnx_systemds/templates/main.dml.jinja | 26 ++ .../templates/matrix_initialize.dml.jinja | 24 ++ .../onnx_systemds/templates/model_header.dml.jinja | 36 ++ .../templates/module_import.dml.jinja | 17 + .../operators/2input_1output_operator.dml.jinja| 18 + .../templates/operators/function_call.dml.jinja| 31 ++ 
.../templates/operators/if_operator.dml.jinja | 19 + .../templates/operators/neg.dml.jinja | 18 + .../onnx_systemds/templates/util.dml.jinja | 42 ++ src/main/python/systemds/onnx_systemds/util.py | 40 ++ src/main/python/{systemds => tests}/__init__.py| 5 - .../python/{systemds => tests/onnx}/__init__.py| 4 - .../dml_wrapper/simple_conv_layer_2_wrapper.dml| 27 ++ .../onnx/dml_wrapper/simple_conv_layer_wrapper.dml | 25 ++ .../dml_wrapper/simple_dropout_layer_wrapper.dml | 22 + .../onnx/dml_wrapper/simple_if_graph_wrapper.dml | 27 ++ .../dml_wrapper/simple_mat_add_mul_sub_wrapper.dml | 24 ++ .../onnx/dml_wrapper/simple_mat_add_wrapper.dml| 24 ++ .../dml_wrapper/simple_mat_initialized_wrapper.dml | 21 + .../dml_wrapper/simple_maxpool_layer_wrapper.dml | 22 + .../simple_relu_tanh_sigmoid_softmax_wrapper.dml | 27 ++ .../simple_conv_layer_2_reference.out | 5 + .../simple_conv_layer_reference.out| 25 ++ .../output_reference/simple_if_graph_reference.out | 5 + .../simple_mat_add_mul_sub_reference.out | 4 + .../output_reference/simple_mat_add_reference.out | 4 + .../simple_mat_initialized_reference.out | 9 + .../simple_maxpool_layer_reference.out | 25 ++ .../simple_relu_tanh_sigmoid_softmax_reference.out | 11 + .../tests/onnx/test_models/model_generate.py | 388 + src/main/python/tests/onnx/test_simple.py | 65 +++ src/main/python/tests/onnx/util.py | 84 50 files changed, 2485 insertions(+), 58 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 27c12ec..156d843 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -72,9 +72,12 @@ jobs: key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('src/main/python/setup.py') }} restore-keys: | ${{ runner.os }}-pip-${{ matrix.python-version }}- + +- name: Install protobuf + run: sudo apt-get install protobuf-compiler libprotoc-dev - name: Install pip Dependencies - run: pip install numpy py4j wheel scipy sklearn + run: pip install numpy py4j wheel scipy 
sklearn jinja2 onnx - name: Build Python Package run: | @@ -97,3 +100,15 @@ jobs: cd src/main/python python -m unittest tests/lineage/*.py echo "Exit Status: " $? + +- name: Run onnx-systemds python tests + run: | +export SYSTEMDS_ROOT=$(pwd) +export
[systemml] branch master updated: [MINOR] Avoid unnecessary overhead in createvar instructions
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 30d5c40 [MINOR] Avoid unnecessary overhead in createvar instructions 30d5c40 is described below commit 30d5c408b900b1aa4c8ddeb2f1264b830f460a05 Author: Matthias Boehm AuthorDate: Thu May 14 18:20:17 2020 +0200 [MINOR] Avoid unnecessary overhead in createvar instructions This patch makes a minor performance improvement to the createvar instruction execution (which happens for every non-scalar operator). In detail, the need for creating unique file names (from one instruction), led to unnecessary string concatenation and thus object allocation. We now reuse the existing thread-local string builders as used for instruction generation. On a special-case scenario with ~1M loop iterations over tiny data (100 values), this patch improved the createvar overhead from 22.1s to 5.6s (and overall from 49s to 33s). --- .../sysds/runtime/instructions/InstructionUtils.java | 8 .../runtime/instructions/cp/VariableCPInstruction.java | 16 ++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java index a47d6de..f1c8dc6 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java @@ -1008,4 +1008,12 @@ public class InstructionUtils sb.append(inputs[inputs.length-1]); return sb.toString(); } + + public static String concatStrings(String... 
inputs) { + StringBuilder sb = _strBuilders.get(); + sb.setLength(0); //reuse allocated space + for( int i=0; i obj = new TensorObject(getInput1().getValueType(), fname); //clone meta data because it is updated on copy-on-write, otherwise there //is potential for hidden side effects between variables. @@ -560,6 +558,8 @@ public class VariableCPInstruction extends CPInstruction implements LineageTrace } else if( getInput1().getDataType() == DataType.FRAME ) { String fname = getInput2().getName(); + if( Boolean.parseBoolean(getInput3().getName()) ) + fname = getUniqueFileName(fname); FrameObject fobj = new FrameObject(fname); fobj.setMetaData((MetaData)metadata.clone()); fobj.setFileFormatProperties(_formatProperties); @@ -1257,4 +1257,8 @@ public class VariableCPInstruction extends CPInstruction implements LineageTrace || opcode == VariableOperationCode.CastAsDoubleVariable || opcode == VariableOperationCode.CastAsBooleanVariable; } + + public static String getUniqueFileName(String fname) { + return InstructionUtils.concatStrings(fname, "_", String.valueOf(_uniqueVarID.getNextID())); + } }
[systemml] 02/02: [SYSTEMDS] Fix and cleanup steplm feature selection built-in
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git commit 996f61281c45a428986a89bc14c982ad41af0382 Author: Matthias Boehm AuthorDate: Tue May 12 22:35:13 2020 +0200 [SYSTEMDS] Fix and cleanup steplm feature selection built-in This patch makes several improvements to the existing steplm built-in function (correctness and performance): 1) So far, the lm parameters were not correctly passed through to the actual lm call, which for example rendered tol, reg, and icpt parameters ineffective (except for icpt=1 which was the only one tested). 2) Cleanup of unnecessarily operations and control flow 3) Converted the main two for loops of greedy model building to parfor loops (which required a slightly different analysis of the best model). On a scenario of a dense 10K x 1K input matrix (with convergence after 20 iterations -> ~21000 lm training calls), this patch improved performance from 103.9s to 14.4s due to much better utilization (with fewer barriers) of the available 24 virtual cores. --- scripts/builtin/steplm.dml | 254 - 1 file changed, 113 insertions(+), 141 deletions(-) diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml index 608a477..fd0018d 100644 --- a/scripts/builtin/steplm.dml +++ b/scripts/builtin/steplm.dml @@ -60,170 +60,138 @@ # STDEV_TOT_Y Standard Deviation of the response value Y # AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. 
residual bias -m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE) -return(Matrix[Double] C, Matrix[Double] S) { - - # currently only the forward selection strategy in supported: start - # from one feature and iteratively add features until AIC improves - dir = "forward"; +m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, + Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE) + return(Matrix[Double] B, Matrix[Double] S) +{ + if( icpt!=0 & icpt!=1 & icpt!=2 ) +stop("Invalid steplm invocation with icpt="+icpt+" (valid values: 0,1,2)."); + + # NOTE: currently only the forward selection strategy in supported: + # start from one feature and iteratively add features until AIC improves thr = 0.001; print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT"); - print("Reading X and Y..."); X_orig = X; n = nrow(X_orig); m_orig = ncol(X_orig); # BEGIN STEPWISE LINEAR REGRESSION - if (dir == "forward") { + columns_fixed = matrix(0, 1, m_orig); + columns_fixed_ordered = matrix(0, 1, 1); + + # X_global stores the best model found at each step + X_global = matrix(0, n, 1); + + if (icpt == 1 | icpt == 2) { +beta = mean(y); +AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n); + } else { +beta = 0.0; +AIC_best_orig = n * log(sum(y ^ 2) / n); + } + print("Best AIC without any features: " + AIC_best_orig); + boa_ncol = ncol(X_orig) + as.integer(icpt!=0); + beta_out_all = matrix(0, boa_ncol, m_orig); + + # First pass to examine single features + AICs = matrix(0, 1, m_orig); + parfor (i in 1:m_orig, check = 0) { +[AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose); +AICs[1, i] = AIC_1; +beta_out_all[1:nrow(beta_out_i), i] = beta_out_i; + } + AIC_best = min(min(AICs), AIC_best_orig); + AIC_check = checkAIC(AIC_best, AIC_best_orig, thr); + column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 0); + + # 
beta best so far + beta_best = beta_out_all[, column_best]; + if (column_best == 0) { +print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); +B = matrix(0, m_orig, 1); +if (icpt != 0) + B = rbind(B, as.matrix(beta)); +S = matrix(0, 1, 1); + } + else { + +print("Best AIC " + AIC_best + " achieved with feature: " + column_best); + +columns_fixed[1, column_best] = 1; +columns_fixed_ordered[1, 1] = column_best; +X_global = X_orig[, column_best]; + continue = TRUE -columns_fixed = matrix(0, 1, m_orig); -columns_fixed_ordered = matrix(0, 1, 1); - - # X_global stores the best model found at each step -X_global = matrix(0, n, 1); - - if (icpt == 1 | icpt == 2) { - beta = mean(y); - AIC_best = 2 + n * log(sum((beta - y) ^ 2) / n); -} else { - beta = 0.0; - AIC_best = n * log(sum(y ^ 2) / n); -} -AICs = matrix(AIC_best, 1, m_orig); -print("Best AIC without any features: " + AIC_best);
[systemml] branch master updated (5e726cf -> 996f612)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git. from 5e726cf [SYSTEMDS-55] Fix file format handling, docs, and github test config new 96a719b [SYSTEMDS-238] Fix lineage merge on parfor w/ conditional control flow new 996f612 [SYSTEMDS] Fix and cleanup steplm feature selection built-in The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: scripts/builtin/steplm.dml | 254 + .../runtime/controlprogram/ParForProgramBlock.java | 3 +- 2 files changed, 115 insertions(+), 142 deletions(-)
[systemml] 01/02: [SYSTEMDS-238] Fix lineage merge on parfor w/ conditional control flow
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git commit 96a719bc384f0c60dc1994be49d72d91d2031dea Author: Matthias Boehm AuthorDate: Tue May 12 22:26:14 2020 +0200 [SYSTEMDS-238] Fix lineage merge on parfor w/ conditional control flow This patch makes a minor robustness fix to the parfor lineage merge for the case that certain workers did not make any updates of result variables due to conditional control flow in the parfor body. --- .../org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java index 1c98c34..812cf2c 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java @@ -1351,7 +1351,8 @@ public class ParForProgramBlock extends ForProgramBlock LineageItem current = lineages[0].get(var._name); for( int i=1; i
[systemml] branch master updated: [SYSTEMDS-253] Distributed slice finding (task/data parallel, fixes)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 8cbc85a [SYSTEMDS-253] Distributed slice finding (task/data parallel, fixes) 8cbc85a is described below commit 8cbc85a949b3699cde8ed3cf3e3abec6a27fbc60 Author: gilgenbergg AuthorDate: Sun May 3 17:17:57 2020 +0200 [SYSTEMDS-253] Distributed slice finding (task/data parallel, fixes) Closes #881. --- docs/Tasks.txt | 3 +- scripts/staging/hmm/HMM.py | 2 - scripts/staging/slicing/__init__.py| 0 scripts/staging/slicing/base/Bucket.py | 168 + .../staging/slicing/base/{node.py => SparkNode.py} | 79 ++ scripts/staging/slicing/base/__init__.py | 0 scripts/staging/slicing/base/node.py | 28 +++- scripts/staging/slicing/base/slicer.py | 135 + .../base/tests/classification/test_adult.py| 101 - .../slicing/base/tests/classification/test_iris.py | 88 --- .../base/tests/regression/test_insurance.py| 81 -- .../slicing/base/tests/regression/test_salary.py | 87 --- scripts/staging/slicing/base/top_k.py | 7 +- scripts/staging/slicing/base/union_slicer.py | 78 -- .../slicing/spark_modules/join_data_parallel.py| 120 +++ .../staging/slicing/spark_modules/spark_slicer.py | 100 .../slicing/spark_modules/spark_union_slicer.py| 70 + .../staging/slicing/spark_modules/spark_utils.py | 141 + .../slicing/spark_modules/union_data_parallel.py | 119 +++ scripts/staging/slicing/tests/__init__.py | 0 .../slicing/tests/classification/__init__.py | 0 .../slicing/tests/classification/sparked_adults.py | 118 +++ .../slicing/tests/classification/test_adult.py | 121 +++ .../slicing/tests/classification/test_iris.py | 109 + .../staging/slicing/tests/regression/__init__.py | 0 .../slicing/tests/regression/bd_spark_salary.py| 131 .../slicing/tests/regression/spark_salary.py | 123 +++ .../slicing/tests/regression/test_insurance.py | 103 + 
.../slicing/tests/regression/test_salary.py| 104 + 29 files changed, 1717 insertions(+), 499 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 9d2dba4..9fa9a6f 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -203,7 +203,8 @@ SYSTEMDS-240 GPU Backend Improvements SYSTEMDS-250 Extended Slice Finding * 251 Alternative slice enumeration approach OK - * 252 Initial data slicing implementation Python + * 252 Initial data slicing implementation Python OK + * 253 Distributed slicing algorithms (task/data parallel)OK SYSTEMDS-260 Misc Tools * 261 Stable marriage algorithm OK diff --git a/scripts/staging/hmm/HMM.py b/scripts/staging/hmm/HMM.py index 61fa0d0..d9eb187 100644 --- a/scripts/staging/hmm/HMM.py +++ b/scripts/staging/hmm/HMM.py @@ -19,8 +19,6 @@ # #- -#Author: Afan Secic - from bs4 import BeautifulSoup,SoupStrainer import nltk from nltk.tokenize import sent_tokenize, word_tokenize diff --git a/scripts/staging/slicing/__init__.py b/scripts/staging/slicing/__init__.py new file mode 100644 index 000..e69de29 diff --git a/scripts/staging/slicing/base/Bucket.py b/scripts/staging/slicing/base/Bucket.py new file mode 100644 index 000..0277f6d --- /dev/null +++ b/scripts/staging/slicing/base/Bucket.py @@ -0,0 +1,168 @@ +#- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#- + +class Bucket: + +key: [] +attributes: [] +name: "" +error: float +
[systemml] branch master updated: [SYSTEMDS-335] Weighted eviction policy in lineage cache
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 6e811d7 [SYSTEMDS-335] Weighted eviction policy in lineage cache 6e811d7 is described below commit 6e811d75facf0a6cbff0ee9ff93c15beedc1302f Author: arnabp AuthorDate: Sun May 3 16:26:05 2020 +0200 [SYSTEMDS-335] Weighted eviction policy in lineage cache This patch contains a new eviction policy for lineage cache. A min-heap based priority queue over a function of computation time and size is maintained to define the order of evictions.The idea is to evict large matrices, which take little time to recompute. This weighted scheme significantly reduces the number of evictions (including disk spilling). This patch also refactors the LineageCache class to hide the eviction policy related maintenance. Closes #905. --- docs/Tasks.txt | 2 +- .../instructions/cp/FunctionCallCPInstruction.java | 2 + .../apache/sysds/runtime/lineage/LineageCache.java | 430 ++--- .../sysds/runtime/lineage/LineageCacheConfig.java | 20 + .../sysds/runtime/lineage/LineageCacheEntry.java | 112 ++ .../runtime/lineage/LineageCacheEviction.java | 371 ++ 6 files changed, 544 insertions(+), 393 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 42b2b3e..9d2dba4 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -262,7 +262,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration * 332 Parfor integration with multi-level reuse OK * 333 Improve cache eviction with actual compute timeOK * 334 Cache scalars only with atleast one matrix inputs - * 335 Weighted eviction policy (function of size & computetime) + * 335 Weighted eviction policy (function of size & computetime) OK * 336 Better use of cache status to handle multithreading * 337 Adjust disk I/O speed by recording actual time taken OK * 338 Extended lineage tracing (rmEmpty, lists), partial 
rewritesOK diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java index def4859..3c4e1a9 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java @@ -231,6 +231,8 @@ public class FunctionCallCPInstruction extends CPInstruction { if( DMLScript.LINEAGE && LineageCacheConfig.isMultiLevelReuse() ) { LineageCache.putValue(fpb.getOutputParams(), liInputs, getCacheFunctionName(_functionName, fpb), ec, t1-t0); + //FIXME: send _boundOutputNames instead of fpb.getOutputParams as + //those are already replaced by boundoutput names in the lineage map. } } diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java index a42a376..32e5585 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java @@ -38,15 +38,12 @@ import org.apache.sysds.runtime.instructions.cp.Data; import org.apache.sysds.runtime.instructions.cp.MMTSJCPInstruction; import org.apache.sysds.runtime.instructions.cp.ParameterizedBuiltinCPInstruction; import org.apache.sysds.runtime.instructions.cp.ScalarObject; -import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus; import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType; import org.apache.sysds.runtime.matrix.data.InputInfo; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.data.OutputInfo; import org.apache.sysds.runtime.meta.MetaDataFormat; -import org.apache.sysds.runtime.util.LocalFileUtils; -import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -55,20 +52,13 @@ import java.util.Map; 
public class LineageCache { - private static final Map _cache = new HashMap<>(); - private static final Map _spillList = new HashMap<>(); - private static final HashSet _removelist = new HashSet<>(); + private static final Map _cache = new HashMap<>(); private static final double CACHE_FRAC = 0.05; // 5% of JVM heap size - private static final long CACHE_LIMIT; //limit in bytes - private static final boolean DEBUG = false; - private static String _outdir = null; - private static long _cachesize
[systemml] branch master updated: [MINOR] Fix unnecessary date handling in MLContext API
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 1c40be6 [MINOR] Fix unnecessary date handling in MLContext API 1c40be6 is described below commit 1c40be6e31975ddc8e734b6613fdb32997ba0439 Author: bd2019us AuthorDate: Sun May 3 16:15:33 2020 +0200 [MINOR] Fix unnecessary date handling in MLContext API Closes #862. --- src/main/java/org/apache/sysds/api/mlcontext/MLContext.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java b/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java index bfcc491..9d0b55b 100644 --- a/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java +++ b/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java @@ -19,7 +19,6 @@ package org.apache.sysds.api.mlcontext; -import java.util.Date; import java.util.Set; import org.apache.log4j.Logger; @@ -330,9 +329,8 @@ public class MLContext implements ConfigurableAPI try { executionScript = script; - Long time = new Long((new Date()).getTime()); if ((script.getName() == null) || (script.getName().equals(""))) { - script.setName(time.toString()); + script.setName(String.valueOf(System.currentTimeMillis())); } MLResults results = scriptExecutor.execute(script);
[systemml] branch master updated: [MINOR] Various improvements of data cleaning built-in primitives
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 8fbcd75 [MINOR] Various improvements of data cleaning built-in primitives 8fbcd75 is described below commit 8fbcd758674a07fa0a0f41be2ecea110b53691cc Author: Shafaq Siddiqi AuthorDate: Sun May 3 14:50:43 2020 +0200 [MINOR] Various improvements of data cleaning built-in primitives Closes #901. --- scripts/builtin/mice.dml | 27 --- scripts/builtin/multiLogReg.dml| 3 +- scripts/builtin/outlierByIQR.dml | 20 +++-- scripts/builtin/outlierBySd.dml| 20 +++-- scripts/builtin/winsorize.dml | 7 ++ .../test/functions/builtin/BuiltinMiceTest.java| 91 +++--- .../functions/builtin/BuiltinOutlierByIQRTest.java | 9 +++ .../functions/builtin/BuiltinOutlierBySDTest.java | 21 +++-- .../functions/builtin/BuiltinWinsorizeTest.java| 4 +- src/test/scripts/functions/builtin/mice.R | 85 +--- src/test/scripts/functions/builtin/mice.dml| 28 +-- .../scripts/functions/builtin/outlier_by_IQR.dml | 2 +- .../scripts/functions/builtin/outlier_by_sd.dml| 2 +- src/test/scripts/functions/builtin/winsorize.R | 4 +- .../scripts/functions/caching/BufferpoolLeak.dml | 2 +- 15 files changed, 220 insertions(+), 105 deletions(-) diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml index 99d2be2..b00d542 100644 --- a/scripts/builtin/mice.dml +++ b/scripts/builtin/mice.dml @@ -26,6 +26,7 @@ # NAMETYPEDEFAULT MEANING # - # F String---Data Frame +# cMask Double---A 0/1 row vector for identifying numeric (0) adn categorical features (1) # iterInteger3 Number of iteration for multiple imputations # completeInteger3 A complete dataset generated though a specific iteration # - @@ -40,17 +41,21 @@ # Assumption missing value are represented with empty string i.e ",," in csv file # variables with suffix n are storing continous/numeric data and variables with suffix 
c are storing categorical data -s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, Integer complete = 3) +s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, Integer complete = 3, Boolean verbose = FALSE) return(Frame[String] dataset, Frame[String] singleSet) { if(ncol(F) == 1) stop("invalid aregument: can not apply mice on single column") + + if(complete > iter) +complete = iter - # adding a temporary categorical feature (in-case all attributes are continous) + + # adding a temporary feature (in-case all attributes are of same type) F = cbind(F, as.frame(matrix(1,nrow(F), 1))) cMask = cbind(cMask, matrix(1,1,1)) - + n = nrow(F) row = n*complete; col = ncol(F) @@ -58,6 +63,10 @@ return(Frame[String] dataset, Frame[String] singleSet) Mask_Result = matrix(0, rows=1, cols=col) scat = seq(1, ncol(cMask)) cat = removeEmpty(target=scat, margin="rows", select=t(cMask)) + + if(nrow(cat) == ncol(F)) +cMask[1,ncol(cMask)] = 0 + s="" for(i in 1: nrow(cat), check =0) s = s+as.integer(as.scalar(cat[i, 1]))+","; @@ -168,7 +177,7 @@ return(Frame[String] dataset, Frame[String] singleSet) in_n = in_n + 1; } - if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0)) + if( (as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0) ) { j = (i + as.scalar(dist[1,in_c])) - 1 @@ -199,8 +208,8 @@ return(Frame[String] dataset, Frame[String] singleSet) Mask_Filled_c[,in_c] = table(R, 1, pred, n, 1); i = as.integer(j) } - - in_c = in_c + 1 + if(in_c < col) +in_c = in_c + 1 i = i+1; } @@ -214,7 +223,7 @@ return(Frame[String] dataset, Frame[String] singleSet) Result = Result[2: n*iter+1, ] Mask_Result = Mask_Result[2: n*iter+1, ] index = (((complete*n)-n)+1) - #voting for aggregation of categorical imputations + # voting for aggregation of categorical imputations agg = cAggregate(Mask_Result*cMask, iter, n) # aggregating the results @@ -229,11 +238,11 @@ return(Frame[String] dataset, Frame[String] singleSet) dataset = XO + Agg_Matrix singleSet 
= Result[index:row, ] - # # decoding nominal columns + # decoding nominal columns dataset = transformdecode(target=dataset, spec=jspecR, meta=M); s
[systemml] branch master updated: [MINOR] Fix failing component tests (due to excessive log output)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 97cf552 [MINOR] Fix failing component tests (due to excessive log output) 97cf552 is described below commit 97cf5523cf2443e9a462b4fa1321735dc8c60285 Author: Matthias Boehm AuthorDate: Sat May 2 00:53:16 2020 +0200 [MINOR] Fix failing component tests (due to excessive log output) Closes #906. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4fad885..4c25f07 100644 --- a/pom.xml +++ b/pom.xml @@ -262,7 +262,7 @@ 1C -Xms4g -Xmx4g false - plain + brief true
[systemml] branch master updated: [SYSTEMDS-391] New built-in GLM function (Generalized Linear Model)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new c7ce2ff [SYSTEMDS-391] New built-in GLM function (Generalized Linear Model) c7ce2ff is described below commit c7ce2ff95f306284363faf64321b1d2f36bbbeb4 Author: Shafaq Siddiqi AuthorDate: Thu Apr 30 22:30:11 2020 +0200 [SYSTEMDS-391] New built-in GLM function (Generalized Linear Model) Closes #888. --- docs/Tasks.txt |3 + scripts/algorithms/StepGLM.dml | 16 +- scripts/builtin/glm.dml| 1118 .../java/org/apache/sysds/common/Builtins.java |1 + src/test/java/org/apache/sysds/test/TestUtils.java | 383 +++ .../apache/sysds/test/applications/GLMTest.java| 417 +--- .../test/functions/builtin/BuiltinGLMTest.java | 269 + src/test/scripts/functions/builtin/glmTest.R | 139 +++ src/test/scripts/functions/builtin/glmTest.dml | 25 + 9 files changed, 1955 insertions(+), 416 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 35d07e6..86ab8fe 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -290,5 +290,8 @@ SYSTEMDS-370 Lossy Compression Blocks SYSTEMDS-380 Memory Footprint * 371 Matrix Block Memory footprint update +SYSTEMDS-390 New Builtin Functions IV + * 391 New GLM builtin-in function (from algorithms) OK + Others: * Break append instruction to cbind and rbind diff --git a/scripts/algorithms/StepGLM.dml b/scripts/algorithms/StepGLM.dml index 2ce5a1b..213f373 100644 --- a/scripts/algorithms/StepGLM.dml +++ b/scripts/algorithms/StepGLM.dml @@ -127,18 +127,18 @@ if (dir == "forward") { if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, " "); + [AIC_best] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered, " "); } else { # compute AIC of an empty model with only intercept (all Ys 
are constant) all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, " "); + [AIC_best] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered, " "); } print ("Best AIC without any features: " + AIC_best); # First pass to examine single features AICs = matrix (AIC_best, rows = 1, cols = num_features); parfor (i in 1:num_features) { - [AIC_1] = glm (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " "); + [AIC_1] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " "); AICs[1,i] = AIC_1; } @@ -156,11 +156,11 @@ if (dir == "forward") { print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); if (intercept_status == 0) { # Compute AIC of an empty model with no features and no intercept (all Ys are zero) - [AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, fileB); + [AIC_best] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered, fileB); } else { # compute AIC of an empty model with only intercept (all Ys are constant) ###all_ones = matrix (1, rows = num_records, cols = 1); - [AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB); + [AIC_best] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB); } }; @@ -177,7 +177,7 @@ if (dir == "forward") { # Construct the feature matrix X = cbind (X_global, X_orig[,i]); - [AIC_2] = glm (X, Y, intercept_status, num_features, columns_fixed_ordered, " "); + [AIC_2] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered, " "); AICs[1,i] = AIC_2; } } @@ -209,7 +209,7 @@ if (dir == "forward") { # run GLM with selected set of features print ("Running GLM with selected features..."); - [AIC] = glm (X_global, Y, intercept_status, num_
[systemml] branch master updated: [SYSTEMDS-338] Extended lineage tracing and partial reuse
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new dc01c4d [SYSTEMDS-338] Extended lineage tracing and partial reuse dc01c4d is described below commit dc01c4db8b46bc413ceabf4998e0b8d44969db73 Author: arnabp AuthorDate: Thu Apr 30 22:01:51 2020 +0200 [SYSTEMDS-338] Extended lineage tracing and partial reuse This patch contains - two new partial rewrites which are specializations of existing rewrites, - bug fixes and optimizations in partial rewrites, - lineage tracing for removeEmpty, - a new test class to test algorithms and builtins with reuse, - extension of lineage tracing of list objects. Note that lineage doesn't work with most of the list handling methods today. Due to that the current generalized grid search builtin is far from working with lineage framework. Closes #897. --- docs/Tasks.txt | 1 + .../instructions/cp/DataGenCPInstruction.java | 4 + .../instructions/cp/ListIndexingCPInstruction.java | 7 + .../cp/ParameterizedBuiltinCPInstruction.java | 8 + .../apache/sysds/runtime/lineage/LineageCache.java | 4 +- .../sysds/runtime/lineage/LineageCacheConfig.java | 9 +- .../apache/sysds/runtime/lineage/LineageMap.java | 3 +- .../sysds/runtime/lineage/LineageRewriteReuse.java | 244 + .../functions/lineage/FunctionFullReuseTest.java | 5 - ...tionFullReuseTest.java => LineageReuseAlg.java} | 60 ++--- .../test/functions/lineage/LineageRewriteTest.java | 10 +- ...FunctionFullReuse5.dml => LineageReuseAlg1.dml} | 2 - .../scripts/functions/lineage/LineageReuseAlg2.dml | 60 + .../{FunctionFullReuse5.dml => RewriteTest13.dml} | 24 +- 14 files changed, 333 insertions(+), 108 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 97fa914..35d07e6 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -264,6 +264,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration * 335 
Weighted eviction policy (function of size & computetime) * 336 Better use of cache status to handle multithreading * 337 Adjust disk I/O speed by recording actual time taken OK + * 338 Extended lineage tracing (rmEmpty, lists), partial rewritesOK SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse * 341 Finalize unmarking of loop dependent operations diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java index c29c539..7a37608 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java @@ -164,6 +164,10 @@ public class DataGenCPInstruction extends UnaryCPInstruction { public long getSeed() { return seed; } + + public boolean isOnesCol() { + return minValue == maxValue && minValue == 1 && sparsity == 1 && getCols() == 1; + } public static DataGenCPInstruction parseInstruction(String str) { diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java index f15ad65..523eceb 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java @@ -25,6 +25,8 @@ import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.controlprogram.caching.CacheableData; import org.apache.sysds.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysds.runtime.lineage.LineageItem; +import org.apache.sysds.runtime.lineage.LineageItemUtils; public final class ListIndexingCPInstruction extends IndexingCPInstruction { @@ -93,4 +95,9 @@ public final class ListIndexingCPInstruction extends IndexingCPInstruction { 
else throw new DMLRuntimeException("Invalid opcode (" + opcode +") encountered in ListIndexingCPInstruction."); } + @Override + public LineageItem[] getLineageItems(ExecutionContext ec) { + return new LineageItem[]{new LineageItem(output.getName(), getOpcode(), + LineageItemUtils.getLineage(ec, input1,input2,input3,rowLower,rowUpper))}; + } } diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/Parameteriz
[systemml] branch master updated: [SYSTEMDS-316] Fix Python lm/rand tests (tolerance, workflow)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new a7f17b3 [SYSTEMDS-316] Fix Python lm/rand tests (tolerance, workflow) a7f17b3 is described below commit a7f17b3d17176ea8339cb5b5bcdd3c5854763761 Author: Julia Le AuthorDate: Wed Apr 29 23:48:33 2020 +0200 [SYSTEMDS-316] Fix Python lm/rand tests (tolerance, workflow) Just a few changes to the lm test case (increasing tolerance) so that the tc doesn't fail randomly. Remove multiple definitions of run in python workflow file. AMLS project SS 2020, part 2. Closes #902. --- .github/workflows/python.yml | 4 +--- src/main/python/tests/test_lm.py | 4 ++-- src/main/python/tests/test_matrix_rand.py | 30 +- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 31af9d5..27c12ec 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -74,9 +74,7 @@ jobs: ${{ runner.os }}-pip-${{ matrix.python-version }}- - name: Install pip Dependencies - run: pip install numpy py4j wheel - run: pip install scipy - run: pip install sklearn + run: pip install numpy py4j wheel scipy sklearn - name: Build Python Package run: | diff --git a/src/main/python/tests/test_lm.py b/src/main/python/tests/test_lm.py index 24abd5c..1bd9ad7 100644 --- a/src/main/python/tests/test_lm.py +++ b/src/main/python/tests/test_lm.py @@ -38,7 +38,7 @@ sds = SystemDSContext() regressor = LinearRegression(fit_intercept=False) shape = (random.randrange(1, 30), random.randrange(1, 30)) -eps = 1e-05 +eps = 1e-03 class TestLm(unittest.TestCase): def setUp(self): @@ -60,8 +60,8 @@ class TestLm(unittest.TestCase): model.coef_ = model.coef_.reshape(sds_model_weights.shape) self.assertTrue(np.allclose(sds_model_weights, model.coef_, eps)) except Exception as e: -self.assertTrue(False, 
"This should not raise an exception!") print(e) +self.assertTrue(False, "This should not raise an exception!") def test_lm_invalid_shape(self): X = np.random.rand(shape[0], 0) diff --git a/src/main/python/tests/test_matrix_rand.py b/src/main/python/tests/test_matrix_rand.py index d267bca..b1f964b 100644 --- a/src/main/python/tests/test_matrix_rand.py +++ b/src/main/python/tests/test_matrix_rand.py @@ -27,14 +27,16 @@ import unittest import numpy as np import scipy.stats as st import random +import math path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../") sys.path.insert(0, path) from systemds.context import SystemDSContext -shape = (random.randrange(1, 50), random.randrange(1, 50)) +shape = (random.randrange(1, 25), random.randrange(1, 25)) +dist_shape = (10, 15) min_max = (0, 1) -sparsity = 0.2 +sparsity = random.uniform(0.0, 1.0) seed = 123 distributions = ["norm", "uniform"] @@ -58,37 +60,31 @@ class TestRand(unittest.TestCase): self.assertTrue((m.min() >= min_max[0]) and (m.max() <= min_max[1])) def test_rand_sparsity(self): -m = sds.rand(rows=shape[0], cols=shape[1], sparsity=sparsity, seed=seed).compute() -count, bins = np.histogram(m.flatten("F")) -non_zero_value_percent = sum(count[1:]) * 100 / sum(count) -e = 0.05 +m = sds.rand(rows=shape[0], cols=shape[1], sparsity=sparsity, seed=0).compute() +non_zero_value_percent = np.count_nonzero(m) * 100 /np.prod(m.shape) -self.assertTrue( -sum(count) == (shape[0] * shape[1]) -and (non_zero_value_percent >= (sparsity - e) * 100) -and (non_zero_value_percent <= (sparsity + e) * 100) -) +self.assertTrue(math.isclose(non_zero_value_percent, sparsity*100, rel_tol=5)) def test_rand_uniform_distribution(self): m = sds.rand( -rows=shape[0], -cols=shape[1], +rows=dist_shape[0], +cols=dist_shape[1], pdf="uniform", min=min_max[0], max=min_max[1], -seed=seed).compute() +seed=0).compute() dist = find_best_fit_distribution(m.flatten("F"), distributions) self.assertTrue(dist == "uniform") def 
test_rand_normal_distribution(self): m = sds.rand( -rows=shape[0], -cols=shape[1], +rows=dist_shape[0], +cols=dist_shape[1], pdf="normal", min=min_max[0], max=min_max[1], -seed=seed).compute() +seed=0).compute() dist = find_best_fit_distribution(m.flatten("F"), distributions) self.assertTrue(dist == "norm")
[systemml] branch master updated: [MINOR] Fix readme badges with links to master branch
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 9377496 [MINOR] Fix readme badges with links to master branch 9377496 is described below commit 937749621bf4d79238f52b0baae4d3308a9318fd Author: Sebastian AuthorDate: Wed Apr 29 23:24:10 2020 +0200 [MINOR] Fix readme badges with links to master branch - Fix badges to only reflect status on push to master branch - Make badges link to the tests conducted. Closes #903. --- README.md | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 49a7eb2..117b02a 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,10 @@ programs over matrices, while replacing the underlying data model and compiler, supported functionalities. Until the first release, you can build your own snapshot via Apache Maven: `mvn clean package -P distribution`. 
-![Build](https://github.com/apache/systemml/workflows/Build/badge.svg) -![Documentation](https://github.com/apache/systemml/workflows/Documentation/badge.svg) -![Component Test](https://github.com/apache/systemml/workflows/Component%20Test/badge.svg) -![Application Test](https://github.com/apache/systemml/workflows/Application%20Test/badge.svg) -![Function Test](https://github.com/apache/systemml/workflows/Function%20Test/badge.svg) -![Python Test](https://github.com/apache/systemml/workflows/Python%20Test/badge.svg) -![Federated Python Test](https://github.com/apache/systemml/workflows/Federated%20Python%20Test/badge.svg) +[![Build](https://github.com/apache/systemml/workflows/Build/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Build%22+branch%3Amaster+event%3Apush) +[![Documentation](https://github.com/apache/systemml/workflows/Documentation/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3ADocumentation+branch%3Amaster+event%3Apush) +[![Component Test](https://github.com/apache/systemml/workflows/Component%20Test/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Component+Test%22+branch%3Amaster+event%3Apush) +[![Application Test](https://github.com/apache/systemml/workflows/Application%20Test/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Application+Test%22+branch%3Amaster+event%3Apush) +[![Function Test](https://github.com/apache/systemml/workflows/Function%20Test/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Function+Test%22+branch%3Amaster+event%3Apush) +[![Python Test](https://github.com/apache/systemml/workflows/Python%20Test/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Python+Test%22+branch%3Amaster+event%3Apush) +[![Federated Python 
Test](https://github.com/apache/systemml/workflows/Federated%20Python%20Test/badge.svg?branch=master=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Federated+Python+Test%22+branch%3Amaster+event%3Apush)
[systemml] branch master updated: [SYSTEMML-2121] AutoEncoder test for codegenalg suite
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new bdf78e4 [SYSTEMML-2121] AutoEncoder test for codegenalg suite bdf78e4 is described below commit bdf78e462506ef8ef7fc9e6b23a6520e4155eca0 Author: Janardhan AuthorDate: Tue Apr 28 00:15:58 2020 +0200 [SYSTEMML-2121] AutoEncoder test for codegenalg suite This patch adds a test case for AutoEncoder with codegen enabled against a corresponding R script. Closes #890. --- .../codegenalg/partone/AlgorithmAutoEncoder.java | 57 - .../functions/codegenalg/Algorithm_AutoEncoder.R | 239 .../functions/codegenalg/Algorithm_AutoEncoder.dml | 251 + 3 files changed, 542 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java index 90d7ff8..fca850c 100644 --- a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java +++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java @@ -20,7 +20,9 @@ package org.apache.sysds.test.functions.codegenalg.partone; import java.io.File; +import java.util.HashMap; +import org.apache.sysds.runtime.matrix.data.MatrixValue; import org.junit.Assert; import org.junit.Test; import org.apache.sysds.api.DMLScript; @@ -37,11 +39,12 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase private final static String TEST_DIR = "functions/codegenalg/"; private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmAutoEncoder.class.getSimpleName() + "/"; - private final static int rows = 2468; + private final static int rows = 1068; private final static int cols = 784; private final static double sparsity1 = 0.7; //dense private final static double sparsity2 = 0.1; 
//sparse + private final static double eps = 1e-5; private final static int H1 = 500; private final static int H2 = 2; @@ -179,22 +182,66 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase TestConfiguration config = getTestConfiguration(TEST_NAME); loadTestConfiguration(config); - fullDMLScriptName = "scripts/staging/autoencoder-2layer.dml"; + fullDMLScriptName = SCRIPT_DIR + TEST_DIR + "/Algorithm_AutoEncoder.dml"; + //"scripts/staging/autoencoder-2layer.dml"; programArgs = new String[]{ "-stats", "-nvargs", "X="+input("X"), - "H1="+H1, "H2="+H2, "EPOCH="+epochs, "BATCH="+batchsize, + "H1="+H1, "H2="+H2, "EPOCH="+epochs, "BATCH="+batchsize, + "W1_rand="+input("W1_rand"),"W2_rand="+input("W2_rand"), + "W3_rand="+input("W3_rand"), "W4_rand="+input("W4_rand"), + "order_rand="+input("order_rand"), "W1_out="+output("W1"), "b1_out="+output("b1"), "W2_out="+output("W2"), "b2_out="+output("b2"), "W3_out="+output("W3"), "b3_out="+output("b3"), "W4_out="+output("W4"), "b4_out="+output("b4")}; + + rCmd = getRCmd(inputDir(), String.valueOf(H1), String.valueOf(H2), + String.valueOf(epochs), String.valueOf(batchsize), expectedDir()); OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites; //generate actual datasets double[][] X = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714); writeInputMatrixWithMTD("X", X, true); - + + //generate rand matrices for W1, W2, W3, W4 here itself for passing onto both DML and R scripts + double[][] W1_rand = getRandomMatrix(H1, cols, 0, 1, sparse?sparsity2:sparsity1, 800); + writeInputMatrixWithMTD("W1_rand", W1_rand, true); +
[systemml] branch master updated: [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 608b9e5 [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication) 608b9e5 is described below commit 608b9e5bfb6c612134fde25249beca10c467160e Author: Julia Le AuthorDate: Sun Apr 26 20:33:27 2020 +0200 [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication) Add rand(), lm and matrix multiplication to Python API Adapt rand testcases and add exception handling to rand function Add testcase for LM, update testcase for rand() and add rand testcase to python.yml Update python.yml and add simple example of lm to the documentation AMLS project SS 2020. Closes #892. --- .github/workflows/python.yml | 4 +- docs/Tasks.txt | 1 + src/main/python/docs/source/matrix.rst | 1 + src/main/python/docs/source/simple_examples.rst| 39 ++ .../python/systemds/context/systemds_context.py| 30 - src/main/python/systemds/matrix/matrix.py | 31 - src/main/python/systemds/matrix/operation_node.py | 19 +++ src/main/python/systemds/utils/consts.py | 2 +- src/main/python/tests/test_lm.py | 79 src/main/python/tests/test_matrix_binary_op.py | 3 + src/main/python/tests/test_matrix_rand.py | 140 + 11 files changed, 345 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 84933ac..31af9d5 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -75,6 +75,8 @@ jobs: - name: Install pip Dependencies run: pip install numpy py4j wheel + run: pip install scipy + run: pip install sklearn - name: Build Python Package run: | @@ -96,4 +98,4 @@ jobs: export SYSDS_QUIET=1 cd src/main/python python -m unittest tests/lineage/*.py -echo "Exit Status: " $? \ No newline at end of file +echo "Exit Status: " $? 
diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 3a7abe7..97fa914 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -249,6 +249,7 @@ SYSTEMDS-310 Python Bindings * 313 Python Documentation upload via Github Actions OK * 314 Python SystemDS context managerOK * 315 Python Federated Matrices TestsOK + * 316 Extended Python API (rand, lm, mm) OK SYSTEMDS-320 Merge SystemDS into Apache SystemML OK * 321 Merge histories of SystemDS and SystemML OK diff --git a/src/main/python/docs/source/matrix.rst b/src/main/python/docs/source/matrix.rst index dd88c7c..e75eff4 100644 --- a/src/main/python/docs/source/matrix.rst +++ b/src/main/python/docs/source/matrix.rst @@ -98,3 +98,4 @@ the recommended way is to use the methods defined on ``SystemDSContext``. .. autofunction:: systemds.matrix.seq +.. autofunction:: systemds.matrix.rand \ No newline at end of file diff --git a/src/main/python/docs/source/simple_examples.rst b/src/main/python/docs/source/simple_examples.rst index 2175fd4..e92cfed 100644 --- a/src/main/python/docs/source/simple_examples.rst +++ b/src/main/python/docs/source/simple_examples.rst @@ -122,3 +122,42 @@ The output should be similar to:: [-0.0011352 ] [-0.01686351] [-0.03839821]] + +SystemDS includes a built-in function lm, which solves linear regression. The lm function takes as input a matrix of +feature vectors and a vector of response values y. The output of the function is a vector of weights. + +.. 
code-block:: python + + # Import numpy and SystemDS matrix + import numpy as np + from systemds.context import SystemDSContext + + # Set a seed + np.random.seed(0) + # Generate matrix of feature vectors + features = np.random.rand(10, 15) + # Generate a 1-column matrix of response values + y = np.random.rand(10, 1) + + # compute the weights + with SystemDSContext() as sds: +weights = sds.matrix(features).lm(sds.matrix(y)).compute() +print(weights) + +The output should be similar to:: + + [[-0.11538199] + [-0.20386541] + [-0.39956035] + [ 1.04078623] + [ 0.4327084 ] + [ 0.18954599] + [ 0.49858968] + [-0.26812763] + [ 0.09961844] + [-0.57000751] + [-0.43386048] + [ 0.55358873] + [-0.54638565] + [ 0.2205885 ] + [ 0.37957689]] diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py index d5bdeb8..01f31a6 100644 --- a/src/main/python/systemds/context/systemds_context.py +++ b/src/main/python/systemds/context/systemds_context.py @@ -30,7 +30,7 @@ import numpy as np from py4j
[systemml] branch master updated: [MINOR] Cleanup codegen algorithm tests (config setup redundancy)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 592c44b [MINOR] Cleanup codegen algorithm tests (config setup redundancy) 592c44b is described below commit 592c44b7325e36b45e51f8510d352c05a0156b42 Author: Matthias Boehm AuthorDate: Sat Apr 25 23:07:46 2020 +0200 [MINOR] Cleanup codegen algorithm tests (config setup redundancy) --- .../org/apache/sysds/test/AutomatedTestBase.java | 24 .../codegenalg/partone/AlgorithmAutoEncoder.java | 68 - .../codegenalg/partone/AlgorithmKMeans.java| 124 +++- .../codegenalg/partone/AlgorithmL2SVM.java | 60 +++- .../codegenalg/partone/AlgorithmLinregCG.java | 123 +++- .../codegenalg/partone/AlgorithmMDABivar.java | 33 + .../codegenalg/partone/AlgorithmMLogreg.java | 156 + .../codegenalg/partone/AlgorithmMSVM.java | 72 -- .../functions/codegenalg/partone/AlgorithmPCA.java | 58 +++- .../codegenalg/parttwo/AlgorithmARIMA.java | 32 + .../codegenalg/parttwo/AlgorithmDatagen.java | 92 +--- .../functions/codegenalg/parttwo/AlgorithmGLM.java | 112 ++- .../codegenalg/parttwo/AlgorithmPNMF.java | 40 ++ .../codegenalg/parttwo/AlgorithmPageRank.java | 51 +++ .../parttwo/AlgorithmStepwiseRegression.java | 60 +++- 15 files changed, 427 insertions(+), 678 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java index 5217d0c..6d1b396 100644 --- a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java +++ b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java @@ -116,6 +116,23 @@ public abstract class AutomatedTestBase { */ private static final File CONFIG_TEMPLATE_FILE = new File(CONFIG_DIR, "SystemDS-config.xml"); + protected enum CodegenTestType { + DEFAULT, FUSE_ALL, FUSE_NO_REDUNDANCY; + + public String getCodgenConfig() { + switch(this) { + case 
DEFAULT: + return "SystemDS-config-codegen.xml"; + case FUSE_ALL: + return "SystemDS-config-codegen-fuse-all.xml"; + case FUSE_NO_REDUNDANCY: + return "SystemDS-config-codegen-fuse-no-redundancy.xml"; + default: + throw new RuntimeException("Unsupported codegen test config: "+this.name()); + } + } + } + /** * Location under which we create local temporary directories for test cases. To adjust where testTemp is located, * use -Dsystemds.testTemp.root.dir=. This is necessary if any parent directories are @@ -289,6 +306,13 @@ public abstract class AutomatedTestBase { return CONFIG_TEMPLATE_FILE; } + protected File getCodegenConfigFile(String parent, CodegenTestType type) { + // Instrumentation in this test's output log to show custom configuration file used for template. + File tmp = new File(parent, type.getCodgenConfig()); + System.out.println("This test case overrides default configuration with " + tmp.getPath()); + return tmp; + } + protected ExecMode setExecMode(ExecType instType) { switch(instType) { case SPARK: return setExecMode(ExecMode.SPARK); diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java index 6fef59a..90d7ff8 100644 --- a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java +++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java @@ -36,15 +36,6 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase private final static String TEST_NAME1 = "Algorithm_AutoEncoder"; private final static String TEST_DIR = "functions/codegenalg/"; private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmAutoEncoder.class.getSimpleName() + "/"; - private final static String TEST_CONF_DEFAULT = "SystemDS-config-codegen.xml"; - private final static File TEST_CONF_FILE_DEFAULT = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_DEFAULT); - private 
final static St
[systemml] branch master updated: [SYSTEMML-2121] PCA test for codegenalg suite
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 955365c [SYSTEMML-2121] PCA test for codegenalg suite 955365c is described below commit 955365c5da1a916541d734a4e9494ab61c932503 Author: Janardhan Pulivarthi AuthorDate: Sat Apr 25 22:15:06 2020 +0200 [SYSTEMML-2121] PCA test for codegenalg suite This patch adds a test case for algorithm test with codegen enabled against an R script. The test matrix is as follows: | Rewrite | Sparse | FuseAll | FuseNoRedundancy | | --- | -- | | | - Spark | 1 |0 | 0|0 | or CP | 1 |1 | 0|0 | | 0 |0 | 0|0 | | 0 |1 | 0|0 | | 0 |0 | 1|0 | | 0 |1 | 1|0 | | 0 |0 | 0|1 | | 0 |1 | 0|1 | Closes #889. --- scripts/algorithms/PCA.dml | 14 +- .../functions/codegenalg/partone/AlgorithmPCA.java | 213 + .../scripts/functions/codegenalg/Algorithm_PCA.R | 87 + 3 files changed, 301 insertions(+), 13 deletions(-) diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml index d165351..ea7afd7 100644 --- a/scripts/algorithms/PCA.dml +++ b/scripts/algorithms/PCA.dml @@ -62,19 +62,7 @@ if (model != "") { D = ncol(A); # perform z-scoring (centering and scaling) - if (center == 1) { - cm = colMeans(A); - A = A - cm; - } - if (scale == 1) { - cvars = (colSums (A^2)); - if (center == 1){ - cm = colMeans(A); - cvars = (cvars - N*(cm^2))/(N-1); - } - Azscored = (A)/sqrt(cvars); -A = Azscored; - } + A = scale(A, center==1, scale==1); # co-variance matrix mu = colSums(A)/N; diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java new file mode 100644 index 000..e0a1906 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.codegenalg.partone; + +import java.io.File; +import java.util.HashMap; + +import org.junit.Test; +import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.lops.LopProperties.ExecType; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Assert; + +public class AlgorithmPCA extends AutomatedTestBase +{ + private final static String TEST_NAME1 = "Algorithm_PCA"; + private final static String TEST_DIR = "functions/codegenalg/"; + private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmPCA.class.getSimpleName() + "/"; + private final static String TEST_CONF_DEFAULT = "SystemDS-config-codegen.xml"; + private final static File TEST_CONF_FILE_DEFAULT = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_DEFAULT); + private final static String TEST_CONF_FUSE_ALL = "SystemDS-config-codegen-fuse-all.xml"; + private final static File TEST_CONF_FILE_FUSE_ALL = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_FUSE_ALL); + private final static String 
TEST_CONF_FUSE_NO_REDUNDANCY = "SystemDS-config-codegen-fuse-no-redundancy.xml";
[systemml] branch master updated: [MINOR] Script-level improvements mice builtin function
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new f450ead [MINOR] Script-level improvements mice builtin function f450ead is described below commit f450ead5506d1615b5979bee85b39891e0f0fc00 Author: Matthias Boehm AuthorDate: Sat Apr 25 19:40:58 2020 +0200 [MINOR] Script-level improvements mice builtin function * Loop vectorization of scalar assignment * Removed unnecessary branch for table padding * Minor modifications of rmEmpty use to increase common subexpression elimination --- scripts/builtin/mice.dml | 44 +++- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml index 3f3c325..99d2be2 100644 --- a/scripts/builtin/mice.dml +++ b/scripts/builtin/mice.dml @@ -56,12 +56,12 @@ return(Frame[String] dataset, Frame[String] singleSet) col = ncol(F) Result = matrix(0, rows=1, cols = col) Mask_Result = matrix(0, rows=1, cols=col) - cat = t(cMask) * seq(1, ncol(cMask)) - cat = removeEmpty(target = cat, margin = "rows") + scat = seq(1, ncol(cMask)) + cat = removeEmpty(target=scat, margin="rows", select=t(cMask)) s="" for(i in 1: nrow(cat), check =0) -s = s+as.integer(as.scalar(cat[i, 1]))+","; - +s = s+as.integer(as.scalar(cat[i, 1]))+","; + # encoding categorical columns using recode transformation jspecR = "{ids:true, recode:["+s+"]}"; @@ -70,7 +70,7 @@ return(Frame[String] dataset, Frame[String] singleSet) XO = replace(target=X, pattern=NaN, replacement=0); # remove categorical features and impute continous features with mean - eX_n = removeEmpty(target=X, margin="cols", select=(1-cMask)) + eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0)) col_n = ncol(eX_n); # storing the mask/address of missing values Mask_n = is.na(eX_n); @@ -80,7 +80,7 @@ return(Frame[String] dataset, Frame[String] singleSet) # 
filling the missing data with their means X2_n = eX_n+(Mask_n*colMeans(eX_n)) # matrices for computing actul data - p_n = table( (seq(1, ncol(eX_n))) , (removeEmpty(target = t(cMask==0)*seq(1, ncol(cMask)), margin ="rows")) , 1 ) + p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", select=t(cMask==0))) if(ncol(p_n) < ncol(cMask)) p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n))) q = XO * cMask @@ -91,8 +91,7 @@ return(Frame[String] dataset, Frame[String] singleSet) eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c != 0)==col_c)) colMod = matrix(0, 1, ncol(eX_c)) # compute columnwise mode - parfor(i in 1: col_c) - { + parfor(i in 1: col_c) { f = eX_c2[, i] # adding one in data for dealing with zero category cat_counts = table(f, 1, n, 1); # counts for each category mode = as.scalar(rowIndexMax(t(cat_counts))); @@ -100,13 +99,10 @@ return(Frame[String] dataset, Frame[String] singleSet) } # find the mask of missing values - tmpMask_c = (eX_c == 0); - tmpMask_c = (tmpMask_c * colMod) # fill missing values with mode + tmpMask_c = (eX_c==0) * colMod # fill missing values with mode # Generate a matrix of actual length - p_c = table((seq(1, ncol(tmpMask_c))) , (removeEmpty(target = t(cMask)*seq(1, ncol(cMask)), margin ="rows")), 1) - if(ncol(p_c) < ncol(cMask)) -p_c = cbind(p_c, matrix(0, nrow(p_c), ncol(cMask)-ncol(p_c))) + p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin ="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask)) Mask_c = tmpMask_c %*% p_c inverseMask_c = Mask_c == 0 @@ -131,14 +127,13 @@ return(Frame[String] dataset, Frame[String] singleSet) dXMask = matrix(0, 1, ncol(dX)) index = 1 for(k in 1:col) { -if(as.scalar(dcDistincts[1,k]) != 0) { - for(l in 1:as.scalar(dcDistincts[1,k])){ -dXMask[1,index] = 1 -index = index +1 - } +nDistk = as.scalar(dcDistincts[1,k]); +if(nDistk != 0) { + dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk) + index += nDistk; } else - index = index +1 
+ index += 1 } #multiple imputations @@ -149,7 +144,6 @@ return(Frame[String] dataset, Frame[String] singleSet) in_n = 1; in_c = 1; i=1; j=1; # varibales for index selection while(i <= ncol(dX)) { - if(as.scalar(dXMask[1,i]) == 0) { # construct column selector @@ -175,7 +169,7 @@ return(Frame[String] dataset, Frame[String] singleSet) } if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0)) - { + { j = (i + as.scalar(dist[1,in_
[systemml] branch master updated: [SYSTEMDS-208] Fix buffer pool leak and cleanup robustness
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new beb4840 [SYSTEMDS-208] Fix buffer pool leak and cleanup robustness beb4840 is described below commit beb4840439ce6ca027470ce0cf3d2c903c1fa40d Author: Matthias Boehm AuthorDate: Sat Apr 25 16:49:45 2020 +0200 [SYSTEMDS-208] Fix buffer pool leak and cleanup robustness This patch fixes a buffer pool eviction leak, where each call to mice added 3 uncleaned objects to the buffer pool and thus eventually ran into severe eviction (up to 'no space left on device'). A closer investigation revealed missing rmVar instructions in complex control flow programs. Specifically, we now reintroduced the notion of exit instructions for while/for/parfor/if and derive and add a packed rmVar instruction if necessary (based on livein and liveout sets). To make the mentioned exit instructions more effective, this patch also introduces a best-effort cleanup of liveout variable sets, which are too conservative for nested control flow. However, this cleanup is only done where it is guaranteed to be safe, i.e., the top-level of statement blocks at the main program and individual functions. Finally, the memory leak was due to createvar instructions overwriting existing objects in the symbol table without proper cleanup. This is a consequence of missing rmvar instructions, but in order to guard against all cases, we now check this condition and perform a proper cleanup which guards against such unknown leaks.
--- docs/Tasks.txt | 1 + .../org/apache/sysds/parser/DMLTranslator.java | 41 +-- .../runtime/controlprogram/BasicProgramBlock.java | 4 +- .../runtime/controlprogram/ForProgramBlock.java| 3 ++ .../runtime/controlprogram/IfProgramBlock.java | 27 +++--- .../runtime/controlprogram/ParForProgramBlock.java | 3 ++ .../sysds/runtime/controlprogram/ProgramBlock.java | 27 +- .../runtime/controlprogram/WhileProgramBlock.java | 3 ++ .../controlprogram/caching/LazyWriteBuffer.java| 7 ++- .../instructions/cp/VariableCPInstruction.java | 4 ++ .../sysds/runtime/util/ProgramConverter.java | 5 ++ src/main/java/org/apache/sysds/utils/Explain.java | 33 +++- .../test/functions/caching/BufferpoolLeakTest.java | 60 ++ .../scripts/functions/caching/BufferpoolLeak.dml | 28 ++ 14 files changed, 204 insertions(+), 42 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index d1e30c0..e63544c 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -166,6 +166,7 @@ SYSTEMDS-200 Various Fixes * 205 Fix scoping of builtin dml-bodied functions (vs user-defined) * 206 Fix codegen outer template compilation (tsmm) OK * 207 Fix builtin function call hoisting from expressionsOK + * 208 Fix bufferpool leak (live var analysis and createvar) OK SYSTEMDS-210 Extended lists Operations * 211 Cbind and Rbind over lists of matrices OK diff --git a/src/main/java/org/apache/sysds/parser/DMLTranslator.java b/src/main/java/org/apache/sysds/parser/DMLTranslator.java index f1f64c1..789ea9d 100644 --- a/src/main/java/org/apache/sysds/parser/DMLTranslator.java +++ b/src/main/java/org/apache/sysds/parser/DMLTranslator.java @@ -25,6 +25,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.logging.Log; @@ -88,6 +89,7 @@ import org.apache.sysds.runtime.controlprogram.Program; import org.apache.sysds.runtime.controlprogram.ProgramBlock; import 
org.apache.sysds.runtime.controlprogram.WhileProgramBlock; import org.apache.sysds.runtime.instructions.Instruction; +import org.apache.sysds.runtime.instructions.cp.VariableCPInstruction; public class DMLTranslator @@ -200,6 +202,8 @@ public class DMLTranslator currentLiveOut = sb.analyze(currentLiveOut); } } + + cleanupLiveOutVariables(dmlp.getStatementBlocks(), new VariableSet()); } public void liveVariableAnalysisFunction(DMLProgram dmlp, FunctionStatementBlock fsb) { @@ -218,15 +222,32 @@ public class DMLTranslator //STEP 2: backward direction VariableSet currentLiveOut = new VariableSet(); VariableSet currentLiveIn = new VariableSet(); + VariableSet unionLiveIn = new VariableSet(); for (DataIdentifier id : fstmt.getInputParams()) currentLiveIn.addVariable(id.getName(), id
[systemml] branch master updated: [SYSTEMDS-361] New privacy constraint meta data (compiler/runtime)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 013ca82 [SYSTEMDS-361] New privacy constraint meta data (compiler/runtime) 013ca82 is described below commit 013ca8224c23b1d9f63e254162a56fb78bf74c96 Author: sebwrede AuthorDate: Fri Apr 24 20:00:13 2020 +0200 [SYSTEMDS-361] New privacy constraint meta data (compiler/runtime) Closes #895. --- docs/Tasks.txt | 7 + .../java/org/apache/sysds/hops/AggBinaryOp.java| 3 +- src/main/java/org/apache/sysds/hops/DataOp.java| 1 + src/main/java/org/apache/sysds/hops/Hop.java | 21 ++- src/main/java/org/apache/sysds/hops/LiteralOp.java | 1 + src/main/java/org/apache/sysds/lops/DataGen.java | 4 +- src/main/java/org/apache/sysds/lops/Lop.java | 18 +++ .../java/org/apache/sysds/lops/compile/Dag.java| 34 +++- .../org/apache/sysds/parser/BinaryExpression.java | 26 ++-- .../org/apache/sysds/parser/DMLTranslator.java | 3 + .../org/apache/sysds/parser/DataExpression.java| 135 +++- .../java/org/apache/sysds/parser/Identifier.java | 15 ++ .../controlprogram/caching/CacheableData.java | 16 +- .../sysds/runtime/instructions/Instruction.java| 12 ++ .../instructions/cp/VariableCPInstruction.java | 2 + .../org/apache/sysds/runtime/io/MatrixReader.java | 4 +- .../sysds/runtime/privacy/PrivacyConstraint.java | 42 + .../sysds/runtime/privacy/PrivacyPropagator.java | 38 + .../org/apache/sysds/runtime/util/HDFSTool.java| 38 - .../org/apache/sysds/test/AutomatedTestBase.java | 43 +- src/test/java/org/apache/sysds/test/TestUtils.java | 129 ++-- .../test/functions/data/misc/WriteMMTest.java | 2 +- .../MatrixMultiplicationPropagationTest.java | 171 + .../MatrixMultiplicationPropagationTest.dml| 27 24 files changed, 591 insertions(+), 201 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 2283d57..d1e30c0 100644 --- a/docs/Tasks.txt +++ 
b/docs/Tasks.txt @@ -260,5 +260,12 @@ SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse SYSTEMDS-350 Data Cleaning Framework * 351 New builtin function for error correction by schemaOK +SYSTEMDS-360 Privacy/Data Exchange Constraints + * 361 Initial privacy meta data (compiler/runtime) OK + * 362 Runtime privacy propagation + * 363 Compile-time privacy propagation + * 364 Error handling violated privacy constraints + * 365 Extended privacy/data exchange constraints + Others: * Break append instruction to cbind and rbind diff --git a/src/main/java/org/apache/sysds/hops/AggBinaryOp.java b/src/main/java/org/apache/sysds/hops/AggBinaryOp.java index b456cc8..a04d267 100644 --- a/src/main/java/org/apache/sysds/hops/AggBinaryOp.java +++ b/src/main/java/org/apache/sysds/hops/AggBinaryOp.java @@ -627,7 +627,8 @@ public class AggBinaryOp extends MultiThreadedHop setOutputDimensions(matmultCP); } - setLineNumbers( matmultCP ); + setLineNumbers(matmultCP); + setPrivacy(matmultCP); setLops(matmultCP); } diff --git a/src/main/java/org/apache/sysds/hops/DataOp.java b/src/main/java/org/apache/sysds/hops/DataOp.java index 7a22727..99cf91e 100644 --- a/src/main/java/org/apache/sysds/hops/DataOp.java +++ b/src/main/java/org/apache/sysds/hops/DataOp.java @@ -311,6 +311,7 @@ public class DataOp extends Hop } setLineNumbers(l); + setPrivacy(l); setLops(l); //add reblock/checkpoint lops if necessary diff --git a/src/main/java/org/apache/sysds/hops/Hop.java b/src/main/java/org/apache/sysds/hops/Hop.java index ba0dd03..79a251f 100644 --- a/src/main/java/org/apache/sysds/hops/Hop.java +++ b/src/main/java/org/apache/sysds/hops/Hop.java @@ -50,6 +50,7 @@ import org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.meta.DataCharacteristics; import org.apache.sysds.runtime.meta.MatrixCharacteristics; +import org.apache.sysds.runtime.privacy.PrivacyConstraint; import 
org.apache.sysds.runtime.util.UtilFunctions; import java.util.ArrayList; @@ -72,6 +73,7 @@ public abstract class Hop implements ParseInfo protected ValueType _valueType; protected boolean _visited = false; protected DataCharacteristics _dc = new MatrixCharacteristics(); + protected PrivacyConstraint _privacyConstraint = new PrivacyConstraint(); protected UpdateType _updateType = UpdateType.COPY; protected ArrayList _parent = new
[systemml] branch master updated: [MINOR] Fix Python lineage tracing tests
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new bd537bb [MINOR] Fix Python lineage tracing tests bd537bb is described below commit bd537bb39130484318216a91d27e8243f6166c54 Author: Sebastian AuthorDate: Fri Apr 24 19:40:09 2020 +0200 [MINOR] Fix Python lineage tracing tests Change to the way lineage trace tests are executed, such that instead of having files to compare to, the python trace is compared to a trace made from systemds directly. Motivated by the fact that the previous tests were failing, because of inconsistencies between new traces and old. Furthermore this commit contains: - A Badge for Federated Python tests - An update to the automated tests of Federated Python for the new build instruction Closes #896. --- .github/workflows/federatedPython.yml |2 +- .github/workflows/python.yml |9 + README.md |1 + src/main/python/tests/lineage/README.md| 40 + src/main/python/tests/lineage/test_lineagetrace.py | 103 + src/main/python/tests/lt.txt |1 - src/main/python/tests/lt2.txt |4 - src/main/python/tests/lt_l2svm.txt | 2035 src/main/python/tests/test_l2svm_lineage.py| 88 - src/main/python/tests/test_lineagetrace.py | 75 - 10 files changed, 154 insertions(+), 2204 deletions(-) diff --git a/.github/workflows/federatedPython.yml b/.github/workflows/federatedPython.yml index 9ec7b20..c07cde9 100644 --- a/.github/workflows/federatedPython.yml +++ b/.github/workflows/federatedPython.yml @@ -51,7 +51,7 @@ jobs: ${{ runner.os }}-maven- - name: Maven clean & package - run: mvn clean package + run: mvn clean package -P distribution - name: Setup Python uses: actions/setup-python@v1 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index cc3e1cb..84933ac 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -88,3 +88,12 @@ jobs: echo
"Beginning tests" python -m unittest tests/*.py echo "Exit Status: " $? + +- name: Run all lineage python tests + run: | +export SYSTEMDS_ROOT=$(pwd) +export PATH=$SYSTEMDS_ROOT/bin:$PATH +export SYSDS_QUIET=1 +cd src/main/python +python -m unittest tests/lineage/*.py +echo "Exit Status: " $? \ No newline at end of file diff --git a/README.md b/README.md index ce1d574..86803e8 100644 --- a/README.md +++ b/README.md @@ -36,3 +36,4 @@ limitations under the License. ![Application Test](https://github.com/apache/systemml/workflows/Application%20Test/badge.svg) ![Function Test](https://github.com/apache/systemml/workflows/Function%20Test/badge.svg) ![Python Test](https://github.com/apache/systemml/workflows/Python%20Test/badge.svg) +![Federated Python Test](https://github.com/apache/systemml/workflows/Federated%20Python%20Test/badge.svg) diff --git a/src/main/python/tests/lineage/README.md b/src/main/python/tests/lineage/README.md new file mode 100644 index 000..eb2eb4c --- /dev/null +++ b/src/main/python/tests/lineage/README.md @@ -0,0 +1,40 @@ + + +# Python Lineage Tests + +To enable testing the lineage you have to setup your path environment. + +## Linux/bash + +From the root of the repository call: + +```bash +# Do once in terminal +export SYSTEMDS_ROOT=$(pwd) +export PATH=$SYSTEMDS_ROOT/bin:$PATH +export SYSDS_QUIET=1 +``` + +Once the environment is setup, you can begin testing with the following: + +```bash +cd src/main/python/ +python tests/lineage/*.py +``` diff --git a/src/main/python/tests/lineage/test_lineagetrace.py b/src/main/python/tests/lineage/test_lineagetrace.py new file mode 100644 index 000..e462c48 --- /dev/null +++ b/src/main/python/tests/lineage/test_lineagetrace.py @@ -0,0 +1,103 @@ +# - +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.
[systemml] branch master updated: [SYSTEMDS-333, 337] Improved lineage cache eviction
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 12f69c7 [SYSTEMDS-333,337] Improved lineage cache eviction 12f69c7 is described below commit 12f69c7c111cbe5e0ccc35d8bac58674b06480af Author: arnabp AuthorDate: Thu Apr 23 22:12:25 2020 +0200 [SYSTEMDS-333,337] Improved lineage cache eviction This patch improves lineage cache eviction by taking into account actual execution time of instructions/functions. The ordering policy is still LRU. Future commits will bring better approach to estimate spilling time and new eviction policies. Closes #891. --- docs/Tasks.txt | 6 +- .../runtime/controlprogram/BasicProgramBlock.java | 8 +- .../sysds/runtime/controlprogram/ProgramBlock.java | 4 +- .../instructions/cp/FunctionCallCPInstruction.java | 7 +- .../apache/sysds/runtime/lineage/LineageCache.java | 295 ++--- .../sysds/runtime/lineage/LineageCacheConfig.java | 23 +- .../runtime/lineage/LineageCacheStatistics.java| 10 + .../sysds/runtime/lineage/LineageRewriteReuse.java | 9 +- .../java/org/apache/sysds/utils/Statistics.java| 2 +- .../functions/lineage/.FunctionFullReuse5.dml.swp | Bin 0 -> 4096 bytes 10 files changed, 258 insertions(+), 106 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 6e6118c..2283d57 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -244,7 +244,11 @@ SYSTEMDS-320 Merge SystemDS into Apache SystemML OK SYSTEMDS-330 Lineage Tracing, Reuse and Integration * 331 Cache and reuse scalar outputs (instruction and multi-level) OK * 332 Parfor integration with multi-level reuse OK - * 333 Use exact execution time for cost based eviction + * 333 Improve cache eviction with actual compute timeOK + * 334 Cache scalars only with atleast one matrix inputs + * 335 Weighted eviction policy (function of size & computetime) + * 336 Better use of cache status to 
handle multithreading + * 337 Adjust disk I/O speed by recording actual time taken OK SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse * 341 Finalize unmarking of loop dependent operations diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java index 5f44ac3..4590f0e 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java @@ -108,14 +108,17 @@ public class BasicProgramBlock extends ProgramBlock //statement-block-level, lineage-based reuse LineageItem[] liInputs = null; + long t0 = 0; if (_sb != null && LineageCacheConfig.isMultiLevelReuse()) { liInputs = LineageItemUtils.getLineageItemInputstoSB(_sb.getInputstoSB(), ec); List outNames = _sb.getOutputNamesofSB(); - if( LineageCache.reuse(outNames, _sb.getOutputsofSB(), outNames.size(), liInputs, _sb.getName(), ec) ) { + if(liInputs != null && LineageCache.reuse(outNames, _sb.getOutputsofSB(), + outNames.size(), liInputs, _sb.getName(), ec) ) { if( DMLScript.STATISTICS ) LineageCacheStatistics.incrementSBHits(); return; } + t0 = System.nanoTime(); } //actual instruction execution @@ -123,6 +126,7 @@ public class BasicProgramBlock extends ProgramBlock //statement-block-level, lineage-based caching if (_sb != null && liInputs != null) - LineageCache.putValue(_sb.getOutputsofSB(), liInputs, _sb.getName(), ec); + LineageCache.putValue(_sb.getOutputsofSB(), liInputs, _sb.getName(), + ec, System.nanoTime()-t0); } } diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java b/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java index 5cde84e..8859d39 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java @@ -43,6 +43,7 @@ import 
org.apache.sysds.runtime.instructions.cp.IntObject; import org.apache.sysds.runtime.instructions.cp.ScalarObject; import org.apache.sysds.runtime.instructions.cp.StringObject; import org.apache.sys
[systemml] branch master updated: [SYSTEMDS-315] Python Federated Matrices (test, docs, scripts)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 0fe05a9 [SYSTEMDS-315] Python Federated Matrices (test, docs, scripts) 0fe05a9 is described below commit 0fe05a97840238c0130e58c1e4ec19b9195bb1a9 Author: Sebastian AuthorDate: Thu Apr 23 21:59:44 2020 +0200 [SYSTEMDS-315] Python Federated Matrices (test, docs, scripts) - Easy start of federated worker in /bin/systemds.sh - Setup of tests for the Python language bindings federated matrices - Tests of the basic federated operations - Out commented "advanced" functionality that is for later. - Initial tutorial on Python federated matrices - Minor :bug: fix in federated matrix, not allowing multiple sources - Github workflow action for automated federated tests Closes #871. --- .github/workflows/federatedPython.yml | 85 +++ README.md | 8 +- bin/README.md | 81 -- bin/systemds.sh| 40 ++- docker/build.sh| 3 + docker/{build.sh => pythonsysds.Dockerfile}| 14 +- docker/sysds.Dockerfile| 13 +- docker/testsysds.Dockerfile| 10 +- docs/README.md | 25 +- docs/Tasks.txt | 1 + src/assembly/bin/README.md | 6 +- src/main/python/docs/source/federated.rst | 126 + src/main/python/docs/source/index.rst | 7 + src/main/python/systemds/matrix/matrix.py | 3 +- src/main/python/tests/federated/runFedTest.sh | 67 + .../tests/federated/test_federated_aggregations.py | 236 + .../python/tests/federated/test_federated_basic.py | 281 + 17 files changed, 940 insertions(+), 66 deletions(-) diff --git a/.github/workflows/federatedPython.yml b/.github/workflows/federatedPython.yml new file mode 100644 index 000..9ec7b20 --- /dev/null +++ b/.github/workflows/federatedPython.yml @@ -0,0 +1,85 @@ +#- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#- + +name: Federated Python Test + +on: [push, pull_request] + +jobs: + applicationsTests: +runs-on: ${{ matrix.os }} +strategy: + fail-fast: false + matrix: +python-version: [3.6] +os: [ubuntu-latest] +java: [ 1.8 ] +name: Python Test +steps: +- name: Checkout Repository + uses: actions/checkout@v2 + +- name: Setup Java + uses: actions/setup-java@v1 + with: +java-version: ${{ matrix.java }} + +- name: Cache Maven Dependencies + uses: actions/cache@v1 + with: +path: ~/.m2/repository +key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} +restore-keys: | + ${{ runner.os }}-maven- + +- name: Maven clean & package + run: mvn clean package + +- name: Setup Python + uses: actions/setup-python@v1 + with: +python-version: ${{ matrix.python-version }} +architecture: 'x64' + +- name: Cache Pip Dependencies + uses: actions/cache@v1 + with: +path: ~/.cache/pip +key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('src/main/python/setup.py') }} +restore-keys: | + ${{ runner.os }}-pip-${{ matrix.python-version }}- + +- name: Install pip Dependencies + run: pip install numpy py4j wheel + +- name: Build Python Package + run: | +cd src/main/python +python create_python_dist.py + +- name: Run Federated Python Tests + run: | +export SYSTEMDS_ROOT=$(pwd) +export 
PATH=$SYSTEMDS_ROOT/bin:$PATH +cd src/main/python +./tests/federated/runFedTest.sh tests/fed
[systemml] branch master updated (9bd68ff -> cf74661)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git. from 9bd68ff [SYSTEMDS-233] Fix multi-level lineage caching (parfor, determinism) new 0426099 [MINOR] Removal of remaining pydml test files and tests new cf74661 [SYSTEMDS-207] Fix dml-builtin-function hoisting from expressions The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: docs/Tasks.txt | 1 + .../java/org/apache/sysds/parser/DMLProgram.java | 4 +- .../org/apache/sysds/parser/StatementBlock.java| 59 ++ .../functions/misc/FunctionInExpressionTest.java | 7 +++ .../test/functions/mlcontext/MLContextTest.java| 59 -- .../scripts/functions/misc/FunInExpression7.dml| 7 ++- .../scripts/functions/misc/PackageFunCall1.pydml | 25 - .../scripts/functions/misc/PackageFunCall2.pydml | 26 -- .../scripts/functions/misc/PackageFunLib.pydml | 25 - 9 files changed, 52 insertions(+), 161 deletions(-) copy scripts/nn/test/compare_backends/gen_softmax.dml => src/test/scripts/functions/misc/FunInExpression7.dml (86%) delete mode 100644 src/test/scripts/functions/misc/PackageFunCall1.pydml delete mode 100644 src/test/scripts/functions/misc/PackageFunCall2.pydml delete mode 100644 src/test/scripts/functions/misc/PackageFunLib.pydml
[systemml] 02/02: [SYSTEMDS-207] Fix dml-builtin-function hoisting from expressions
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git commit cf74661016928e3413d693f939a67964f3256b19 Author: Matthias Boehm AuthorDate: Tue Apr 21 22:00:46 2020 +0200 [SYSTEMDS-207] Fix dml-builtin-function hoisting from expressions Function calls to dml-bodied functions need to bind their outputs to logical variable names and hence require a cut of the basic block for correctness. To still allow such functions in expressions (which is very common), we perform function call hoisting from expressions during parsing in order to be able to cut after entire statements. This automatically applied to the new dml-bodied builtin functions too, but because theses functions are loaded before ran into null pointer exceptions during validation (thanks Arnab for catching this). This fix extends the function hoisting by probing for dml-bodied builtin functions and lazily loading, parsing, and adding the required functions if needed. By reusing the recently added mechanics from lazy function loading in eval functions, we keep the number of alternative entry points very small. 
--- docs/Tasks.txt | 1 + .../java/org/apache/sysds/parser/DMLProgram.java | 4 +- .../org/apache/sysds/parser/StatementBlock.java| 59 ++ .../functions/misc/FunctionInExpressionTest.java | 7 +++ .../scripts/functions/misc/FunInExpression7.dml| 26 ++ 5 files changed, 73 insertions(+), 24 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 5ae71b1..7a61c05 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -165,6 +165,7 @@ SYSTEMDS-200 Various Fixes * 204 Fix rewrite simplify sequences of binary comparisons OK * 205 Fix scoping of builtin dml-bodied functions (vs user-defined) * 206 Fix codegen outer template compilation (tsmm) OK + * 207 Fix builtin function call hoisting from expressionsOK SYSTEMDS-210 Extended lists Operations * 211 Cbind and Rbind over lists of matrices OK diff --git a/src/main/java/org/apache/sysds/parser/DMLProgram.java b/src/main/java/org/apache/sysds/parser/DMLProgram.java index 4e5e229..2487aec 100644 --- a/src/main/java/org/apache/sysds/parser/DMLProgram.java +++ b/src/main/java/org/apache/sysds/parser/DMLProgram.java @@ -166,11 +166,11 @@ public class DMLProgram try { //handle statement blocks of all functions for( FunctionStatementBlock fsb : getFunctionStatementBlocks() ) - StatementBlock.rHoistFunctionCallsFromExpressions(fsb); + StatementBlock.rHoistFunctionCallsFromExpressions(fsb, this); //handle statement blocks of main program ArrayList tmp = new ArrayList<>(); for( StatementBlock sb : _blocks ) - tmp.addAll(StatementBlock.rHoistFunctionCallsFromExpressions(sb)); + tmp.addAll(StatementBlock.rHoistFunctionCallsFromExpressions(sb, this)); _blocks = tmp; } catch(LanguageException ex) { diff --git a/src/main/java/org/apache/sysds/parser/StatementBlock.java b/src/main/java/org/apache/sysds/parser/StatementBlock.java index f275a84..f6a8f72 100644 --- a/src/main/java/org/apache/sysds/parser/StatementBlock.java +++ b/src/main/java/org/apache/sysds/parser/StatementBlock.java @@ -23,6 +23,8 @@ import java.util.ArrayList; import 
java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -37,6 +39,7 @@ import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.parser.Expression.FormatType; import org.apache.sysds.parser.LanguageException.LanguageErrorCodes; import org.apache.sysds.parser.PrintStatement.PRINTTYPE; +import org.apache.sysds.parser.dml.DmlSyntacticValidator; import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence; import org.apache.sysds.utils.MLContextProxy; @@ -460,13 +463,13 @@ public class StatementBlock extends LiveVariableAnalysis implements ParseInfo } - public static List rHoistFunctionCallsFromExpressions(StatementBlock current) { + public static List rHoistFunctionCallsFromExpressions(StatementBlock current, DMLProgram prog) { if (current instanceof FunctionStatementBlock) { FunctionStatementBlock fsb = (FunctionStatementBlock)current; FunctionStatement fstmt = (FunctionStatement)fsb.getStatement(0); ArrayList tmp = new Arr
[systemml] 01/02: [MINOR] Removal of remaining pydml test files and tests
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git commit 0426099b80fb451239e7d9a39bdacf752c79c80e Author: Matthias Boehm AuthorDate: Tue Apr 21 21:43:08 2020 +0200 [MINOR] Removal of remaining pydml test files and tests --- .../test/functions/mlcontext/MLContextTest.java| 59 -- .../scripts/functions/misc/PackageFunCall1.pydml | 25 - .../scripts/functions/misc/PackageFunCall2.pydml | 26 -- .../scripts/functions/misc/PackageFunLib.pydml | 25 - 4 files changed, 135 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java index ce7df49..ac7b3e7 100644 --- a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java +++ b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java @@ -1063,22 +1063,6 @@ public class MLContextTest extends MLContextTestBase { ml.execute(script); } - @Test(expected = MLContextException.class) - public void testJavaRDDBadMetadataPYDML() { - System.out.println("MLContextTest - JavaRDD bad metadata PYML"); - - List list = new ArrayList<>(); - list.add("1,2,3"); - list.add("4,5,6"); - list.add("7,8,9"); - JavaRDD javaRDD = sc.parallelize(list); - - MatrixMetadata mm = new MatrixMetadata(1, 1, 9); - - Script script = dml("print('sum: ' + sum(M))").in("M", javaRDD, mm); - ml.execute(script); - } - @Test public void testRDDGoodMetadataDML() { System.out.println("MLContextTest - RDD good metadata DML"); @@ -1274,28 +1258,6 @@ public class MLContextTest extends MLContextTestBase { } @Test - public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() { - System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified"); - - List> list = new ArrayList<>(); - list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0))); - list.add(new 
Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0))); - list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0))); - JavaRDD> javaRddTuple = sc.parallelize(list); - - JavaRDD javaRddRow = javaRddTuple.map(new DoubleVectorRow()); - List fields = new ArrayList<>(); - fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); - fields.add(DataTypes.createStructField("C1", new VectorUDT(), true)); - StructType schema = DataTypes.createStructType(fields); - Dataset dataFrame = spark.createDataFrame(javaRddRow, schema); - - Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame); - setExpectedStdOut("sum: 45.0"); - ml.execute(script); - } - - @Test public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() { System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified"); @@ -1317,27 +1279,6 @@ public class MLContextTest extends MLContextTestBase { } @Test - public void testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() { - System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column, no format specified"); - - List list = new ArrayList<>(); - list.add(Vectors.dense(1.0, 2.0, 3.0)); - list.add(Vectors.dense(4.0, 5.0, 6.0)); - list.add(Vectors.dense(7.0, 8.0, 9.0)); - JavaRDD javaRddVector = sc.parallelize(list); - - JavaRDD javaRddRow = javaRddVector.map(new VectorRow()); - List fields = new ArrayList<>(); - fields.add(DataTypes.createStructField("C1", new VectorUDT(), true)); - StructType schema = DataTypes.createStructType(fields); - Dataset dataFrame = spark.createDataFrame(javaRddRow, schema); - - Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame); - setExpectedStdOut("sum: 45.0"); - ml.execute(script); - } - - @Test public void testDisplayBooleanDML() { System.out.println("MLContextTest - display boolean DML"); String s = "print(b);"; diff
[systemml] branch master updated: [SYSTEMDS-233] Fix multi-level lineage caching (parfor, determinism)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 9bd68ff [SYSTEMDS-233] Fix multi-level lineage caching (parfor, determinism) 9bd68ff is described below commit 9bd68ffc5d211583a2ebcfe5be514abf4cc29b69 Author: Matthias Boehm AuthorDate: Wed Apr 15 21:46:16 2020 +0200 [SYSTEMDS-233] Fix multi-level lineage caching (parfor, determinism) This patch fixes some issues with multi-level lineage caching in parfor, specifically (1) to allow function reuse despite differently named parfor worker functions, and (2) the check for deterministic function results incorrectly probed too far and thus missing opportunities. However, down the road we should add an IPA pass which determines once for all functions if they are deterministic and pass this information down to the runtime, in order to avoid scenarios where threads are already blocking on placeholders that are later removed due to non-deterministic functions. 
--- .../apache/sysds/hops/recompile/Recompiler.java| 10 +- src/main/java/org/apache/sysds/lops/Lop.java | 2 +- .../sysds/runtime/controlprogram/ProgramBlock.java | 17 ++-- .../instructions/cp/FunctionCallCPInstruction.java | 23 +- .../apache/sysds/runtime/lineage/LineageCache.java | 16 +-- .../runtime/lineage/LineageCacheStatistics.java| 10 +- .../sysds/runtime/lineage/LineageItemUtils.java| 10 +++--- .../java/org/apache/sysds/utils/Statistics.java| 2 +- .../functions/lineage/FunctionFullReuseTest.java | 7 +++ .../functions/lineage/FunctionFullReuse6.dml | 4 ++-- 10 files changed, 71 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java b/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java index 2b11c73..d058c6a 100644 --- a/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java +++ b/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java @@ -155,7 +155,7 @@ public class Recompiler } // replace thread ids in new instructions - if( tid != 0 ) //only in parfor context + if( ProgramBlock.isThreadID(tid) ) //only in parfor context newInst = ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, null, false, false); // remove writes if called through mlcontext or jmlc @@ -187,7 +187,7 @@ public class Recompiler } // replace thread ids in new instructions - if( tid != 0 ) //only in parfor context + if( ProgramBlock.isThreadID(tid) ) //only in parfor context newInst = ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, null, false, false); // explain recompiled instructions @@ -209,7 +209,7 @@ public class Recompiler } // replace thread ids in new instructions - if( tid != 0 ) //only in parfor context + if( ProgramBlock.isThreadID(tid) ) //only in parfor context newInst = ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, null, false, false); // explain recompiled instructions @@ -231,7 +231,7 @@ public class Recompiler } // 
replace thread ids in new instructions - if( tid != 0 ) //only in parfor context + if( ProgramBlock.isThreadID(tid) ) //only in parfor context newInst = ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, null, false, false); // explain recompiled hops / instructions @@ -253,7 +253,7 @@ public class Recompiler } // replace thread ids in new instructions - if( tid != 0 ) //only in parfor context + if( ProgramBlock.isThreadID(tid) ) //only in parfor context newInst = ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, null, false, false); // explain recompiled hops / instructions diff --git a/src/main/java/org/apache/sysds/lops/Lop.java b/src/main/java/org/apache/sysds/lops/Lop.java index fa25000..8bb7e1a 100644 --- a/src/main/java/org/apache/sysds/lops/Lop.java +++ b/src/main/java/org/apache/sysds/lops/Lop.java @@ -82,7 +82,7 @@ public abstract class Lop public static final String PROCESS_PREFIX = "_p"; public static final String CP_ROOT_THREAD
[systemml] branch master updated: [SYSTEMDS-118] New generic gridSearch builtin function
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new acfe388 [SYSTEMDS-118] New generic gridSearch builtin function acfe388 is described below commit acfe3883a50b827e78db45d0db901a3f448add20 Author: Matthias Boehm AuthorDate: Mon Apr 13 22:05:52 2020 +0200 [SYSTEMDS-118] New generic gridSearch builtin function This patch adds a new generic grid search function for hyper-parameter optimization of arbitrary ML algorithms and parameter combinations. This function takes train and eval functions by name as well as lists of parameter names and vectors of their values, and returns the parameter combination and model that gave the best results. So far hyper-parameter optimization is working, but the core training/scoring part needs additional features on list data types (e.g., list-list append, and eval fcalls with lists of unnamed and named parameters). Also, before it can be applied in practice it needs an integration with cross validation. 
--- docs/Tasks.txt | 2 +- scripts/builtin/gridSearch.dml | 80 + .../java/org/apache/sysds/common/Builtins.java | 1 + .../functions/builtin/BuiltinGridSearchTest.java | 82 ++ .../scripts/functions/builtin/GridSearchLM.dml | 44 5 files changed, 208 insertions(+), 1 deletion(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index c4fa46f..5ae71b1 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -91,7 +91,7 @@ SYSTEMDS-110 New Builtin Functions * 115 Builtin function for model debugging (slice finder)OK * 116 Builtin function for kmeansOK * 117 Builtin function for lm cross validation OK - * 118 Builtin function for hyperparameter grid search with CVlm + * 118 Builtin function for hyperparameter grid search * 119 Builtin functions for l2svm and msvm OK SYSTEMDS-120 Performance Features diff --git a/scripts/builtin/gridSearch.dml b/scripts/builtin/gridSearch.dml new file mode 100644 index 000..227b863 --- /dev/null +++ b/scripts/builtin/gridSearch.dml @@ -0,0 +1,80 @@ +#- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#- + +m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String predict, + List[String] params, List[Unknown] paramValues, Boolean verbose = TRUE) + return (Matrix[Double] B, Frame[Unknown] opt) +{ + # Step 0) preparation of parameters, lengths, and values in convenient form + numParams = length(params); + paramLens = matrix(0, numParams, 1); + for( j in 1:numParams ) { +vect = as.matrix(paramValues[j,1]); +paramLens[j,1] = nrow(vect); + } + paramVals = matrix(0, numParams, max(paramLens)); + for( j in 1:numParams ) { +vect = as.matrix(paramValues[j,1]); +paramVals[j,1:nrow(vect)] = t(vect); + } + cumLens = rev(cumprod(rev(paramLens))/rev(paramLens)); + numConfigs = prod(paramLens); + + # Step 1) materialize hyper-parameter combinations + # (simplify debugging and compared to compute negligible) + HP = matrix(0, numConfigs, numParams); + parfor( i in 1:nrow(HP) ) { +for( j in 1:numParams ) + HP[i,j] = paramVals[j,as.scalar(((i-1)/cumLens[j,1])%%paramLens[j,1]+1)]; + } + + if( verbose ) +print("GridSeach: Hyper-parameter combinations: \n"+toString(HP)); + + # Step 2) training/scoring of parameter combinations + # TODO integrate cross validation + Rbeta = matrix(0, nrow(HP), ncol(X)); + Rloss = matrix(0, nrow(HP), 1); + arguments = list(X=X, y=y); + + parfor( i in 1:nrow(HP) ) { +# a) prepare training arguments +largs = arguments; +for( j in 1:numParams ) { + key = as.scalar(params[j]); + value = as.scalar(HP[i,j]); + largs = append(largs, list(key=va
[systemml] branch master updated: [SYSTEMDS-15] Travis remove badge
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 4bbba40 [SYSTEMDS-15] Travis remove badge 4bbba40 is described below commit 4bbba4051e63e67a3a2366ee3f414f01cc7d0b93 Author: Sebastian AuthorDate: Mon Apr 13 18:44:24 2020 +0200 [SYSTEMDS-15] Travis remove badge Missed that the badge still was in the README. This is now removed, furthermore the task associated with travis have been modified to reflect that it is removed, and why. Closes #886. --- README.md | 2 -- docs/Tasks.txt | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 1ccb7b4..a9cb743 100644 --- a/README.md +++ b/README.md @@ -27,9 +27,7 @@ limitations under the License. ## Status -[![Build Status](https://travis-ci.org/apache/systemml.svg?branch=master)](https://travis-ci.org/apache/systemml) [![License](https://img.shields.io/badge/License-Apache%202.0-gre.svg)](https://opensource.org/licenses/Apache-2.0) - ![Build](https://github.com/apache/systemml/workflows/Build/badge.svg) ![Documentation](https://github.com/apache/systemml/workflows/Documentation/badge.svg) ![Component Test](https://github.com/apache/systemml/workflows/Component%20Test/badge.svg) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 316971d..c4fa46f 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -9,7 +9,7 @@ SYSTEMDS-10 Compiler Rework / Misc * 12 Remove unnecessary HOP/LOP indirections OK * 13 Refactoring test cases into component/integration OK * 14 Complete removal of external functions from all scripts - * 15 Travis integration w/ subset of tests OK + * 15 Travis integration w/ subset of tests OK (removed for Github Actions) * 16 Remove instruction patching * 17 Refactoring of program block hierarchy OK * 18 Improve API for new dml-bodied builtin functionsOK
[systemml] branch master updated: [SYSTEMDS-291] Extended eval lazy function compilation (nested builtins)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 5f1cdf3 [SYSTEMDS-291] Extended eval lazy function compilation (nested builtins) 5f1cdf3 is described below commit 5f1cdf367b0616359461f1fd198898d59f0598a4 Author: Matthias Boehm AuthorDate: Mon Apr 13 18:39:47 2020 +0200 [SYSTEMDS-291] Extended eval lazy function compilation (nested builtins) This patch extends the lazy function compilation of dml-bodied builtin functions called through eval. We now support nested dml-bodied function calls (e.g., eval -> lm -> lmDS/lmCG) which is crucial for generic primitives of hyper-parameter optimization and the enumeration of cleaning pipelines. --- .../sysds/hops/rewrite/RewriteConstantFolding.java | 2 +- .../java/org/apache/sysds/parser/DMLProgram.java | 4 ++ .../org/apache/sysds/parser/DMLTranslator.java | 2 +- .../sysds/parser/FunctionCallIdentifier.java | 8 +-- .../sysds/parser/FunctionStatementBlock.java | 14 ++--- .../org/apache/sysds/parser/IfStatementBlock.java | 4 +- .../org/apache/sysds/parser/StatementBlock.java| 2 +- .../sysds/parser/dml/DmlSyntacticValidator.java| 8 ++- .../sysds/runtime/controlprogram/Program.java | 18 +- .../controlprogram/paramserv/ParamservUtils.java | 2 +- .../instructions/cp/EvalNaryCPInstruction.java | 70 ++ .../sysds/runtime/lineage/LineageRewriteReuse.java | 2 +- .../test/functions/mlcontext/MLContextTest.java| 10 .../mlcontext/eval4-nested_builtin-test.dml| 30 ++ 14 files changed, 129 insertions(+), 47 deletions(-) diff --git a/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java b/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java index ec098e6..6e04082 100644 --- a/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java +++ 
b/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java @@ -184,7 +184,7 @@ public class RewriteConstantFolding extends HopRewriteRule private BasicProgramBlock getProgramBlock() { if( _tmpPB == null ) - _tmpPB = new BasicProgramBlock( new Program() ); + _tmpPB = new BasicProgramBlock(new Program()); return _tmpPB; } diff --git a/src/main/java/org/apache/sysds/parser/DMLProgram.java b/src/main/java/org/apache/sysds/parser/DMLProgram.java index e86464c..4e5e229 100644 --- a/src/main/java/org/apache/sysds/parser/DMLProgram.java +++ b/src/main/java/org/apache/sysds/parser/DMLProgram.java @@ -131,6 +131,10 @@ public class DMLProgram return ret; } + public boolean containsFunctionStatementBlock(String name) { + return _functionBlocks.containsKey(name); + } + public void addFunctionStatementBlock(String fname, FunctionStatementBlock fsb) { _functionBlocks.put(fname, fsb); } diff --git a/src/main/java/org/apache/sysds/parser/DMLTranslator.java b/src/main/java/org/apache/sysds/parser/DMLTranslator.java index 9e41f9b..e61c928 100644 --- a/src/main/java/org/apache/sysds/parser/DMLTranslator.java +++ b/src/main/java/org/apache/sysds/parser/DMLTranslator.java @@ -412,7 +412,7 @@ public class DMLTranslator throws LanguageException, DMLRuntimeException, LopsException, HopsException { // constructor resets the set of registered functions - Program rtprog = new Program(); + Program rtprog = new Program(prog); // for all namespaces, translate function statement blocks into function program blocks for (String namespace : prog.getNamespaces().keySet()){ diff --git a/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java b/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java index fc5e1d8..497d591 100644 --- a/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java +++ b/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java @@ -115,8 +115,8 @@ public class FunctionCallIdentifier extends DataIdentifier } if (hasNamed && 
hasUnnamed){ raiseValidateError(" In DML, functions can only have named parameters " + - "(e.g., name1=value1, name2=value2) or unnamed parameters (e.g, value1, value2). " + - _name + " has both parameter types.", conditional); + "(e.g., name1=value1, name2=value2) or unnamed parameters (e.g, value1, value2). " + +
[systemml] branch master updated: [SYSTEMDS-263] Initial design ONNX graph importer
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 0dae427 [SYSTEMDS-263] Initial design ONNX graph importer 0dae427 is described below commit 0dae42705f91b00abc03be09d810b3a9286338c5 Author: Lukas Timpl AuthorDate: Sun Apr 12 20:43:47 2020 +0200 [SYSTEMDS-263] Initial design ONNX graph importer Since ONNX does support conditional operators (loop, if), I've tailored the design towards a command-line tool that generates a DML script as discussed. AMLS project SS2020. Closes #885. --- docs/Tasks.txt | 1 + docs/onnx-systemds-design.md | 46 2 files changed, 47 insertions(+) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index cfcab1a..316971d 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -205,6 +205,7 @@ SYSTEMDS-250 Extended Slice Finding SYSTEMDS-260 Misc Tools * 261 Stable marriage algorithm OK * 262 Data augmentation tool for data cleaning OK + * 263 ONNX graph importer/exporter SYSTEMDS-270 Compressed Matrix Blocks * 271 Reintroduce compressed matrix blocks from SystemML OK diff --git a/docs/onnx-systemds-design.md b/docs/onnx-systemds-design.md new file mode 100644 index 000..9650f9c --- /dev/null +++ b/docs/onnx-systemds-design.md @@ -0,0 +1,46 @@ +# onnx-systemds + +A tool for importing/exporting [ONNX](https://github.com/onnx/onnx/blob/master/docs/IR.md) graphs into/from SystemDS DML scripts. 
+ + +## Goals + +* Support for importing [operators of the ONNX base definition](https://github.com/onnx/onnx/blob/master/docs/Operators.md) + +* Support for importing [operators defined by ONNX-ML](https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md) + +* Support for exporting DML script to ONNX graphs + +## Limitations + +* Not able to support all data types / operators as they are not currently supported by SystemDS + + + +## Suggested Implementation + +Since the ONNX specification includes the conditional operators [loop](https://github.com/onnx/onnx/blob/master/docs/Operators.md#Loop) and [if](https://github.com/onnx/onnx/blob/master/docs/Operators.md#If), a direct conversion from ONNX to the internal HOP might not be ideal. + +Hence my suggested implementation is a dedicated tool invoked from command line which generates DML scripts. This also enables optimizations performed by the compiler at both graph and program level. + +### Example Call + +```bash +onnx-systemds model.onx --out model_script.dml +``` + + +### Tooling + +* Due to the availability of a [Python API](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md) for ONNX, I would suggest implementing the tool in Python +* Another advantage of Python is good support for template engines e.g. [Jinja](https://jinja.palletsprojects.com/en/2.11.x/) +* An implementation could use templates for various operators which are then combined into a script + +### Implementation Details + +ONNX is a [serialized graph](https://github.com/onnx/onnx/blob/master/docs/IR.md#graphs) structured as a sorted list of nodes that form a DAG (directed acyclic graph). + +1. Loading in the serialized structure +2. [Checking](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md#checking-an-onnx-model) model and [converting](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md#converting-version-of-an-onnx-model-within-default-domain-aionnx) models to a common version +3. 
Building a simple internal graph structure (for arbitrary operators) +4. Generating the DML script while traversing this graph (provided information in doc_strings and other description variables are added as comments to improve human-readability of the generated script)
[systemml] branch master updated: [SYSTEMDS-52] Fix libsvm reader/writer integration and correctness
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 84ef713 [SYSTEMDS-52] Fix libsvm reader/writer integration and correctness 84ef713 is described below commit 84ef71326c6781bad4ed9b39a210ee2cd4a6d4bd Author: Matthias Boehm AuthorDate: Sat Apr 11 23:12:37 2020 +0200 [SYSTEMDS-52] Fix libsvm reader/writer integration and correctness This patch fixes a correctness issue of the libsvm local writers, which incorrectly shifted the output indexes twice for sparse inputs. Furthermore, the libsvm local readers were not fully integrated in all code paths yet. The distributed libsvm readers/writers still remain to be integrated. --- docs/Tasks.txt | 4 +- .../sysds/runtime/io/MatrixReaderFactory.java | 69 ++ .../sysds/runtime/io/ReaderTextLIBSVMParallel.java | 2 +- .../apache/sysds/runtime/io/WriterTextLIBSVM.java | 12 ++-- .../test/functions/data/misc/NoRenameTest.java | 48 +++ src/test/scripts/functions/data/NoRenameTest1.dml | 2 +- 6 files changed, 65 insertions(+), 72 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 7bb0c10..cfcab1a 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -47,8 +47,8 @@ SYSTEMDS-40 Preprocessing builtins SYSTEMDS-50 I/O Formats * 51 Support for homogeneous JSON (local/distributed) - * 52 Support for libsvm files (local/distributed) - * 53 New sql data source (local, distributed) + * 52 Support for libsvm files (local/distributed) + * 53 New sql data source (local, distributed) * 54 Support for is.na, is.nan, is.infinite OK SYSTEMDS-60 Update SystemML improvements diff --git a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java index 3d2af34..168a336 100644 --- a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java +++ 
b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java @@ -28,36 +28,34 @@ import org.apache.sysds.runtime.matrix.data.MatrixBlock; public class MatrixReaderFactory { - - public static MatrixReader createMatrixReader( InputInfo iinfo ) + public static MatrixReader createMatrixReader(InputInfo iinfo) { MatrixReader reader = null; + boolean par = ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS); + boolean mcsr = MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR; - if( iinfo == InputInfo.TextCellInputInfo || iinfo == InputInfo.MatrixMarketInputInfo ) - { - if( ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS) && MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR ) - reader = new ReaderTextCellParallel( iinfo ); - else - reader = new ReaderTextCell( iinfo ); + if( iinfo == InputInfo.TextCellInputInfo || iinfo == InputInfo.MatrixMarketInputInfo ) { + reader = (par & mcsr) ? + new ReaderTextCellParallel(iinfo) : new ReaderTextCell(iinfo); } - else if( iinfo == InputInfo.CSVInputInfo ) - { - if( ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS) && MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR ) - reader = new ReaderTextCSVParallel(new FileFormatPropertiesCSV()); - else - reader = new ReaderTextCSV(new FileFormatPropertiesCSV()); + else if( iinfo == InputInfo.CSVInputInfo ) { + reader = (par & mcsr) ? + new ReaderTextCSVParallel(new FileFormatPropertiesCSV()) : + new ReaderTextCSV(new FileFormatPropertiesCSV()); + } + else if( iinfo == InputInfo.LIBSVMInputInfo) { + reader = (par & mcsr) ? 
+ new ReaderTextLIBSVMParallel() : new ReaderTextLIBSVM(); } else if( iinfo == InputInfo.BinaryCellInputInfo ) reader = new ReaderBinaryCell(); else if( iinfo == InputInfo.BinaryBlockInputInfo ) { - if( ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_BINARYFORMATS) && MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR ) - reader =
[systemml] branch master updated: [SYSTEMML-2538] Fix csv/text output rename in forced singlenode
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 47924e6 [SYSTEMML-2538] Fix csv/text output rename in forced singlenode 47924e6 is described below commit 47924e6aced3dac0768756c7dfec932d696b6a3f Author: Matthias Boehm AuthorDate: Sat Apr 11 22:51:50 2020 +0200 [SYSTEMML-2538] Fix csv/text output rename in forced singlenode This patch fixes an issue where an input csv/text file is directly fed into a persistent write, which eventually just renames the input file because it already exists on HDFS in the right format. We now explicitly guard against persistently read inputs, which can only occur w/ forced singlenode execution mode because otherwise (in spark and hybrid) there is a reblock (potentially in memory) that creates a new metadata object. Furthermore, this also includes a minor internal refactoring for consistently obtaining input/output infos for external format strings, as well as a slight modification of the MatrixMatrixCellwiseTest to run over smaller inputs (because R is taking quite a while for them). 
--- .../java/org/apache/sysds/api/jmlc/Connection.java | 4 +- .../org/apache/sysds/parser/DataExpression.java| 4 +- .../controlprogram/caching/CacheableData.java | 2 +- .../federated/FederatedWorkerHandler.java | 2 +- .../instructions/cp/VariableCPInstruction.java | 38 ++- .../sysds/runtime/io/MatrixReaderFactory.java | 2 +- .../sysds/runtime/matrix/data/InputInfo.java | 8 +- .../sysds/runtime/matrix/data/OutputInfo.java | 2 +- .../org/apache/sysds/test/AutomatedTestBase.java | 14 +- .../FullMatrixMatrixCellwiseOperationTest.java | 4 +- .../test/functions/data/misc/NoRenameTest.java | 254 + .../functions/frame/FrameMatrixReblockTest.java| 5 +- .../test/functions/frame/FrameMatrixWriteTest.java | 2 +- .../transform/TransformEncodeDecodeTest.java | 2 +- src/test/scripts/functions/data/NoRenameTest1.dml | 24 ++ src/test/scripts/functions/data/NoRenameTest2.dml | 24 ++ 16 files changed, 345 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/apache/sysds/api/jmlc/Connection.java b/src/main/java/org/apache/sysds/api/jmlc/Connection.java index e1557b8..a008939 100644 --- a/src/main/java/org/apache/sysds/api/jmlc/Connection.java +++ b/src/main/java/org/apache/sysds/api/jmlc/Connection.java @@ -372,7 +372,7 @@ public class Connection implements Closeable long nnz = jmtd.containsKey(DataExpression.READNNZPARAM)? 
jmtd.getLong(DataExpression.READNNZPARAM) : -1; String format = jmtd.getString(DataExpression.FORMAT_TYPE); - InputInfo iinfo = InputInfo.stringExternalToInputInfo(format); + InputInfo iinfo = InputInfo.fromExternalString(format); //read matrix file return readDoubleMatrix(fname, iinfo, rows, cols, blen, nnz); @@ -614,7 +614,7 @@ public class Connection implements Closeable long rows = jmtd.getLong(DataExpression.READROWPARAM); long cols = jmtd.getLong(DataExpression.READCOLPARAM); String format = jmtd.getString(DataExpression.FORMAT_TYPE); - InputInfo iinfo = InputInfo.stringExternalToInputInfo(format); + InputInfo iinfo = InputInfo.fromExternalString(format); //read frame file return readStringFrame(fname, iinfo, rows, cols); diff --git a/src/main/java/org/apache/sysds/parser/DataExpression.java b/src/main/java/org/apache/sysds/parser/DataExpression.java index 6ab1eb2..1b7ddc4 100644 --- a/src/main/java/org/apache/sysds/parser/DataExpression.java +++ b/src/main/java/org/apache/sysds/parser/DataExpression.java @@ -220,7 +220,7 @@ public class DataExpression extends DataIdentifier return null; } dataExpr.addVarParam(currName, currExpr); - } + } } else if (functionName.equalsIgnoreCase("rand")){ @@ -1178,7 +1178,7 @@ public class DataExpression extends DataIdentifier getOutput().setNnz(-1L); } - else{ + else{ raiseValidateError("Unknown Data Type " + dataTypeString + ". Valid values: "
[systemml] branch master updated: [SYSTEMDS-351] New builtin dropInvalid for cleaning by expected schema
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 158ccff [SYSTEMDS-351] New builtin dropInvalid for cleaning by expected schema 158ccff is described below commit 158ccffbadef845058d9f3a2c5084fbb8fa00429 Author: Shafaq Siddiqi AuthorDate: Fri Apr 10 20:44:33 2020 +0200 [SYSTEMDS-351] New builtin dropInvalid for cleaning by expected schema Closes #883. --- docs/Tasks.txt | 3 + .../java/org/apache/sysds/common/Builtins.java | 1 + src/main/java/org/apache/sysds/hops/BinaryOp.java | 3 +- src/main/java/org/apache/sysds/hops/Hop.java | 5 +- src/main/java/org/apache/sysds/lops/Binary.java| 11 +- src/main/java/org/apache/sysds/lops/BinaryM.java | 4 +- .../sysds/parser/BuiltinFunctionExpression.java| 10 + .../org/apache/sysds/parser/DMLTranslator.java | 21 ++- .../sysds/runtime/functionobjects/Builtin.java | 3 +- .../runtime/instructions/CPInstructionParser.java | 5 +- .../runtime/instructions/InstructionUtils.java | 3 + .../runtime/instructions/SPInstructionParser.java | 10 +- .../instructions/cp/BinaryCPInstruction.java | 2 + .../cp/BinaryFrameFrameCPInstruction.java | 47 + .../runtime/instructions/cp/CPInstruction.java | 2 +- .../spark/BinaryFrameFrameSPInstruction.java | 84 + .../instructions/spark/BinarySPInstruction.java| 42 +++-- .../sysds/runtime/matrix/data/FrameBlock.java | 36 .../apache/sysds/runtime/util/UtilFunctions.java | 4 +- .../functions/frame/FrameIsCorrectTypeTest.java| 206 + src/test/scripts/functions/frame/DropInvalid.dml | 25 +++ 21 files changed, 480 insertions(+), 47 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index d1168e6..5fdd96d 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -250,5 +250,8 @@ SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse * 344 Unmark functions/SBs containing non-determinism for caching * 345 Compiler 
assisted cache configuration +SYSTEMDS-350 Data Cleaning Framework + * 351 New builtin function for error correction by schemaOK + Others: * Break append instruction to cbind and rbind diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 7220198..4f20d87 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -83,6 +83,7 @@ public enum Builtins { DETECTSCHEMA("detectSchema", false), DIAG("diag", false), DISCOVER_FD("discoverFD", true), + DROP_INVALID("dropInvalid", false), EIGEN("eigen", false, ReturnType.MULTI_RETURN), EXISTS("exists", false), EXP("exp", false), diff --git a/src/main/java/org/apache/sysds/hops/BinaryOp.java b/src/main/java/org/apache/sysds/hops/BinaryOp.java index 769329b..8212b53 100644 --- a/src/main/java/org/apache/sysds/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysds/hops/BinaryOp.java @@ -894,7 +894,8 @@ public class BinaryOp extends MultiThreadedHop private static MMBinaryMethod optFindMMBinaryMethodSpark(Hop left, Hop right) { // TODO size information for tensor - if (left._dataType == DataType.TENSOR && right._dataType == DataType.TENSOR) + if ((left._dataType == DataType.TENSOR && right._dataType == DataType.TENSOR) + || (left._dataType == DataType.FRAME && right._dataType == DataType.FRAME)) return MMBinaryMethod.MR_BINARY_R; long m1_dim1 = left.getDim1(); long m1_dim2 = left.getDim2(); diff --git a/src/main/java/org/apache/sysds/hops/Hop.java b/src/main/java/org/apache/sysds/hops/Hop.java index 01c61c9..f64b5f0 100644 --- a/src/main/java/org/apache/sysds/hops/Hop.java +++ b/src/main/java/org/apache/sysds/hops/Hop.java @@ -1057,6 +1057,7 @@ public abstract class Hop implements ParseInfo LOG_NZ, //sparse-safe log; ppred(X,0,"!=")*log(X,0.5) MINUS1_MULT, //1-X*Y BITWAND, BITWOR, BITWXOR, BITWSHIFTL, BITWSHIFTR, //bitwise operations + DROP_INVALID, // frame operation for removing cells invalid 
wrt given data type } public static final HashMap HopsOpOp2LopsB; @@ -1088,6 +1089,7 @@ public abstract class Hop implements ParseInfo HopsOpOp2LopsB.put(OpOp2.BITWXOR, Binary.OperationTypes.BW_XOR); HopsOpOp2LopsB.put(OpOp2.BITWSHIFTL, Binary.OperationTypes.BW_SHIFTL); HopsOpOp2LopsB.put(Op
[systemml] branch master updated: [MINOR] Fix opcodes for lineage-based reuse (corrupted by rework)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 3fd8769 [MINOR] Fix opcodes for lineage-based reuse (corrupted by rework) 3fd8769 is described below commit 3fd87695591bdba30964db995066472d148b252e Author: Matthias Boehm AuthorDate: Fri Apr 10 18:39:39 2020 +0200 [MINOR] Fix opcodes for lineage-based reuse (corrupted by rework) --- src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java index 75a305a..e130cfa 100644 --- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java +++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java @@ -30,7 +30,7 @@ import java.util.ArrayList; public class LineageCacheConfig { private static final String[] REUSE_OPCODES = new String[] { - "tmm", "ba+*", "*", "/", "+", "nrow", "ncol", + "tsmm", "ba+*", "*", "/", "+", "nrow", "ncol", "rightIndex", "leftIndex", "groupedagg", "r'", "solve", "spoof" };
[systemml] branch master updated: [SYSTEMDS-314] New Python SystemDS context manager
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 0793a18 [SYSTEMDS-314] New Python SystemDS context manager 0793a18 is described below commit 0793a183cf874faaf1f7d143d6b4e64b48e35db9 Author: Kevin Innerebner AuthorDate: Fri Apr 10 17:53:34 2020 +0200 [SYSTEMDS-314] New Python SystemDS context manager Closes #874. --- docs/Tasks.txt | 1 + src/main/python/docs/source/matrix.rst | 42 +- src/main/python/docs/source/simple_examples.rst| 67 + src/main/python/systemds/__init__.py | 2 +- src/main/python/systemds/{ => context}/__init__.py | 4 +- .../python/systemds/context/systemds_context.py| 149 + src/main/python/systemds/matrix/matrix.py | 44 +++--- src/main/python/systemds/matrix/operation_node.py | 124 + src/main/python/systemds/script_building/dag.py| 36 +++-- src/main/python/systemds/script_building/script.py | 37 ++--- .../systemds/{__init__.py => utils/consts.py} | 9 +- src/main/python/systemds/utils/converters.py | 6 +- src/main/python/systemds/utils/helpers.py | 46 +-- src/main/python/tests/test_l2svm.py| 9 +- src/main/python/tests/test_l2svm_lineage.py| 20 +-- src/main/python/tests/test_lineagetrace.py | 23 ++-- src/main/python/tests/test_matrix_aggregations.py | 30 ++--- src/main/python/tests/test_matrix_binary_op.py | 32 ++--- 18 files changed, 446 insertions(+), 235 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index d19672f..d1168e6 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -231,6 +231,7 @@ SYSTEMDS-310 Python Bindings * 311 Initial Python Binding for federated execution OK * 312 Python 3.6 compatibility OK * 313 Python Documentation upload via Github Actions OK + * 314 Python SystemDS context managerOK SYSTEMDS-320 Merge SystemDS into Apache SystemML OK * 321 Merge histories of SystemDS and SystemML OK diff --git 
a/src/main/python/docs/source/matrix.rst b/src/main/python/docs/source/matrix.rst index f2f2fdc..dd88c7c 100644 --- a/src/main/python/docs/source/matrix.rst +++ b/src/main/python/docs/source/matrix.rst @@ -23,6 +23,39 @@ Matrix API == +SystemDSContext +--- + +All operations using SystemDS need a java instance running. +The connection is ensured by an ``SystemDSContext`` object. +An ``SystemDSContext`` object can be created using: + +.. code_block:: python + sysds = SystemDSContext() + +When the calculations are finished the context has to be closed again: + +.. code_block:: python + sysds.close() + +Since it is annoying that it is always necessary to close the context, ``SystemDSContext`` +implements the python context management protocol, which supports the following syntax: + +.. code_block:: python + with SystemDSContext() as sds: +# do something with sds which is an SystemDSContext +pass + +This will automatically close the ``SystemDSContext`` once the with-block is left. + +.. note:: + + Creating a context is an expensive procedure, because a sub-process starting a JVM might have to start, therefore + try to do this only once for your program, or always leave at least one context open. + +.. autoclass:: systemds.context.SystemDSContext + :members: + OperationNode - @@ -49,13 +82,12 @@ Matrix -- A ``Matrix`` is represented either by an ``OperationNode``, or the derived class ``Matrix``. -An Matrix can recognized it by checking the ``output_type`` of the object. +An Matrix can be recognized it by checking the ``output_type`` of the object. -Matrices are the most fundamental objects we operate on. -If one generate the matrix in SystemDS directly via a function call, -it can be used in an function which will generate an ``OperationNode`` e.g. ``federated``, ``full``, ``seq``. +Matrices are the most fundamental objects SystemDS operates on. -If we want to work on an numpy array we need to use the class ``Matrix``. 
+Although it is possible to generate matrices with the function calls or object construction specified below, +the recommended way is to use the methods defined on ``SystemDSContext``. .. autoclass:: systemds.matrix.Matrix :members: diff --git a/src/main/python/docs/source/simple_examples.rst b/src/main/python/docs/source/simple_examples.rst index b9c35c3..2175fd4 100644 --- a/src/main/python/docs/source/simple_examples.rst +++ b/src/main/python/docs/source/simple_examples.rst @@ -27,18 +27,24 @@ Let's take a look at some code examples. Matrix Oper
[systemml] branch master updated: [MINOR] Remove Travis Testing
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 4e0edec [MINOR] Remove Travis Testing 4e0edec is described below commit 4e0edec2d19fb28b59b830ac5dee479c8596041f Author: Sebastian AuthorDate: Fri Apr 10 17:16:53 2020 +0200 [MINOR] Remove Travis Testing The travis testing is removed since our testing is now executed using Github Actions The travis testing was only covering the component tests. Closes #884. --- .travis.yml | 53 - 1 file changed, 53 deletions(-) diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 82101be..000 --- a/.travis.yml +++ /dev/null @@ -1,53 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -dist: xenial - -language: java - -jdk: - - openjdk8 - -addons: -# apt: -#sources: -#- r-packages-trusty -#packages: -#- r-base-dev - -cache: - apt: true - directories: -# caching .m2 causes an error loading hadoop-yarn-common-2.6.0.jar. Not sure why. 
-#- ${HOME}/.m2 -#- ${HOME}/R -#- /usr/local/lib/R/site-library - -install: -# - sudo Rscript ./src/test/scripts/installDependencies.R - -before_script: -# this is not needed anymore since adding authentication object in code for running hadoop/spark local -# - chmod -R 755 * - -script: - # - mvn clean verify jacoco:report coveralls:report - - mvn test-compile - - mvn surefire:test -Dtest=org.apache.sysds.test.component.** - -after_success: -# - mvn test jacoco:report coveralls:report \ No newline at end of file
[systemml] branch master updated: [SYSTEMDS-331, 332] Fix robustness lineage cache (deadlocks, correctness)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new f22e999 [SYSTEMDS-331,332] Fix robustness lineage cache (deadlocks, correctness) f22e999 is described below commit f22e9991e2370dc30a1fed01c3142c27071da42c Author: Matthias Boehm AuthorDate: Fri Apr 10 16:28:39 2020 +0200 [SYSTEMDS-331,332] Fix robustness lineage cache (deadlocks, correctness) This patch fixes the robustness of lineage-based caching, especially in multi-threaded parfor programs. This includes: 1) Deadlock prevention: With multi-level caching, the placeholders that prevent concurrent computation of redundant intermediates led to deadlocks because the following threads blocked inside the critical region and thus any caching of the thread that was producing the intermediate (via a complex DAG of operations) was blocked. 2) Deadlock wrong Data Types: With the introduction of scalar caching each thread had to decide to either pull a scalar or matrix on the placeholders. Since this decision was made based on the data item (which might not be available yet in parfor) threads were blocking on the wrong type and thus again producing deadlocks. 3) Correctness: The loop iteration variable of parfor was not integrated yet with lineage tracing leading to incorrect reuse for different parfor iterations that depended on the iteration variable. Furthermore, this patch also cleans up an unnecessarily wide public API of the lineage cache in order to facilitate a correct internal implementation. However, there are still a number of remaining issues, e.g., with the computation of compensation plans and probing logic. 
--- docs/Tasks.txt | 2 +- .../org/apache/sysds/parser/StatementBlock.java| 43 ++- .../runtime/controlprogram/BasicProgramBlock.java | 16 +- .../runtime/controlprogram/parfor/ParWorker.java | 42 ++- .../instructions/cp/FunctionCallCPInstruction.java | 8 +- .../apache/sysds/runtime/lineage/LineageCache.java | 388 +++-- .../sysds/runtime/lineage/LineageCacheConfig.java | 30 ++ .../sysds/runtime/lineage/LineageRewriteReuse.java | 54 +-- .../functions/lineage/FunctionFullReuseTest.java | 42 ++- .../functions/lineage/FunctionFullReuse6.dml | 37 ++ .../functions/lineage/FunctionFullReuse7.dml | 37 ++ 11 files changed, 412 insertions(+), 287 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 42741da..d19672f 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -239,7 +239,7 @@ SYSTEMDS-320 Merge SystemDS into Apache SystemML OK SYSTEMDS-330 Lineage Tracing, Reuse and Integration * 331 Cache and reuse scalar outputs (instruction and multi-level) OK - * 332 Parfor integration with multi-level reuse + * 332 Parfor integration with multi-level reuse OK * 333 Use exact execution time for cost based eviction SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse diff --git a/src/main/java/org/apache/sysds/parser/StatementBlock.java b/src/main/java/org/apache/sysds/parser/StatementBlock.java index 2e87909..5991315 100644 --- a/src/main/java/org/apache/sysds/parser/StatementBlock.java +++ b/src/main/java/org/apache/sysds/parser/StatementBlock.java @@ -43,12 +43,12 @@ import org.apache.sysds.utils.MLContextProxy; public class StatementBlock extends LiveVariableAnalysis implements ParseInfo { - protected static final Log LOG = LogFactory.getLog(StatementBlock.class.getName()); protected static IDSequence _seq = new IDSequence(); private static IDSequence _seqSBID = new IDSequence(); protected final long _ID; - + protected final String _name; + protected DMLProgram _dmlProg; protected ArrayList _statements; ArrayList _hops = null; @@ -62,6 +62,7 @@ public class 
StatementBlock extends LiveVariableAnalysis implements ParseInfo public StatementBlock() { _ID = getNextSBID(); + _name = "SB"+_ID; _dmlProg = null; _statements = new ArrayList<>(); _read = new VariableSet(); @@ -96,6 +97,10 @@ public class StatementBlock extends LiveVariableAnalysis implements ParseInfo public long getSBID() { return _ID; } + + public String getName() { + return _name; + } public void addStatement(Statement s) { _statements.add(s); @@ -399,8 +404,9 @@ public class StatementBlock extends LiveVariableAnalysis implements ParseInfo return inputs; } - public ArrayList getOutputsofSB() { - ArrayList ou
[systemml] branch master updated: [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new a3c0cce [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script a3c0cce is described below commit a3c0cce761c855b034302e1f0871d68d8eccd089 Author: Nathan Kan AuthorDate: Thu Apr 9 19:55:39 2020 +0200 [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script This fix backports the fix from #866 into the merged SystemDS code line. Closes #867. --- scripts/nn/examples/mnist_lenet.dml | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/nn/examples/mnist_lenet.dml b/scripts/nn/examples/mnist_lenet.dml index 57b8ba6..a882501 100644 --- a/scripts/nn/examples/mnist_lenet.dml +++ b/scripts/nn/examples/mnist_lenet.dml @@ -118,13 +118,13 @@ train = function(matrix[double] X, matrix[double] Y, stride, stride, pad, pad) outr1 = relu::forward(outc1) [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) ## layer 2: conv2 -> relu2 -> pool2 [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) outr2 = relu::forward(outc2) [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) ## layer 3: affine3 -> relu3 -> dropout outa3 = affine::forward(outp2, W3, b3) outr3 = relu::forward(outa3) @@ -166,13 +166,13 @@ train = function(matrix[double] X, matrix[double] Y, [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3) ## layer 2: conv2 -> relu2 -> pool2 doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) doutc2 = 
relu::backward(doutr2, outc2) [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) ## layer 1: conv1 -> relu1 -> pool1 doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) doutc1 = relu::backward(doutr1, outc1) [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) @@ -264,13 +264,13 @@ predict = function(matrix[double] X, int C, int Hin, int Win, pad, pad) outr1 = relu::forward(outc1) [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, - strideh=2, stridew=2, pad=0, pad=0) + strideh=2, stridew=2, padh=0, padw=0) ## layer 2: conv2 -> relu2 -> pool2 [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) outr2 = relu::forward(outc2) [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, - strideh=2, stridew=2, pad=0, pad=0) + strideh=2, stridew=2, padh=0, padw=0) ## layer 3: affine3 -> relu3 outa3 = affine::forward(outp2, W3, b3) outr3 = relu::forward(outa3)
[systemml] branch master updated: [MINOR] Extended JMLC API (handling of pinned variables)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 39c5654 [MINOR] Extended JMLC API (handling of pinned variables) 39c5654 is described below commit 39c56541ca83ea36093384220d19a31b5578537e Author: Anthony Thomas AuthorDate: Thu Apr 9 19:29:55 2020 +0200 [MINOR] Extended JMLC API (handling of pinned variables) Closes #835. --- .../java/org/apache/sysds/api/jmlc/PreparedScript.java | 17 + .../sysds/runtime/controlprogram/LocalVariableMap.java | 4 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java b/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java index 04101f2..3175fcb 100644 --- a/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java +++ b/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java @@ -72,7 +72,7 @@ public class PreparedScript implements ConfigurableAPI //input/output specification private final HashSet _inVarnames; private final HashSet _outVarnames; - private final HashMap _inVarReuse; + private final LocalVariableMap _inVarReuse; //internal state (reused) private final Program _prog; @@ -91,7 +91,7 @@ public class PreparedScript implements ConfigurableAPI _vars.setRegisteredOutputs(that._outVarnames); _inVarnames = that._inVarnames; _outVarnames = that._outVarnames; - _inVarReuse = new HashMap<>(that._inVarReuse); + _inVarReuse = new LocalVariableMap(that._inVarReuse); _dmlconf = that._dmlconf; _cconf = that._cconf; } @@ -115,7 +115,7 @@ public class PreparedScript implements ConfigurableAPI Collections.addAll(_inVarnames, inputs); _outVarnames = new HashSet<>(); Collections.addAll(_outVarnames, outputs); - _inVarReuse = new HashMap<>(); + _inVarReuse = new LocalVariableMap(); //attach registered outputs (for dynamic recompile) 
_vars.setRegisteredOutputs(_outVarnames); @@ -415,7 +415,16 @@ public class PreparedScript implements ConfigurableAPI public void clearParameters() { _vars.removeAll(); } - + + /** +* Remove all references to pinned variables from this script. +* Note: this *does not* remove the underlying data. It merely +* removes a reference to it from this prepared script. This is +* useful if you want to maintain an independent cache of weights +* and allow the JVM to garbage collect under memory pressure. +*/ + public void clearPinnedData() { _inVarReuse.removeAll(); } + /** * Executes the prepared script over the bound inputs, creating the * result variables according to bound and registered outputs. diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java b/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java index 92eabbd..1ac47b7 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java @@ -94,6 +94,10 @@ public class LocalVariableMap implements Cloneable localMap.putAll(vals); } + public void putAll(LocalVariableMap vars) { + putAll(vars.localMap); + } + public Data remove( String name ) { return localMap.remove( name ); }
[systemml] branch master updated: [MINOR] Fix unnecessarily detailed test output in tests/functions/misc
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 395b5d0 [MINOR] Fix unnecessarily detailed test output in tests/functions/misc 395b5d0 is described below commit 395b5d08b75ac0cd71421ac83f7792ff02e2086a Author: Matthias Boehm AuthorDate: Thu Apr 9 19:25:37 2020 +0200 [MINOR] Fix unnecessarily detailed test output in tests/functions/misc --- .../functions/misc/ConditionalValidateTest.java| 25 +++--- .../test/functions/misc/ExistsVariableTest.java| 2 +- .../functions/misc/FunctionInExpressionTest.java | 2 +- .../test/functions/misc/FunctionInliningTest.java | 2 +- .../test/functions/misc/FunctionNotFoundTest.java | 2 +- ...nstantFoldingScalarVariablePropagationTest.java | 2 +- .../test/functions/misc/IPANnzPropagationTest.java | 2 +- .../test/functions/misc/ListAndStructTest.java | 2 +- .../sysds/test/functions/misc/PrintMatrixTest.java | 8 ++- .../misc/RemoveUnnecessaryCTableTest.java | 2 +- .../test/functions/misc/RewriteListTsmmCVTest.java | 2 +- .../misc/RewriteSlicedMatrixMultTest.java | 4 ++-- .../test/functions/misc/SizePropagationTest.java | 2 +- .../functions/misc/ZeroRowsColsMatrixTest.java | 2 +- 14 files changed, 27 insertions(+), 32 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java b/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java index 7a54652..71282a7 100644 --- a/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java +++ b/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java @@ -105,14 +105,14 @@ public class ConditionalValidateTest extends AutomatedTestBase String TEST_NAME = testName; try - { + { TestConfiguration config = getTestConfiguration(TEST_NAME); loadTestConfiguration(config); - String HOME = SCRIPT_DIR + 
TEST_DIR; - String input = input("Y"); + String HOME = SCRIPT_DIR + TEST_DIR; + String input = input("Y"); - fullDMLScriptName = HOME + TEST_NAME + ".dml"; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; programArgs = new String[]{"-args", input }; //write input @@ -124,16 +124,15 @@ public class ConditionalValidateTest extends AutomatedTestBase HDFSTool.writeMetaDataFile(input+(fileExists?"":"b")+".mtd", ValueType.FP64, mc, OutputInfo.TextCellOutputInfo); //run tests - runTest(true, exceptionExpected, DMLException.class, -1); - - //cleanup - HDFSTool.deleteFileIfExistOnHDFS(input); - HDFSTool.deleteFileIfExistOnHDFS(input+"b"); - HDFSTool.deleteFileIfExistOnHDFS(input+".mtd"); - HDFSTool.deleteFileIfExistOnHDFS(input+"b.mtd"); + runTest(true, exceptionExpected, DMLException.class, -1); + + //cleanup + HDFSTool.deleteFileIfExistOnHDFS(input); + HDFSTool.deleteFileIfExistOnHDFS(input+"b"); + HDFSTool.deleteFileIfExistOnHDFS(input+".mtd"); + HDFSTool.deleteFileIfExistOnHDFS(input+"b.mtd"); } - catch(Exception ex) - { + catch(Exception ex) { throw new RuntimeException(ex); } } diff --git a/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java b/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java index 8036dd9..ff32dbe 100644 --- a/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java +++ b/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java @@ -68,7 +68,7 @@ public class ExistsVariableTest extends AutomatedTestBase String HOME = SCRIPT_DIR + TEST_DIR; String param = pos ? "1" : "0"; fullDMLScriptName = HOME + testName + ".dml"; - programArgs = new String[]{"-explain", "-stats", "-args", param, output("R") }; + programArgs = new String[]{"-stats", "-args", param, output("R") };
[systemml] branch master updated: [SYSTEMDS-331] Extended lineage-based reuse (caching of scalars)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 6c94556 [SYSTEMDS-331] Extended lineage-based reuse (caching of scalars) 6c94556 is described below commit 6c9455678b2c38a41db741270a86812b05ee77ca Author: arnabp AuthorDate: Thu Apr 9 17:45:12 2020 +0200 [SYSTEMDS-331] Extended lineage-based reuse (caching of scalars) - This patch contains lineage caching support for scalar objects. This enables instruction level and multi-level reuse of operations/functions/statementblocks producing at least one scalar output. This patch improves multi-level cache hits. - Furthermore, this adds a new option `-reuse_multilevel` to enable multi-level reuse. - This patch also fixes few bugs and enhances reusable instructions list. - Additional fix for lineage cache reset to avoid endless loops on eviction in sequences of tests Closes #876. 
--- docs/Tasks.txt | 12 + src/main/java/org/apache/sysds/api/DMLOptions.java | 2 + .../runtime/controlprogram/BasicProgramBlock.java | 5 +- .../runtime/instructions/cp/BooleanObject.java | 5 + .../runtime/instructions/cp/DoubleObject.java | 5 + .../sysds/runtime/instructions/cp/IntObject.java | 5 + .../runtime/instructions/cp/ScalarObject.java | 3 + .../runtime/instructions/cp/StringObject.java | 5 + .../org/apache/sysds/runtime/lineage/Lineage.java | 1 + .../apache/sysds/runtime/lineage/LineageCache.java | 255 ++--- .../sysds/runtime/lineage/LineageCacheConfig.java | 8 +- .../runtime/lineage/LineageCacheStatistics.java| 4 +- .../sysds/runtime/lineage/LineageCodegenItem.java | 4 + .../apache/sysds/runtime/lineage/LineageMap.java | 8 +- .../sysds/runtime/lineage/LineageParser.java | 6 +- .../sysds/runtime/lineage/LineageRewriteReuse.java | 77 ++- .../sysds/runtime/lineage/LineageTokenizer.java| 2 +- .../test/functions/lineage/FullReuseTest.java | 7 + .../functions/lineage/FunctionFullReuseTest.java | 2 +- .../test/functions/lineage/SBFullReuseTest.java| 2 +- src/test/scripts/functions/lineage/FullReuse4.dml | 34 +++ 21 files changed, 296 insertions(+), 156 deletions(-) diff --git a/docs/Tasks.txt b/docs/Tasks.txt index 5fd6749..42741da 100644 --- a/docs/Tasks.txt +++ b/docs/Tasks.txt @@ -236,6 +236,18 @@ SYSTEMDS-320 Merge SystemDS into Apache SystemML OK * 321 Merge histories of SystemDS and SystemML OK * 322 Change global package namesOK * 323 Fix licenses and notice file OK + +SYSTEMDS-330 Lineage Tracing, Reuse and Integration + * 331 Cache and reuse scalar outputs (instruction and multi-level) OK + * 332 Parfor integration with multi-level reuse + * 333 Use exact execution time for cost based eviction + +SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse + * 341 Finalize unmarking of loop dependent operations + * 342 Mark functions as last-use to enable early eviction + * 343 Identify equal last level HOPs to ensure SB-level reuse + * 344 Unmark 
functions/SBs containing non-determinism for caching + * 345 Compiler assisted cache configuration Others: * Break append instruction to cbind and rbind diff --git a/src/main/java/org/apache/sysds/api/DMLOptions.java b/src/main/java/org/apache/sysds/api/DMLOptions.java index 7eca8ab..b9972a3 100644 --- a/src/main/java/org/apache/sysds/api/DMLOptions.java +++ b/src/main/java/org/apache/sysds/api/DMLOptions.java @@ -120,6 +120,8 @@ public class DMLOptions { dmlOptions.linReuseType = ReuseCacheType.REUSE_FULL; else if (lineageType.equalsIgnoreCase("reuse_partial")) dmlOptions.linReuseType = ReuseCacheType.REUSE_PARTIAL; + else if (lineageType.equalsIgnoreCase("reuse_multilevel")) + dmlOptions.linReuseType = ReuseCacheType.REUSE_MULTILEVEL; else if (lineageType.equalsIgnoreCase("reuse_hybrid")) dmlOptions.linReuseType = ReuseCacheType.REUSE_HYBRID; else if (lineageType.equalsIgnoreCase("none")) diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java index 2a9e281..1f52a75 100644 --- a/src/main/java
[systemml] branch master updated: [MINOR] Fix typo internal builtin function names (sigmoid)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 809c025 [MINOR] Fix typo internal builtin function names (sigmoid) 809c025 is described below commit 809c02580570e136e2d150400abf184cbff01a74 Author: Matthias Boehm AuthorDate: Wed Apr 1 19:18:42 2020 +0200 [MINOR] Fix typo internal builtin function names (sigmoid) --- src/main/java/org/apache/sysds/common/Builtins.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index d260d35..7220198 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -154,7 +154,7 @@ public enum Builtins { SAMPLE("sample", false), SD("sd", false), SEQ("seq", false), - SIGMOD("sigmoid", true), // 1 / (1 + exp(-X)) + SIGMOID("sigmoid", true), // 1 / (1 + exp(-X)) SIGN("sign", false), SIN("sin", false), SINH("sinh", false),
[systemml] branch master updated: [MINOR] Updated readme w/ correct travis badge
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new ef8b551 [MINOR] Updated readme w/ correct travis badge ef8b551 is described below commit ef8b551103085b73d0b471db9df6c5c0748f7d94 Author: Sebastian AuthorDate: Sat Mar 28 20:27:26 2020 +0100 [MINOR] Updated readme w/ correct travis badge Closes #870. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee23ed6..1ccb7b4 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ limitations under the License. ## Status -[![Build Status](https://travis-ci.com/apache/systemml.svg?branch=master)](https://travis-ci.com/apache/systemml) +[![Build Status](https://travis-ci.org/apache/systemml.svg?branch=master)](https://travis-ci.org/apache/systemml) [![License](https://img.shields.io/badge/License-Apache%202.0-gre.svg)](https://opensource.org/licenses/Apache-2.0) ![Build](https://github.com/apache/systemml/workflows/Build/badge.svg)
[systemml] branch master updated: [SYSTEMDS-301] Improved github workflows (cache dependencies)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new dab0991 [SYSTEMDS-301] Improved github workflows (cache dependencies) dab0991 is described below commit dab09916436c9518afa3cf8da572db2bde32207a Author: Sebastian AuthorDate: Sat Mar 28 20:22:12 2020 +0100 [SYSTEMDS-301] Improved github workflows (cache dependencies) Closes #869. --- .github/workflows/applicationTests.yml | 3 ++- .github/workflows/build.yml| 16 .github/workflows/componentTests.yml | 16 .github/workflows/documentation.yml| 28 +++- .github/workflows/functionsTests.yml | 3 ++- .github/workflows/python.yml | 20 ++-- 6 files changed, 69 insertions(+), 17 deletions(-) diff --git a/.github/workflows/applicationTests.yml b/.github/workflows/applicationTests.yml index e4efc2c..652b31a 100644 --- a/.github/workflows/applicationTests.yml +++ b/.github/workflows/applicationTests.yml @@ -39,7 +39,8 @@ jobs: os: [ubuntu-latest] name: Ap Test ${{ matrix.tests }} steps: -- uses: actions/checkout@v2 +- name: Checkout Repository + uses: actions/checkout@v2 - name: Run all tests starting with "${{ matrix.tests }}" uses: ./.github/action/ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c449040..0ae7f82 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,14 +30,22 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macOS-latest, windows-latest] - steps: -- uses: actions/checkout@v2 +- name: Checkout Repository + uses: actions/checkout@v2 -- name: Set up JDK 1.8 +- name: Setup Java 1.8 uses: actions/setup-java@v1 with: java-version: 1.8 -- name: Build with Maven +- name: Cache Maven Dependencies + uses: actions/cache@v1 + with: +path: ~/.m2/repository +key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} +restore-keys: | + ${{ runner.os }}-maven- + +- name: Build 
run: mvn package diff --git a/.github/workflows/componentTests.yml b/.github/workflows/componentTests.yml index 838b662..0cc934c 100644 --- a/.github/workflows/componentTests.yml +++ b/.github/workflows/componentTests.yml @@ -30,15 +30,23 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] -java: [ 1.8 ] name: Component Tests ${{ matrix.os }} steps: -- uses: actions/checkout@v2 +- name: Checkout Repository + uses: actions/checkout@v2 -- name: Setup Java +- name: Setup Java 1.8 uses: actions/setup-java@v1 with: -java-version: ${{ matrix.java }} +java-version: 1.8 + +- name: Cache Maven Dependencies + uses: actions/cache@v1 + with: +path: ~/.m2/repository +key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} +restore-keys: | + ${{ runner.os }}-maven- - name: Maven clean compile & test-compile run: mvn clean compile test-compile diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 96e4881..201210f 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -31,13 +31,22 @@ jobs: runs-on: ubuntu-latest name: Documentation Java steps: -- uses: actions/checkout@v2 +- name: Checkout Repository + uses: actions/checkout@v2 -- name: Setup Java +- name: Setup Java 1.8 uses: actions/setup-java@v1 with: -java-version: 1.8 +java-version: 1.8 +- name: Cache Maven Dependencies + uses: actions/cache@v1 + with: +path: ~/.m2/repository +key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} +restore-keys: | + ${{ runner.os }}-maven- + - name: Make Documentation SystemDS Java run: mvn -P distribution package @@ -51,14 +60,23 @@ jobs: runs-on: ubuntu-latest name: Documentation Python steps: -- uses: actions/checkout@v2 +- name: Checkout Repository + uses: actions/checkout@v2 - name: Setup Python uses: actions/setup-python@v1 with: python-version: 3.7 architecture: 'x64' - + +- name: Cache Pip Dependencies + uses: actions/cache@v1 + with: +path: ~/.cache/pip +key: ${{ runner.os }}-pip-docs-${{ 
hashFiles('src/main/python/docs/requires-docs.txt') }} +restore-keys: | + ${{ runner.os }}-pip-docs- + - name: Install Dependencies run: | cd src/main/python/docs diff --git a/.github/workflows/functionsTests.yml b/.gi
[systemml] branch master updated: [MINOR] Fix mlcontext function tests (wrong url)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 2cc922c [MINOR] Fix mlcontext function tests (wrong url) 2cc922c is described below commit 2cc922c856967d75afac09f80fd4df73a620584a Author: Matthias Boehm AuthorDate: Sat Mar 28 00:28:46 2020 +0100 [MINOR] Fix mlcontext function tests (wrong url) --- .../java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java index fbc90c1..c2add4d 100644 --- a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java +++ b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java @@ -158,7 +158,7 @@ public class MLContextTest extends MLContextTestBase { @Test public void testCreateDMLScriptBasedOnURL() throws MalformedURLException { System.out.println("MLContextTest - create DML script based on URL"); - String urlString = "https://raw.githubusercontent.com/apache/systemml/systemds/master/src/test/scripts/applications/hits/HITS.dml"; + String urlString = "https://raw.githubusercontent.com/apache/systemml/master/src/test/scripts/applications/hits/HITS.dml"; URL url = new URL(urlString); Script script = dmlFromUrl(url); String expectedContent = "Licensed to the Apache Software Foundation";
[systemml] branch master updated: [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new e2b9858 [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script e2b9858 is described below commit e2b985807c485b3c3f1b63e2926a2f5478441641 Author: Nathan Kan AuthorDate: Sun Mar 1 22:26:31 2020 +0100 [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script Closes #866. --- scripts/nn/examples/mnist_lenet.dml | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/nn/examples/mnist_lenet.dml b/scripts/nn/examples/mnist_lenet.dml index 57b8ba6..484219d 100644 --- a/scripts/nn/examples/mnist_lenet.dml +++ b/scripts/nn/examples/mnist_lenet.dml @@ -118,13 +118,13 @@ train = function(matrix[double] X, matrix[double] Y, stride, stride, pad, pad) outr1 = relu::forward(outc1) [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) ## layer 2: conv2 -> relu2 -> pool2 [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) outr2 = relu::forward(outc2) [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) ## layer 3: affine3 -> relu3 -> dropout outa3 = affine::forward(outp2, W3, b3) outr3 = relu::forward(outa3) @@ -166,13 +166,13 @@ train = function(matrix[double] X, matrix[double] Y, [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3) ## layer 2: conv2 -> relu2 -> pool2 doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) doutc2 = relu::backward(doutr2, outc2) [doutp1, dW2, db2] = conv2d::backward(doutc2, 
Houtc2, Woutc2, outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) ## layer 1: conv1 -> relu1 -> pool1 doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, -strideh=2, stridew=2, pad=0, pad=0) +strideh=2, stridew=2, padh=0, padw=0) doutc1 = relu::backward(doutr1, outc1) [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) @@ -264,13 +264,13 @@ predict = function(matrix[double] X, int C, int Hin, int Win, pad, pad) outr1 = relu::forward(outc1) [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, - strideh=2, stridew=2, pad=0, pad=0) + strideh=2, stridew=2, padh=0, padw=0) ## layer 2: conv2 -> relu2 -> pool2 [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad) outr2 = relu::forward(outc2) [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, - strideh=2, stridew=2, pad=0, pad=0) + strideh=2, stridew=2, padh=0, padw=0) ## layer 3: affine3 -> relu3 outa3 = affine::forward(outp2, W3, b3) outr3 = relu::forward(outa3) @@ -328,4 +328,3 @@ generate_dummy_data = function() classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform")) Y = table(seq(1, N), classes) # one-hot encoding } -
[systemml] branch master updated: [SYSTEMML-2530] Fix wrong integer casting for negative numbers
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 4422a05 [SYSTEMML-2530] Fix wrong integer casting for negative numbers 4422a05 is described below commit 4422a05325b03e0b656302774504ca9763e72c2a Author: Matthias Boehm AuthorDate: Fri Aug 9 16:20:18 2019 +0200 [SYSTEMML-2530] Fix wrong integer casting for negative numbers This patch backports SYSTEMDS-106 as it resolves an issue of incorrect results that are so subtle that they might go unnoticed. --- src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java index 42b519b..f6c1182 100644 --- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java @@ -323,11 +323,13 @@ public class UtilFunctions } public static int toInt( double val ) { - return (int) Math.floor( val + DOUBLE_EPS ); + return (int) (Math.signum(val) + * Math.floor(Math.abs(val) + DOUBLE_EPS)); } public static long toLong( double val ) { - return (long) Math.floor( val + DOUBLE_EPS ); + return (long) (Math.signum(val) + * Math.floor(Math.abs(val) + DOUBLE_EPS)); } public static int toInt(Object obj) {
[systemml] 02/02: [SYSTEMML-2521] New rewrite for sparsity-aware matrix product chains
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git commit f42dfb358ac24b6633d01dd181b51d458cd1bbe7 Author: Matthias Boehm AuthorDate: Sun Mar 17 20:23:33 2019 +0100 [SYSTEMML-2521] New rewrite for sparsity-aware matrix product chains This patch introduces a new dynamic rewrite for sparsity-aware matrix multiplication chain optimization. For estimating the sparsity of intermediates, we use the existing MNC sparsity estimator. While this rewrite does find the optimal plan in case of perfect estimates, it currently requires access to all input matrices of the mm chain and these inputs need to fit into CP memory. Accordingly, this rewrite is still disabled by default. --- .../sysml/hops/estim/EstimatorMatrixHistogram.java | 2 +- .../sysml/hops/rewrite/ProgramRewriteStatus.java | 16 ++- .../RewriteMatrixMultChainOptimization.java| 88 ++-- .../RewriteMatrixMultChainOptimizationSparse.java | 157 + 4 files changed, 215 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java index 5f1abff..b079a7e 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java @@ -59,7 +59,7 @@ public class EstimatorMatrixHistogram extends SparsityEstimator return estim(root, true); } - private MatrixCharacteristics estim(MMNode root, boolean topLevel) { + public MatrixCharacteristics estim(MMNode root, boolean topLevel) { //NOTE: not estimateInputs due to handling of topLevel MatrixHistogram h1 = getCachedSynopsis(root.getLeft()); MatrixHistogram h2 = getCachedSynopsis(root.getRight()); diff --git a/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java b/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java 
index 552a598..a622948 100644 --- a/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java +++ b/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java @@ -19,9 +19,10 @@ package org.apache.sysml.hops.rewrite; +import org.apache.sysml.runtime.controlprogram.LocalVariableMap; + public class ProgramRewriteStatus { - //status of applied rewrites private boolean _rmBranches = false; //removed branches private int _blkSize = -1; @@ -29,14 +30,19 @@ public class ProgramRewriteStatus //current context private boolean _inParforCtx = false; + private LocalVariableMap _vars = null; - public ProgramRewriteStatus() - { + public ProgramRewriteStatus() { _rmBranches = false; _inParforCtx = false; _injectCheckpoints = false; } + public ProgramRewriteStatus(LocalVariableMap vars) { + this(); + _vars = vars; + } + public void setRemovedBranches(){ _rmBranches = true; } @@ -68,4 +74,8 @@ public class ProgramRewriteStatus public boolean getInjectedCheckpoints(){ return _injectCheckpoints; } + + public LocalVariableMap getVariables() { + return _vars; + } } diff --git a/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java b/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java index 91033c4..cdb1e12 100644 --- a/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java +++ b/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java @@ -35,14 +35,16 @@ import org.apache.sysml.utils.Explain; /** * Rule: Determine the optimal order of execution for a chain of - * matrix multiplications Solution: Classic Dynamic Programming - * Approach Currently, the approach based only on matrix dimensions + * matrix multiplications + * + * Solution: Classic Dynamic Programming + * Approach: Currently, the approach based only on matrix dimensions * Goal: To reduce the number of computations in the run-time * (map-reduce) layer */ public class 
RewriteMatrixMultChainOptimization extends HopRewriteRule { - private static final Log LOG = LogFactory.getLog(RewriteMatrixMultChainOptimization.class.getName()); + protected static final Log LOG = LogFactory.getLog(RewriteMatrixMultChainOptimization.class.getName()); private static final boolean LDEBUG = false; static { @@ -61,7 +63,7 @@ public class RewriteMatrixMultChainOptimization extends HopRewriteRule // Find the optimal order
[systemml] branch master updated (881f606 -> f42dfb3)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git. from 881f606 [MINOR] Provide a more informative error message when the dimensions don't match during the validate phase new 4a38a47 [MINOR] Fix unnecessary warnings (unnecessary imports) new f42dfb3 [SYSTEMML-2521] New rewrite for sparsity-aware matrix product chains The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: .../org/apache/sysml/api/ScriptExecutorUtils.java | 1 - .../apache/sysml/api/mlcontext/ScriptExecutor.java | 1 - .../java/org/apache/sysml/hops/FunctionOp.java | 1 - .../sysml/hops/estim/EstimatorMatrixHistogram.java | 2 +- .../sysml/hops/rewrite/ProgramRewriteStatus.java | 16 ++- .../RewriteMatrixMultChainOptimization.java| 88 ++-- .../RewriteMatrixMultChainOptimizationSparse.java | 157 + .../java/org/apache/sysml/utils/Statistics.java| 1 - .../org/apache/sysml/test/gpu/LstmCPUTest.java | 2 - .../functions/unary/matrix/AbsTest.java| 2 - .../functions/unary/matrix/NegationTest.java | 2 - .../functions/unary/matrix/SinTest.java| 2 - .../functions/unary/matrix/TanTest.java| 2 - 13 files changed, 215 insertions(+), 62 deletions(-) create mode 100644 src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimizationSparse.java
[systemml] 01/02: [MINOR] Fix unnecessary warnings (unnecessary imports)
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git commit 4a38a4789302741965f49b4dd559a7078d94eb69 Author: Matthias Boehm AuthorDate: Sun Mar 17 12:09:33 2019 +0100 [MINOR] Fix unnecessary warnings (unnecessary imports) --- src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java | 1 - src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java| 1 - src/main/java/org/apache/sysml/hops/FunctionOp.java | 1 - src/main/java/org/apache/sysml/utils/Statistics.java| 1 - src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java| 2 -- .../apache/sysml/test/integration/functions/unary/matrix/AbsTest.java | 2 -- .../sysml/test/integration/functions/unary/matrix/NegationTest.java | 2 -- .../apache/sysml/test/integration/functions/unary/matrix/SinTest.java | 2 -- .../apache/sysml/test/integration/functions/unary/matrix/TanTest.java | 2 -- 9 files changed, 14 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index 0d072e5..c9d1a5d 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -19,7 +19,6 @@ package org.apache.sysml.api; -import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java index 7bda306..8ecd962 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java @@ -38,7 +38,6 @@ import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.conf.DMLOptions; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.parser.DMLProgram; -import 
org.apache.sysml.parser.DMLTranslator; import org.apache.sysml.parser.ParseException; import org.apache.sysml.parser.ParserFactory; import org.apache.sysml.parser.ParserWrapper; diff --git a/src/main/java/org/apache/sysml/hops/FunctionOp.java b/src/main/java/org/apache/sysml/hops/FunctionOp.java index dedbad6..534c0a0 100644 --- a/src/main/java/org/apache/sysml/hops/FunctionOp.java +++ b/src/main/java/org/apache/sysml/hops/FunctionOp.java @@ -22,7 +22,6 @@ package org.apache.sysml.hops; import java.util.ArrayList; import java.util.List; -import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.lops.FunctionCallCP; import org.apache.sysml.lops.FunctionCallCPSingle; import org.apache.sysml.lops.Lop; diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java index 656de32..a2afae0 100644 --- a/src/main/java/org/apache/sysml/utils/Statistics.java +++ b/src/main/java/org/apache/sysml/utils/Statistics.java @@ -32,7 +32,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.DoubleAdder; import java.util.concurrent.atomic.LongAdder; -import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.OptimizerUtils; diff --git a/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java b/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java index 5c93bca..4c4ab74 100644 --- a/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java +++ b/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java @@ -23,8 +23,6 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; -import org.apache.sysml.runtime.instructions.gpu.DnnGPUInstruction; -import org.apache.sysml.runtime.instructions.gpu.DnnGPUInstruction.LstmOperator; import org.apache.sysml.test.utils.TestUtils; import org.junit.Test; diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java b/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java index a3027d6..6b61066 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java @@ -20,8 +20,6 @@ package org.apache.sysml.test.integration.functions.unary.matrix; import org.junit.Test; -import org.apache.sysml.api.DMLScript; -import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; import org.apache.sysml.test.integration.AutomatedTestBase; import org.apache.sysml.test.integration.TestConfiguration; diff --git a/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/NegationTest.java b/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/NegationTest.java index c2613c2..6b2000a 100644
[systemml] branch master updated: [SYSTEMML-2511] Fix bitset sparsity estimation on large input data
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new fe83cad [SYSTEMML-2511] Fix bitset sparsity estimation on large input data fe83cad is described below commit fe83cad3e13d049eacea19662b1a4e3b1704cb6d Author: Matthias Boehm AuthorDate: Tue Feb 19 15:08:08 2019 +0100 [SYSTEMML-2511] Fix bitset sparsity estimation on large input data This patch fixes a corruption introduced by previous refactoring that led to always allocating a BitsetMatrix1 (w/ linearized long array) independent of the input size, leading to incorrect sketches and class cast exceptions on subsequent estimation. --- src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java index e26dd49..cf9f627 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java @@ -88,7 +88,7 @@ public class EstimatorBitsetMM extends SparsityEstimator return null; //ensure synopsis is properly cached and reused if( node.isLeaf() && node.getSynopsis() == null ) - node.setSynopsis(new BitsetMatrix1(node.getData())); + node.setSynopsis(createBitset(node.getData())); else if( !node.isLeaf() ) estim(node); //recursively obtain synopsis return (BitsetMatrix) node.getSynopsis();
[systemml] branch master updated: [SYSTEMML-2468] Extended MNC exact propagation of sketch counts
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 01da6a6 [SYSTEMML-2468] Extended MNC exact propagation of sketch counts 01da6a6 is described below commit 01da6a6fef8bbfe68734da64604943547f45ae79 Author: Matthias Boehm AuthorDate: Tue Feb 19 14:13:04 2019 +0100 [SYSTEMML-2468] Extended MNC exact propagation of sketch counts This patch extends the MNC sketch propagation by a special case, where we can exactly infer (in an inexpensive manner) the output column count histogram, if the lhs has one non-zero per row and the rhs is leaf node and sparse. However, initial experiments were not fully conclusive and hence, this is still disabled. --- .../sysml/hops/estim/EstimatorMatrixHistogram.java | 30 +++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java index a82feed..5f1abff 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java @@ -42,6 +42,7 @@ public class EstimatorMatrixHistogram extends SparsityEstimator { //internal configurations private static final boolean DEFAULT_USE_EXTENDED = true; + private static final boolean ADVANCED_SKETCH_PROP = false; private final boolean _useExtended; @@ -71,6 +72,7 @@ public class EstimatorMatrixHistogram extends SparsityEstimator } //sketch propagation for intermediates other than final result + h2.setData(root.getRight().isLeaf() ? 
root.getRight().getData() : null); MatrixHistogram outMap = MatrixHistogram .deriveOutputHistogram(h1, h2, ret, root.getOp(), root.getMisc()); root.setSynopsis(outMap); @@ -227,6 +229,7 @@ public class EstimatorMatrixHistogram extends SparsityEstimator private final int rNonEmpty, cNonEmpty; //number of non-empty rows/cols (w/ empty is nnz=0) private final int rNdiv2, cNdiv2; //number of rows/cols with nnz > #cols/2 and #rows/2 private boolean fullDiag; //true if there exists a full diagonal of nonzeros + private MatrixBlock _data = null; //optional leaf data public MatrixHistogram(MatrixBlock in, boolean useExcepts) { // 1) allocate basic synopsis @@ -348,6 +351,10 @@ public class EstimatorMatrixHistogram extends SparsityEstimator IntStream.range(0, getCols()).mapToLong(i-> cNnz[i]).sum(); } + public void setData(MatrixBlock mb) { + _data = mb; + } + public static MatrixHistogram deriveOutputHistogram(MatrixHistogram h1, MatrixHistogram h2, double spOut, OpCode op, long[] misc) { switch(op) { case MM: return deriveMMHistogram(h1, h2, spOut); @@ -396,6 +403,7 @@ public class EstimatorMatrixHistogram extends SparsityEstimator } } + @SuppressWarnings("unused") private static MatrixHistogram deriveMMHistogram(MatrixHistogram h1, MatrixHistogram h2, double spOut) { //exact propagation if lhs or rhs full diag if( h1.fullDiag ) return h2; @@ -416,9 +424,25 @@ public class EstimatorMatrixHistogram extends SparsityEstimator rMaxNnz = Math.max(rMaxNnz, rNnz[i]); } int[] cNnz = new int[h2.getCols()]; - for( int i=0; i
[systemml] branch master updated: [SYSTEMML-2509] Fix transform binning metadata frame allocation
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 0c23e1f [SYSTEMML-2509] Fix transform binning metadata frame allocation 0c23e1f is described below commit 0c23e1fa194d37d67e0490c2894519b0ea6720e4 Author: Matthias Boehm AuthorDate: Wed Feb 13 11:52:19 2019 +0100 [SYSTEMML-2509] Fix transform binning metadata frame allocation This patch fixes special cases where binning is the only transformation or where it requires the most metadata rows (e.g., more than recoding), for which cases so far the output metadata frame was not properly allocated. --- .../org/apache/sysml/runtime/matrix/data/FrameBlock.java | 12 ++-- .../apache/sysml/runtime/transform/encode/EncoderBin.java| 6 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index 87c6aca..ef16feb 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -248,8 +248,16 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable */ public void ensureAllocatedColumns(int numRows) { //early abort if already allocated - if( _coldata != null && _schema.length == _coldata.length ) - return; + if( _coldata != null && _schema.length == _coldata.length ) { + //handle special case that to few rows allocated + if( _numRows < numRows ) { + String[] tmp = new String[getNumColumns()]; + int len = numRows - _numRows; + for(int i=0; i
[systemml] branch master updated: [SYSTEMML-2289] Additional sampling-based sparsity estimator baseline
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new e443eff [SYSTEMML-2289] Additional sampling-based sparsity estimator baseline e443eff is described below commit e443eff949b48f45d1453a6cbf483b87a612c307 Author: Matthias Boehm AuthorDate: Sat Feb 9 15:38:21 2019 +0100 [SYSTEMML-2289] Additional sampling-based sparsity estimator baseline This patch adds an additional baseline sparsity estimator based on sampling and hashing, which implements the approach described in Rasmus Resen Amossen, Andrea Campagna, Rasmus Pagh: Better Size Estimation for Sparse Matrix Products. Algorithmica 69(3): 741-757 (2014) Credit: We're grateful to the authors who shared their code. This implementation improves upon it by fitting the SparsityEstimator API, support for binary matrix products, avoid unnecessary file access, use Well1024a for seeding local RNGs, and generally improve performance. 
--- .../apache/sysml/hops/estim/EstimatorSample.java | 2 +- .../apache/sysml/hops/estim/EstimatorSampleRa.java | 268 + .../functions/estim/OuterProductTest.java | 26 +- .../functions/estim/SelfProductTest.java | 21 ++ .../functions/estim/SquaredProductTest.java| 85 ++- 5 files changed, 398 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java index ec624f0..821aa73 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java @@ -56,7 +56,7 @@ public class EstimatorSample extends SparsityEstimator } public EstimatorSample(double sampleFrac, boolean extended) { - if( sampleFrac < 0 || sampleFrac > 1.0 ) + if( sampleFrac <= 0 || sampleFrac > 1.0 ) throw new DMLRuntimeException("Invalid sample fraction: "+sampleFrac); _frac = sampleFrac; _extended = extended; diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSampleRa.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorSampleRa.java new file mode 100644 index 000..2e39d02 --- /dev/null +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSampleRa.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.hops.estim; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.math3.random.Well1024a; +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.matrix.data.LibMatrixDatagen; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.matrix.data.SparseBlock; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Random; + +/** + * This estimator implements an approach based on row/column sampling + * + * Rasmus Resen Amossen, Andrea Campagna, Rasmus Pagh: + * Better Size Estimation for Sparse Matrix Products. Algorithmica 69(3): 741-757 (2014) + * + * Credit: This code is based on the original implementation provided by the authors, + * modified to fit the SparsityEstimator API, support binary matrix products, avoid + * unnecessary file access, use Well1024a for seeding local RNGs, and generally + * improve performance. + */ +public class EstimatorSampleRa extends SparsityEstimator +{ + private static final int RUNS = -1; + private static final double SAMPLE_FRACTION = 0.1; //10% + private static final double EPSILON = 0.05; // Multiplicative error + private static final double DELTA = 0.1; // Probability of error + private static final int K = -1; + + private fin
[systemml] branch master updated: [SYSTEMML-2509] Fix transform sequences of binning/dummy coding, tests
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new a1bc419 [SYSTEMML-2509] Fix transform sequences of binning/dummy coding, tests a1bc419 is described below commit a1bc419b033f635273e7b91cf9a8dea329e03567 Author: Matthias Boehm AuthorDate: Mon Feb 4 20:56:29 2019 +0100 [SYSTEMML-2509] Fix transform sequences of binning/dummy coding, tests This patch is a follow-up on fixing the binning support, specifically for columns that are both binned and dummy coded. For an example scenario of {recode: [1,2,7], bin: [3,8] dummycode: [3,8]}, we incorrectly constructed the following composite encoder (which assumed that all dummy coded columns need to be recoded): CompositeEncoder(4): -- EncoderRecode: [1, 2, 3, 7, 8] -- EncoderPassThrough: [4, 5, 6, 9] -- EncoderDummycode: [3, 8] -- EncoderBin: [3, 8] Now, we fixed that by only adding dummy coded columns that are not binned to the recode list and bringing the basic encoders into the right sequence (i.e., binning before dummy coding): CompositeEncoder(4): -- EncoderRecode: [1, 2, 7] -- EncoderPassThrough: [4, 5, 6, 9] -- EncoderBin: [3, 8] -- EncoderDummycode: [3, 8] Finally, this patch also includes the necessary tests to ensure such issues don't occur in the future. 
--- .../runtime/transform/encode/EncoderFactory.java | 15 ++-- .../transform/TransformFrameEncodeApplyTest.java | 88 +- .../input/homes3/homes.tfspec_binDummy.json| 6 ++ .../input/homes3/homes.tfspec_binDummy2.json | 6 ++ 4 files changed, 88 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java index 3d2a100..1118ca6 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java @@ -56,19 +56,20 @@ public class EncoderFactory List lencoders = new ArrayList<>(); //prepare basic id lists (recode, dummycode, pass-through) - //note: any dummycode column requires recode as preparation List rcIDs = Arrays.asList(ArrayUtils.toObject( TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_RECODE))); List dcIDs = Arrays.asList(ArrayUtils.toObject( TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); - rcIDs = new ArrayList(CollectionUtils.union(rcIDs, dcIDs)); List binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames); + //note: any dummycode column requires recode as preparation, unless it follows binning + rcIDs = new ArrayList( + CollectionUtils.union(rcIDs, CollectionUtils.subtract(dcIDs, binIDs))); List ptIDs = new ArrayList(CollectionUtils.subtract( - CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs)); + CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs)); List oIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_OMIT))); + TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_OMIT))); List mvIDs = Arrays.asList(ArrayUtils.toObject( - TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_IMPUTE))); + TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, 
TfUtils.TXMETHOD_IMPUTE))); //create individual encoders if( !rcIDs.isEmpty() ) { @@ -79,10 +80,10 @@ public class EncoderFactory if( !ptIDs.isEmpty() ) lencoders.add(new EncoderPassThrough( ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen)); - if( !dcIDs.isEmpty() ) - lencoders.add(new EncoderDummycode(jSpec, colnames, schema.length)); if( !binIDs.isEmpty() ) lencoders.add(new EncoderBin(jSpec, colnames, schema.length)); + if( !dcIDs.i
[systemml] branch master updated: [SYSTEMML-2509] Fix binning support in transformencode over frames
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 3d09c4b [SYSTEMML-2509] Fix binning support in transformencode over frames 3d09c4b is described below commit 3d09c4b1621ef8f7db3841da1e7d36d64298aef1 Author: Matthias Boehm AuthorDate: Sat Jan 26 22:43:41 2019 +0100 [SYSTEMML-2509] Fix binning support in transformencode over frames This patch fixes missing binning support in transformencode over frames. So far, only the apply was working properly but no meta data was built, which corrupted the returned output matrix and meta data. Now, local CP operations work as intended but distributed operations and sequences of binning/dummy-coding require additional work. --- .../sysml/runtime/transform/encode/EncoderBin.java | 114 +++-- .../runtime/transform/encode/EncoderFactory.java | 8 +- .../runtime/transform/encode/EncoderRecode.java| 2 +- .../sysml/runtime/transform/meta/TfMetaUtils.java | 6 +- .../transform/TransformEncodeDecodeTest.java | 1 - .../transform/TransformFrameEncodeApplyTest.java | 16 ++- .../transform/TransformFrameEncodeApply.dml| 1 - 7 files changed, 81 insertions(+), 67 deletions(-) diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java index 016adb4..2f94003 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java @@ -35,7 +35,7 @@ import org.apache.sysml.runtime.transform.meta.TfMetaUtils; import org.apache.sysml.runtime.util.UtilFunctions; public class EncoderBin extends Encoder -{ private static final long serialVersionUID = 1917445005206076078L; public static final String MIN_PREFIX = "min"; @@ -43,70 +43,36 @@ public class EncoderBin extends 
Encoder public static final String NBINS_PREFIX = "nbins"; private int[] _numBins = null; - private double[] _min=null, _max=null; // min and max among non-missing values //frame transform-apply attributes + //TODO binMins is redundant and could be removed private double[][] _binMins = null; private double[][] _binMaxs = null; - - public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen) - throws JSONException, IOException - { - this(parsedSpec, colnames, clen, false); - } - public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly) + public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen) throws JSONException, IOException { - super( null, clen ); + super( null, clen ); if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) ) return; - if( colsOnly ) { - List collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames); - initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0]))); - } - else - { - JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN); - JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS); - JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS); - initColList(attrs); - - _numBins = new int[attrs.size()]; - for(int i=0; i < _numBins.length; i++) - _numBins[i] = UtilFunctions.toInt(nbins.get(i)); - - // initialize internal transformation metadata - _min = new double[_colList.length]; - Arrays.fill(_min, Double.POSITIVE_INFINITY); - _max = new double[_colList.length]; - Arrays.fill(_max, Double.NEGATIVE_INFINITY); - } - } - - public void prepare(String[] words, TfUtils agents) { - if ( !isApplicable() ) - return; + //parse column names or column ids + List collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames); + initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0]))); - for(int i=0; i <_colList.length; i++) { - int colID = _colList[i]; - - String w = null; -
[systemml] branch master updated: [SYSTEMML-2468] Improved matrix histogram estimator for left-deep trees
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new 14a79af [SYSTEMML-2468] Improved matrix histogram estimator for left-deep trees 14a79af is described below commit 14a79af677979f80f10328e67767822f6d43d2ff Author: Matthias Boehm AuthorDate: Mon Jan 14 21:47:27 2019 +0100 [SYSTEMML-2468] Improved matrix histogram estimator for left-deep trees This patch improves the matrix histogram sparsity estimator for combinations of derived and exact sketches as they appear for example in left-deep trees of matrix product chains. Specifically, we now use a generalized code path that exploits extension vectors if they are available and otherwise simply uses zero instead. --- .../apache/sysml/hops/estim/EstimatorMatrixHistogram.java | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java index 57fc97e..a82feed 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java @@ -168,8 +168,8 @@ public class EstimatorMatrixHistogram extends SparsityEstimator nnz += (long)h1.cNnz[j] * h2.rNnz[j]; } //special case, with hybrid exact and approximate output - else if(h1.cNnz1e!=null && h2.rNnz1e != null) { - //note: normally h1.getRows()*h2.getCols() would define mnOut + else if(h1.cNnz1e!=null || h2.rNnz1e != null) { + //NOTE: normally h1.getRows()*h2.getCols() would define mnOut //but by leveraging the knowledge of rows/cols w/ <=1 nnz, we account //that exact and approximate fractions touch different areas long mnOut = _useExtended ? 
@@ -177,12 +177,15 @@ public class EstimatorMatrixHistogram extends SparsityEstimator (long)(h1.getRows()-h1.rN1) * (h2.getCols()-h2.cN1); double spOutRest = 0; for( int j=0; j
[systemml] branch master updated: [SYSTEMML-2486] Fix memoization of sparsity sketches for DAG leafs
This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git The following commit(s) were added to refs/heads/master by this push: new b2fa1af [SYSTEMML-2486] Fix memoization of sparsity sketches for DAG leafs b2fa1af is described below commit b2fa1af0c9919c0d703b1eddc32c3cd493e82bf2 Author: Matthias Boehm AuthorDate: Mon Jan 14 18:06:52 2019 +0100 [SYSTEMML-2486] Fix memoization of sparsity sketches for DAG leafs This patch improves the performance of sparsity estimation for DAGs where leaf nodes are reachable multiple times. So far, we redundantly created the leaf sketches from the base data on each access. Instead, we now properly memoize these sketches similar to inner nodes. --- .../apache/sysml/hops/estim/EstimatorBitsetMM.java | 20 +-- .../sysml/hops/estim/EstimatorDensityMap.java | 21 .../sysml/hops/estim/EstimatorMatrixHistogram.java | 23 -- .../apache/sysml/hops/estim/SparsityEstimator.java | 7 --- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java index 07d4cdc..e26dd49 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java @@ -46,12 +46,9 @@ public class EstimatorBitsetMM extends SparsityEstimator { @Override public MatrixCharacteristics estim(MMNode root) { - estimateInputs(root); - BitsetMatrix m1Map = !root.getLeft().isLeaf() ? (BitsetMatrix) root.getLeft().getSynopsis() : - new BitsetMatrix1(root.getLeft().getData()); - BitsetMatrix m2Map = root.getRight() == null ? null : - !root.getRight().isLeaf() ? 
(BitsetMatrix) root.getRight().getSynopsis() : - new BitsetMatrix1(root.getRight().getData()); + BitsetMatrix m1Map = getCachedSynopsis(root.getLeft()); + BitsetMatrix m2Map = getCachedSynopsis(root.getRight()); + BitsetMatrix outMap = estimInternal(m1Map, m2Map, root.getOp()); root.setSynopsis(outMap); // memorize boolean matrix return root.setMatrixCharacteristics(new MatrixCharacteristics( @@ -86,6 +83,17 @@ public class EstimatorBitsetMM extends SparsityEstimator outMap.getNumColumns(), outMap.getNonZeros()); } + private BitsetMatrix getCachedSynopsis(MMNode node) { + if( node == null ) + return null; + //ensure synopsis is properly cached and reused + if( node.isLeaf() && node.getSynopsis() == null ) + node.setSynopsis(new BitsetMatrix1(node.getData())); + else if( !node.isLeaf() ) + estim(node); //recursively obtain synopsis + return (BitsetMatrix) node.getSynopsis(); + } + private BitsetMatrix estimInternal(BitsetMatrix m1Map, BitsetMatrix m2Map, OpCode op) { switch(op) { case MM: return m1Map.matMult(m2Map); diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java index 260df5d..8a78a9e 100644 --- a/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java @@ -55,14 +55,8 @@ public class EstimatorDensityMap extends SparsityEstimator @Override public MatrixCharacteristics estim(MMNode root) { - estimateInputs(root); - DensityMap m1Map = !root.getLeft().isLeaf() ? - (DensityMap)root.getLeft().getSynopsis() : - new DensityMap(root.getLeft().getData(), _b); - DensityMap m2Map = root.getRight()==null ? null: - !root.getRight().isLeaf() ? 
- (DensityMap)root.getRight().getSynopsis() : - new DensityMap(root.getRight().getData(), _b); + DensityMap m1Map = getCachedSynopsis(root.getLeft()); + DensityMap m2Map = getCachedSynopsis(root.getRight()); //estimate output density map and sparsity DensityMap outMap = estimIntern(m1Map, m2Map, root.getOp()); @@ -94,6 +88,17 @@ public class EstimatorDensityMap extends SparsityEstimator return estim(m, null, op); } + private DensityMap getCachedSynopsis(MMNode node) { + if( node == null ) + return null; + //ensure
systemml git commit: [SYSTEMML-2508] Improved spark cumagg compilation (single row block)
Repository: systemml Updated Branches: refs/heads/master 341a1dc78 -> 8895ebc45 [SYSTEMML-2508] Improved spark cumagg compilation (single row block) This patch improves the compilation of spark cumulative aggregates where the input matrix has a single row block by avoiding the unnecessary offset computation. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/8895ebc4 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/8895ebc4 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/8895ebc4 Branch: refs/heads/master Commit: 8895ebc454ce85e823d6332e40d7effd874e59df Parents: 341a1dc Author: Matthias Boehm Authored: Sun Dec 16 16:04:01 2018 +0100 Committer: Matthias Boehm Committed: Sun Dec 16 17:07:01 2018 +0100 -- .../java/org/apache/sysml/hops/UnaryOp.java | 39 +--- .../misc/RewriteCumulativeAggregatesTest.java | 25 +++-- .../misc/RewriteCumulativeAggregates.R | 6 ++- 3 files changed, 52 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/8895ebc4/src/main/java/org/apache/sysml/hops/UnaryOp.java -- diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java index 2952e85..77655de 100644 --- a/src/main/java/org/apache/sysml/hops/UnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java @@ -22,6 +22,7 @@ package org.apache.sysml.hops; import java.util.ArrayList; import org.apache.sysml.conf.ConfigurationManager; +import org.apache.sysml.hops.rewrite.HopRewriteUtils; import org.apache.sysml.lops.Aggregate; import org.apache.sysml.lops.Checkpoint; import org.apache.sysml.lops.Aggregate.OperationTypes; @@ -455,8 +456,15 @@ public class UnaryOp extends MultiThreadedHop long bclen = input.getColsInBlock(); boolean force = !dimsKnown() || _etypeForced == ExecType.SPARK; OperationTypes aggtype = getCumulativeAggType(); - Lop X = input.constructLops(); + + //special case single row block (no 
offsets needed) + if( rlen > 0 && clen > 0 && rlen <= brlen ) { + Lop offset = HopRewriteUtils.createDataGenOpByVal(new LiteralOp(1), + new LiteralOp(clen), getCumulativeInitValue()).constructLops(); + return constructCumOffBinary(X, offset, aggtype, rlen, clen, brlen, bclen); + } + Lop TEMP = X; ArrayList DATA = new ArrayList<>(); int level = 0; @@ -497,22 +505,27 @@ public class UnaryOp extends MultiThreadedHop //split, group and mr cumsum while( level-- > 0 ) { - //(for spark, the CumulativeOffsetBinary subsumes both the split aggregate and - //the subsequent offset binary apply of split aggregates against the original data) - double initValue = getCumulativeInitValue(); - boolean broadcast = ALLOW_CUMAGG_BROADCAST - && OptimizerUtils.checkSparkBroadcastMemoryBudget(OptimizerUtils.estimateSize( - TEMP.getOutputParameters().getNumRows(), TEMP.getOutputParameters().getNumCols())); - - CumulativeOffsetBinary binary = new CumulativeOffsetBinary(DATA.get(level), TEMP, - DataType.MATRIX, ValueType.DOUBLE, initValue, broadcast, aggtype, ExecType.SPARK); - binary.getOutputParameters().setDimensions(rlen, clen, brlen, bclen, -1); - setLineNumbers(binary); - TEMP = binary; + TEMP = constructCumOffBinary(DATA.get(level), + TEMP, aggtype, rlen, clen, brlen, bclen); } return TEMP; } + + private Lop constructCumOffBinary(Lop data, Lop offset, OperationTypes aggtype, long rlen, long clen, long brlen, long bclen) { + //(for spark, the CumulativeOffsetBinary subsumes both the split aggregate and + //the subsequent offset binary apply of split aggregates against the original data) + double initValue = getCumulativeInitValue(); + boolean broadcast = ALLOW_CUMAGG_BROADCAST + && OptimizerUtils.checkSparkBroadcastMemoryBudget(OptimizerUtils.estimateSize( + offset.getOutputParameters().getNumRows(), offset.getOutputParameters().getNumCols())); + +
systemml git commit: [MINOR] Fine tuning spark checkpoint data size thresholds
Repository: systemml Updated Branches: refs/heads/master 9a1f64b42 -> 3b87c2ba9 [MINOR] Fine tuning spark checkpoint data size thresholds Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/3b87c2ba Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/3b87c2ba Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/3b87c2ba Branch: refs/heads/master Commit: 3b87c2ba9d77ffa3d901eae38de9c1157994d74e Parents: 9a1f64b Author: Matthias Boehm Authored: Wed Dec 12 13:53:23 2018 +0100 Committer: Matthias Boehm Committed: Wed Dec 12 13:53:23 2018 +0100 -- src/main/java/org/apache/sysml/hops/OptimizerUtils.java | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/3b87c2ba/src/main/java/org/apache/sysml/hops/OptimizerUtils.java -- diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java index a43abb3..e6a25d2 100644 --- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java +++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java @@ -914,7 +914,11 @@ public class OptimizerUtils * @return true if the given matrix characteristics exceed threshold */ public static boolean exceedsCachingThreshold(long dim2, double outMem) { - return !(dim2 > 1 && outMem < getLocalMemBudget() + //NOTE: We heuristically cache matrices that are close to or larger + //than the local memory budget. The different relative fractions + //according to number of columns is reflecting common operations + //(e.g., two inputs/one output for binary vector operations) + return !(dim2 > 1 && outMem < getLocalMemBudget()/2 || dim2 == 1 && outMem < getLocalMemBudget()/3); }
[2/2] systemml git commit: [SYSTEMML-2507] New rewrites for cumulative aggregate patterns
[SYSTEMML-2507] New rewrites for cumulative aggregate patterns This patch adds the following simplification rewrites as well as related tests: (a) X * cumsum(diag(matrix(1,nrow(X),1))) -> lower.tri, if X squared (b) colSums(cumsum(X)) -> colSums(X*seq(nrow(X),1)) (c) rev(cumsum(rev(X))) -> X + colSums(X) - cumsum(X) Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/9a1f64b4 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/9a1f64b4 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/9a1f64b4 Branch: refs/heads/master Commit: 9a1f64b42c177a82a98716ad9ef34d4d266178d2 Parents: b96807b Author: Matthias Boehm Authored: Tue Dec 11 20:10:23 2018 +0100 Committer: Matthias Boehm Committed: Tue Dec 11 20:10:46 2018 +0100 -- .../RewriteAlgebraicSimplificationDynamic.java | 33 - .../RewriteAlgebraicSimplificationStatic.java | 45 +++ .../hops/rewrite/RewriteGPUSpecificOps.java | 26 ++-- .../misc/RewriteCumulativeAggregatesTest.java | 126 +++ .../misc/RewriteCumulativeAggregates.R | 43 +++ .../misc/RewriteCumulativeAggregates.dml| 49 6 files changed, 306 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/9a1f64b4/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java -- diff --git a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java index 36864aa..9556181 100644 --- a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java +++ b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java @@ -175,6 +175,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule hi = simplifyMatrixMultDiag(hop, hi, i); //e.g., diag(X)%*%Y -> X*Y, if ncol(Y)==1 / -> Y*X if ncol(Y)>1 hi = simplifyDiagMatrixMult(hop, hi, i); //e.g., 
diag(X%*%Y)->rowSums(X*t(Y)); if col vector hi = simplifySumDiagToTrace(hi); //e.g., sum(diag(X)) -> trace(X); if col vector + hi = simplifyLowerTriExtraction(hop, hi, i); //e.g., X * cumsum(diag(matrix(1,nrow(X),1))) -> lower.tri hi = pushdownBinaryOperationOnDiag(hop, hi, i); //e.g., diag(X)*7 -> diag(X*7); if col vector hi = pushdownSumOnAdditiveBinary(hop, hi, i); //e.g., sum(A+B) -> sum(A)+sum(B); if dims(A)==dims(B) if(OptimizerUtils.ALLOW_OPERATOR_FUSION) { @@ -1046,7 +1047,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule if( hi instanceof AggUnaryOp ) { AggUnaryOp au = (AggUnaryOp) hi; - if( au.getOp()==AggOp.SUM && au.getDirection()==Direction.RowCol ) //sum + if( au.getOp()==AggOp.SUM && au.getDirection()==Direction.RowCol ) //sum { Hop hi2 = au.getInput().get(0); if( hi2 instanceof ReorgOp && ((ReorgOp)hi2).getOp()==ReOrgOp.DIAG && hi2.getDim2()==1 ) //diagM2V @@ -1054,7 +1055,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule Hop hi3 = hi2.getInput().get(0); //remove diag operator - HopRewriteUtils.replaceChildReference(au, hi2, hi3, 0); + HopRewriteUtils.replaceChildReference(au, hi2, hi3, 0); HopRewriteUtils.cleanupUnreferenced(hi2); //change sum to trace @@ -1063,12 +1064,38 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule LOG.debug("Applied simplifySumDiagToTrace"); } } - } return hi; } + private static Hop simplifyLowerTriExtraction(Hop parent, Hop hi, int pos) { + //pattern: X * cumsum(diag(matrix(1,nrow(X),1))) -> lower.tri (only right) + if( HopRewriteUtils.isBinary(hi, OpOp2.MULT) + && hi.getDim1() == hi.getDim2() && hi.getDim1() > 1 ) { + Hop left =
[1/2] systemml git commit: [SYSTEMML-2506] Improved cumagg compilation (intermediate memory)
Repository: systemml Updated Branches: refs/heads/master 7019f3bc8 -> 9a1f64b42 [SYSTEMML-2506] Improved cumagg compilation (intermediate memory) This patch improves the compilation of cumulative aggregate operations, to correctly account for potential dense-sparse conversions when computing memory estimates. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b96807b9 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b96807b9 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b96807b9 Branch: refs/heads/master Commit: b96807b907203ce8ef1bbd017d06f3c6c9ef8fec Parents: 7019f3b Author: Matthias Boehm Authored: Tue Dec 11 16:58:27 2018 +0100 Committer: Matthias Boehm Committed: Tue Dec 11 16:58:27 2018 +0100 -- src/main/java/org/apache/sysml/hops/UnaryOp.java | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/b96807b9/src/main/java/org/apache/sysml/hops/UnaryOp.java -- diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java index 4071d6f..2952e85 100644 --- a/src/main/java/org/apache/sysml/hops/UnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java @@ -42,6 +42,8 @@ import org.apache.sysml.lops.UnaryCP; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.util.UtilFunctions; /* Unary (cell operations): e.g, b_ij = round(a_ij) @@ -562,15 +564,20 @@ public class UnaryOp extends MultiThreadedHop } @Override - protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz ) + protected double computeIntermediateMemEstimate(long dim1, long dim2, long nnz) { double ret = 0; - if ( _op == OpOp1.IQM || _op == OpOp1.MEDIAN) { + if( _op == 
OpOp1.IQM || _op == OpOp1.MEDIAN ) { // buffer (=2*input_size) and output (=input_size) for SORT operation // getMemEstimate works for both cases of known dims and worst-case stats ret = getInput().get(0).getMemEstimate() * 3; } + else if( isCumulativeUnaryOperation() ) { + //account for potential final dense-sparse transformation (worst-case sparse representation) + ret += MatrixBlock.estimateSizeSparseInMemory(dim1, dim2, + MatrixBlock.SPARSITY_TURN_POINT - UtilFunctions.DOUBLE_EPS); + } if (isGPUEnabled()) { // Intermediate memory required to convert sparse to dense
systemml git commit: [SYSTEMML-2503/04] Fix correctness in-place and broadcast cumagg ops
Repository: systemml Updated Branches: refs/heads/master bda61b600 -> 1a58946a0 [SYSTEMML-2503/04] Fix correctness in-place and broadcast cumagg ops This patch fixes correctness issues of in-place cumulative aggregate operations as well as the handling of lineage tracing on spark cumagg offset. In addition, the patch also includes a minor performance improvement that avoids unnecessary copying of offset vectors on cumagg offset operations. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/1a58946a Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/1a58946a Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/1a58946a Branch: refs/heads/master Commit: 1a58946a0a335ccae61d0cf3873a937467ae5544 Parents: bda61b6 Author: Matthias Boehm Authored: Sat Dec 8 13:40:33 2018 +0100 Committer: Matthias Boehm Committed: Sat Dec 8 13:40:33 2018 +0100 -- .../instructions/spark/CumulativeOffsetSPInstruction.java | 9 ++--- .../apache/sysml/runtime/matrix/data/LibMatrixAgg.java| 10 ++ .../org/apache/sysml/runtime/matrix/data/MatrixBlock.java | 4 ++-- .../java/org/apache/sysml/runtime/util/DataConverter.java | 9 - 4 files changed, 22 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/1a58946a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java -- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java index 1b26060..3dba53e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java @@ -32,6 +32,7 @@ import scala.Tuple2; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import 
org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.functionobjects.Builtin; +import org.apache.sysml.runtime.functionobjects.Builtin.BuiltinCode; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast; @@ -94,8 +95,9 @@ public class CumulativeOffsetSPInstruction extends BinarySPInstruction { //get and join inputs JavaPairRDD inData = sec.getBinaryBlockRDDHandleForVariable(input1.getName()); JavaPairRDD> joined = null; + boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData); - if( _broadcast && !SparkUtils.isHashPartitioned(inData) ) { + if( broadcast ) { //broadcast offsets and broadcast join with data PartitionedBroadcast inAgg = sec.getBroadcastForVariable(input2.getName()); joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg,_initValue, rlen, brlen)); @@ -119,7 +121,7 @@ public class CumulativeOffsetSPInstruction extends BinarySPInstruction { updateUnaryOutputMatrixCharacteristics(sec); sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); - sec.addLineage(output.getName(), input2.getName(), _broadcast); + sec.addLineage(output.getName(), input2.getName(), broadcast); } private static class RDDCumSplitFunction implements PairFlatMapFunction, MatrixIndexes, MatrixBlock> @@ -229,7 +231,8 @@ public class CumulativeOffsetSPInstruction extends BinarySPInstruction { //blockwise cumagg computation, incl offset aggregation return LibMatrixAgg.cumaggregateUnaryMatrix(dblkIn, blkOut, _uop, - DataConverter.convertToDoubleVector(oblkIn)); + DataConverter.convertToDoubleVector(oblkIn, false, + ((Builtin)_uop.fn).bFunc == BuiltinCode.CUMSUM)); } } } http://git-wip-us.apache.org/repos/asf/systemml/blob/1a58946a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixAgg.java -- diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixAgg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixAgg.java index 5e785d9..ed7d8f1 100644 ---
[2/2] systemml git commit: [SYSTEMML-2504] In-place CP cumulative aggregates, incl compiler
[SYSTEMML-2504] In-place CP cumulative aggregates, incl compiler This patch adds an option for in-place CP cumulative aggregates because result allocation is the major bottleneck. As an initial compiler integration, we now compile inplace CP operations for the aggregation of partial aggregates in Spark cumsum because it guarantees validity. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/25a10f41 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/25a10f41 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/25a10f41 Branch: refs/heads/master Commit: 25a10f412614235d8974f371a2bb07bc08c88cee Parents: 21b1a53 Author: Matthias Boehm Authored: Wed Dec 5 20:38:37 2018 +0100 Committer: Matthias Boehm Committed: Wed Dec 5 20:38:37 2018 +0100 -- .../java/org/apache/sysml/hops/UnaryOp.java | 10 +- src/main/java/org/apache/sysml/lops/Unary.java | 7 +-- .../instructions/cp/UnaryCPInstruction.java | 5 +++-- .../sysml/runtime/matrix/data/LibMatrixAgg.java | 20 +++- .../runtime/matrix/operators/UnaryOperator.java | 10 -- 5 files changed, 36 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/25a10f41/src/main/java/org/apache/sysml/hops/UnaryOp.java -- diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java index d1110c3..4071d6f 100644 --- a/src/main/java/org/apache/sysml/hops/UnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java @@ -170,7 +170,7 @@ public class UnaryOp extends MultiThreadedHop int k = isCumulativeUnaryOperation() || isExpensiveUnaryOperation() ? 
OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1; Unary unary1 = new Unary(input.constructLops(), - HopsOpOp1LopsU.get(_op), getDataType(), getValueType(), et, k); + HopsOpOp1LopsU.get(_op), getDataType(), getValueType(), et, k, false); setOutputDimensions(unary1); setLineNumbers(unary1); setLops(unary1); @@ -404,15 +404,15 @@ public class UnaryOp extends MultiThreadedHop agg.getOutputParameters().setDimensions(rlenAgg, clen, brlen, bclen, -1); agg.setupCorrectionLocation(CorrectionLocationType.NONE); // aggregation uses kahanSum but the inputs do not have correction values setLineNumbers(agg); - TEMP = agg; + TEMP = agg; level++; force = false; //in case of unknowns, generate one level } //in-memory cum sum (of partial aggregates) if( TEMP.getOutputParameters().getNumRows()!=1 ) { - int k = OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ); - Unary unary1 = new Unary( TEMP, HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k); + int k = OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ); + Unary unary1 = new Unary( TEMP, HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k, true); unary1.getOutputParameters().setDimensions(TEMP.getOutputParameters().getNumRows(), clen, brlen, bclen, -1); setLineNumbers(unary1); TEMP = unary1; @@ -487,7 +487,7 @@ public class UnaryOp extends MultiThreadedHop //in-memory cum sum (of partial aggregates) if( TEMP.getOutputParameters().getNumRows()!=1 ){ int k = OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ); - Unary unary1 = new Unary( TEMP, HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k); + Unary unary1 = new Unary( TEMP, HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k, true); unary1.getOutputParameters().setDimensions(TEMP.getOutputParameters().getNumRows(), clen, brlen, bclen, -1); setLineNumbers(unary1); TEMP = unary1; 
http://git-wip-us.apache.org/repos/asf/systemml/blob/25a10f41/src/main/java/org/apache/sysml/lops/Unary.java
[1/2] systemml git commit: [SYSTEMML-2503] Exploit existing hash partitioning in spark cumoff ops
Repository: systemml Updated Branches: refs/heads/master 7a3447a50 -> 25a10f412 [SYSTEMML-2503] Exploit existing hash partitioning in spark cumoff ops Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/21b1a531 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/21b1a531 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/21b1a531 Branch: refs/heads/master Commit: 21b1a53141c74b4aa3af6e0263af3f6b0d7c1336 Parents: 7a3447a Author: Matthias Boehm Authored: Wed Dec 5 19:39:53 2018 +0100 Committer: Matthias Boehm Committed: Wed Dec 5 19:39:53 2018 +0100 -- .../runtime/instructions/spark/CumulativeOffsetSPInstruction.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/21b1a531/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java -- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java index 53e6e91..8befc5a 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java @@ -35,6 +35,7 @@ import org.apache.sysml.runtime.functionobjects.Builtin; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast; +import org.apache.sysml.runtime.instructions.spark.utils.SparkUtils; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.LibMatrixAgg; import org.apache.sysml.runtime.matrix.data.MatrixBlock; @@ -95,7 +96,7 @@ public class CumulativeOffsetSPInstruction extends BinarySPInstruction { JavaPairRDD inData = 
sec.getBinaryBlockRDDHandleForVariable(input1.getName()); JavaPairRDD> joined = null; - if( _broadcast ) { + if( _broadcast && !SparkUtils.isHashPartitioned(inData) ) { //broadcast offsets and broadcast join with data PartitionedBroadcast inAgg = sec.getBroadcastForVariable(input2.getName()); joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg,_initValue, rlen, brlen));
[1/3] systemml git commit: [SYSTEMML-2500] Async matrix allocation on Spark RDD collect
Repository: systemml Updated Branches: refs/heads/master 95cbbd656 -> 7a3447a50 [SYSTEMML-2500] Async matrix allocation on Spark RDD collect This patch introduces a general performance improvement of RDD collect operations into the driver memory, by interleaving the matrix allocation with the collect (and pending RDD evaluation). This is generally useful because it reduces the serial fraction of parallel programs. For example, for 100 distributed sum(cumsum(X)) operations, it reduced the total runtime from 1,102s to 1,006s. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/77a7ef15 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/77a7ef15 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/77a7ef15 Branch: refs/heads/master Commit: 77a7ef155d5f3546d053c7f3d11b1ff3b8021834 Parents: 95cbbd6 Author: Matthias Boehm Authored: Sat Dec 1 17:08:45 2018 +0100 Committer: Matthias Boehm Committed: Sat Dec 1 17:08:45 2018 +0100 -- .../controlprogram/caching/LazyWriteBuffer.java | 4 .../controlprogram/context/SparkExecutionContext.java | 14 ++ .../org/apache/sysml/runtime/io/IOUtilFunctions.java | 10 ++ .../apache/sysml/runtime/matrix/data/MatrixBlock.java | 10 ++ 4 files changed, 34 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/77a7ef15/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java -- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java index 391f21a..d1dc801 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java @@ -272,6 +272,10 @@ public class LazyWriteBuffer } } + public static ExecutorService getUtilThreadPool() { + return _fClean != null ? 
_fClean._pool : null; + } + /** * Extended LinkedHashMap with convenience methods for adding and removing * last/first entries. http://git-wip-us.apache.org/repos/asf/systemml/blob/77a7ef15/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java -- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java index 8981c87..b04aad0 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.concurrent.Future; import java.util.stream.Collectors; import java.util.stream.LongStream; @@ -46,7 +47,6 @@ import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; import org.apache.sysml.api.mlcontext.MLContext; import org.apache.sysml.api.mlcontext.MLContextUtil; -import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.lops.Checkpoint; import org.apache.sysml.parser.Expression.ValueType; @@ -72,6 +72,7 @@ import org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFu import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction; import org.apache.sysml.runtime.instructions.spark.utils.RDDAggregateUtils; import org.apache.sysml.runtime.instructions.spark.utils.SparkUtils; +import org.apache.sysml.runtime.io.IOUtilFunctions; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.InputInfo; @@ -824,7 +825,7 @@ public class SparkExecutionContext extends 
ExecutionContext long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0; MatrixBlock out = null; - + if( rlen <= brlen && clen <= bclen ) //SINGLE BLOCK { //special case without copy and nnz maintenance @@ -846,9 +847,14 @@ public class SparkExecutionContext extends ExecutionContext //create output matrix block (w/ lazy allocation) out = new MatrixBlock(rlen, clen, sparse, lnnz); - +
[3/3] systemml git commit: [SYSTEMML-2502] Performance spark cumagg offset aggregation (zero-copy)
[SYSTEMML-2502] Performance spark cumagg offset aggregation (zero-copy) This patch avoids unnecessary copy operations of input data blocks, which were used to avoid data corruption on offset aggregation into the first row. Instead we now directly pass the offset vector into the dedicated cumulative aggregate operations. On our running example of 100 distributed sum(cumsum(X)) operations, this patch reduced the total runtime from 887s to 732s. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/7a3447a5 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/7a3447a5 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/7a3447a5 Branch: refs/heads/master Commit: 7a3447a50b6d2abdbaf6dce9d021a3ce7c2717d7 Parents: fee20fb Author: Matthias Boehm Authored: Sat Dec 1 21:06:04 2018 +0100 Committer: Matthias Boehm Committed: Sat Dec 1 21:06:04 2018 +0100 -- .../spark/CumulativeOffsetSPInstruction.java| 62 +++- .../sysml/runtime/matrix/data/LibMatrixAgg.java | 10 +++- 2 files changed, 27 insertions(+), 45 deletions(-) -- http://git-wip-us.apache.org/repos/asf/systemml/blob/7a3447a5/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java -- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java index 952a6d0..53e6e91 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java @@ -32,50 +32,40 @@ import scala.Tuple2; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.functionobjects.Builtin; -import 
org.apache.sysml.runtime.functionobjects.Multiply; -import org.apache.sysml.runtime.functionobjects.Plus; -import org.apache.sysml.runtime.functionobjects.PlusMultiply; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.matrix.data.LibMatrixAgg; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; -import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.matrix.operators.Operator; import org.apache.sysml.runtime.matrix.operators.UnaryOperator; +import org.apache.sysml.runtime.util.DataConverter; import org.apache.sysml.runtime.util.UtilFunctions; import org.apache.sysml.utils.IntUtils; public class CumulativeOffsetSPInstruction extends BinarySPInstruction { - private BinaryOperator _bop = null; private UnaryOperator _uop = null; + private boolean _cumsumprod = false; private final double _initValue ; private final boolean _broadcast; private CumulativeOffsetSPInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, double init, boolean broadcast, String opcode, String istr) { super(SPType.CumsumOffset, op, in1, in2, out, opcode, istr); - if ("bcumoffk+".equals(opcode)) { - _bop = new BinaryOperator(Plus.getPlusFnObject()); + if ("bcumoffk+".equals(opcode)) _uop = new UnaryOperator(Builtin.getBuiltinFnObject("ucumk+")); - } - else if ("bcumoff*".equals(opcode)) { - _bop = new BinaryOperator(Multiply.getMultiplyFnObject()); + else if ("bcumoff*".equals(opcode)) _uop = new UnaryOperator(Builtin.getBuiltinFnObject("ucum*")); - } else if ("bcumoff+*".equals(opcode)) { - _bop = new BinaryOperator(PlusMultiply.getFnObject()); _uop = new UnaryOperator(Builtin.getBuiltinFnObject("ucumk+*")); + _cumsumprod = true; } - 
else if ("bcumoffmin".equals(opcode)) { - _bop = new BinaryOperator(Builtin.getBuiltinFnObject("min")); + else if ("bcumoffmin".equals(opcode)) _uop = new UnaryOperator(Builtin.getBuiltinFnObject("ucummin")); - } - else if ("bcumoffmax".equals(opcode)) { - _bop = new BinaryOperator(Builtin.getBuiltinFnObject("max")); +