[systemml] branch master updated: [MINOR] Additional lineage parfor remote tests, and cleanups

2020-06-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new c6d7a52  [MINOR] Additional lineage parfor remote tests, and cleanups
c6d7a52 is described below

commit c6d7a52e2e4259fa62ba8e0b15cdfe1397baac0f
Author: Matthias Boehm 
AuthorDate: Tue Jun 23 22:46:05 2020 +0200

[MINOR] Additional lineage parfor remote tests, and cleanups

This patch adds msvm w/ remote_spark parfor workers to the test suite
and fixes missing support for tak+ operators in the recompute-by-lineage
utility.
---
 scripts/builtin/l2svm.dml  |  2 +-
 .../sysds/hops/ipa/FunctionCallSizeInfo.java   |  9 ++--
 .../sysds/runtime/lineage/LineageItemUtils.java| 25 ++---
 .../functions/lineage/LineageTraceParforTest.java  |  7 +++
 .../functions/lineage/LineageTraceParforMSVM.dml   | 61 ++
 5 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/scripts/builtin/l2svm.dml b/scripts/builtin/l2svm.dml
index 3e251ae..f411fb9 100644
--- a/scripts/builtin/l2svm.dml
+++ b/scripts/builtin/l2svm.dml
@@ -72,7 +72,7 @@ m_l2svm = function(Matrix[Double] X, Matrix[Double] Y, 
Boolean intercept = FALSE
 
   # TODO make this a stop condition for l2svm instead of just printing.
   if(num_min + num_max != nrow(Y))
-print("L2SVM: WARNING invalid number of labels in Y")
+print("L2SVM: WARNING invalid number of labels in Y: "+num_min+" "+num_max)
 
   # Scale inputs to -1 for negative, and 1 for positive classification
   if(check_min != -1 | check_max != +1)
diff --git a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java 
b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java
index b349a5f..551ce98 100644
--- a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java
+++ b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java
@@ -233,14 +233,11 @@ public class FunctionCallSizeInfo
   &&  
h1.getDim1()==h2.getDim1() 
   &&  
h1.getDim2()==h2.getDim2()
   &&  
h1.getNnz()==h2.getNnz() );
-   //check literal values (equi 
value)
-   if( h1 instanceof LiteralOp ) {
-   consistent &= (h2 
instanceof LiteralOp 
+   //check literal values (both 
needs to be literals and same value)
+   if( h1 instanceof LiteralOp || 
h2 instanceof LiteralOp ) {
+   consistent &= (h1 
instanceof LiteralOp && h2 instanceof LiteralOp
&& 
HopRewriteUtils.isEqualValue((LiteralOp)h1, (LiteralOp)h2));
}
-   else if(h2 instanceof 
LiteralOp) {
-   consistent = false; 
//h2 literal, but h1 not
-   }
}
}
if( consistent )
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java
index 467bbc9..e659025 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageItemUtils.java
@@ -278,6 +278,24 @@ public class LineageItemUtils {

operands.put(item.getId(), aggunary);
break;
}
+   case AggregateBinary: {
+   Hop input1 = 
operands.get(item.getInputs()[0].getId());
+   Hop input2 = 
operands.get(item.getInputs()[1].getId());
+   Hop aggbinary = 
HopRewriteUtils.createMatrixMultiply(input1, input2);
+   
operands.put(item.getId(), aggbinary);
+   break;
+   }
+   case AggregateTernary: {
+ 

[systemml] branch master updated: [SYSTEMDS-421] Fix IPA scalar propagation (inconsistent literals/vars)

2020-06-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new ff32c05  [SYSTEMDS-421] Fix IPA scalar propagation (inconsistent 
literals/vars)
ff32c05 is described below

commit ff32c05373af72da825713c74de3ddc4c46a2159
Author: Matthias Boehm 
AuthorDate: Tue Jun 23 21:41:32 2020 +0200

[SYSTEMDS-421] Fix IPA scalar propagation (inconsistent literals/vars)

This patch fixes the logic of IPA scalar propagation into functions with
multiple function calls. Similar to sizes, we check if literal function
arguments have consistent values and propagate valid ones. However, this
check had a logic problem of only checking if the first call was a
literal. This missed cases where the first call had a scalar variable
but the second call a valid scalar literal that could have been
propagated individually.
---
 dev/Tasks.txt  |   3 +
 .../sysds/hops/ipa/FunctionCallSizeInfo.java   |   5 +-
 .../recompile/IPAConstantPropagationFunTest.java   |  71 ++
 .../functions/recompile/IPAFunctionArgs.dml| 109 +
 4 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/dev/Tasks.txt b/dev/Tasks.txt
index 689949c..c84a523 100644
--- a/dev/Tasks.txt
+++ b/dev/Tasks.txt
@@ -341,6 +341,9 @@ SYSTEMDS-410 Lineage Tracing, Reuse and Integration II
  * 413 Cache and reuse MultiReturnBuiltin instructionsOK
  * 414 New rewrite for PCA --> lmDS pipeline  OK
 
+SYSTEMDS-420 Compiler Improvements
+ * 421 Fix invalid IPA scalar propagation into functions  OK
+
 SYSTEMDS-500 Documentation Webpage Reintroduction
  * 501 Make Documentation webpage framework   OK
 
diff --git a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java 
b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java
index 7199d17..b349a5f 100644
--- a/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java
+++ b/src/main/java/org/apache/sysds/hops/ipa/FunctionCallSizeInfo.java
@@ -234,10 +234,13 @@ public class FunctionCallSizeInfo
   &&  
h1.getDim2()==h2.getDim2()
   &&  
h1.getNnz()==h2.getNnz() );
//check literal values (equi 
value)
-   if( h1 instanceof LiteralOp ){
+   if( h1 instanceof LiteralOp ) {
consistent &= (h2 
instanceof LiteralOp 
&& 
HopRewriteUtils.isEqualValue((LiteralOp)h1, (LiteralOp)h2));
}
+   else if(h2 instanceof 
LiteralOp) {
+   consistent = false; 
//h2 literal, but h1 not
+   }
}
}
if( consistent )
diff --git 
a/src/test/java/org/apache/sysds/test/functions/recompile/IPAConstantPropagationFunTest.java
 
b/src/test/java/org/apache/sysds/test/functions/recompile/IPAConstantPropagationFunTest.java
new file mode 100644
index 000..efd0397
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/recompile/IPAConstantPropagationFunTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.recompile;
+
+import java.util.HashMap;
+
+import org.junit.Test;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+impo

[systemml] branch master updated: [SYSTEMDS-412] Fix lineage-based reuse for update-inplace indexing

2020-06-11 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 3d876d3  [SYSTEMDS-412] Fix lineage-based reuse for update-inplace 
indexing
3d876d3 is described below

commit 3d876d33ad019fe026799df540fc20a86ec4
Author: Matthias Boehm 
AuthorDate: Thu Jun 11 15:57:20 2020 +0200

[SYSTEMDS-412] Fix lineage-based reuse for update-inplace indexing

This patch disables lineage-based reuse for update-inplace left indexing
operations as reuse would create incorrect results due to later in-place
updates that change the cached data object.

Furthermore, this patch also aims to make the codegen tests more robust
wrt the surefire github action integration (less explain output).
---
 .../runtime/controlprogram/context/SparkExecutionContext.java | 8 ++--
 .../java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java | 5 -
 .../org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java | 2 +-
 .../apache/sysds/test/functions/codegen/DAGCellwiseTmplTest.java  | 2 +-
 .../org/apache/sysds/test/functions/codegen/MiscPatternTest.java  | 2 +-
 .../org/apache/sysds/test/functions/codegen/MultiAggTmplTest.java | 2 +-
 .../apache/sysds/test/functions/codegen/OuterProdTmplTest.java| 2 +-
 .../org/apache/sysds/test/functions/codegen/RowAggTmplTest.java   | 2 +-
 .../sysds/test/functions/codegen/RowConv2DOperationsTest.java | 2 +-
 .../sysds/test/functions/codegen/RowVectorComparisonTest.java | 2 +-
 .../apache/sysds/test/functions/codegen/SparseSideInputTest.java  | 2 +-
 .../apache/sysds/test/functions/codegen/SumProductChainTest.java  | 2 +-
 12 files changed, 16 insertions(+), 17 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java
index a1e2b92..11a4e93 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/context/SparkExecutionContext.java
@@ -171,18 +171,14 @@ public class SparkExecutionContext extends 
ExecutionContext
_spctx = null;
}
 
-   public void close()
-   {
+   public void close() {
synchronized( SparkExecutionContext.class ) {
-   if( _spctx != null )
-   {
+   if( _spctx != null ) {
//stop the spark context if existing
_spctx.stop();
-
//make sure stopped context is never used again
_spctx = null;
}
-
}
}
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 48a512a..22964ba 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -25,6 +25,7 @@ import 
org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.Instruction;
 import org.apache.sysds.runtime.instructions.cp.ComputationCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.ListIndexingCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.MatrixIndexingCPInstruction;
 
 import java.util.Comparator;
 
@@ -151,7 +152,9 @@ public class LineageCacheConfig
&& !(inst instanceof ListIndexingCPInstruction);
boolean rightop = (ArrayUtils.contains(REUSE_OPCODES, 
inst.getOpcode())
|| (inst.getOpcode().equals("append") && 
isVectorAppend(inst, ec)));
-   return insttype && rightop;
+   boolean updateInplace = (inst instanceof 
MatrixIndexingCPInstruction)
+   && 
ec.getMatrixObject(((ComputationCPInstruction)inst).input1).getUpdateType().isInPlace();
+   return insttype && rightop && !updateInplace;
}

private static boolean isVectorAppend(Instruction inst, 
ExecutionContext ec) {
diff --git 
a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java 
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
index bd369c0..b1d184c 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
@@ -477,7 +477,7 @@ public class CellwiseTmplTest extends AutomatedTestBase

 

[systemml] branch master updated: [SYSTEMDS-412] Fix robustness lineage DAGs, parfor integration

2020-06-11 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new e8c0a28  [SYSTEMDS-412] Fix robustness lineage DAGs, parfor integration
e8c0a28 is described below

commit e8c0a28c95b9a22f2a023715a3717c36528bd3ab
Author: Matthias Boehm 
AuthorDate: Thu Jun 11 14:08:13 2020 +0200

[SYSTEMDS-412] Fix robustness lineage DAGs, parfor integration

This patch makes further robustness improvements to the handling of
large lineage DAGs via non-recursive primitives. In this context,
explain needed special treatment to preserve the previous output in DFS
order w/ post-append.

Furthermore, this also fixes a number of issues of the parfor
integration such as (1) invalid cached hashes after sub-DAG replacement,
(2) introduced cycles during parfor lineage merge, (3) steplm script
improvements (disabled parfor dependency analysis was hiding the issue
that introduced the cycles), and (4) some debugging functionality to
reliably detect cycles in lineage DAGs.
---
 scripts/builtin/steplm.dml | 20 
 .../instructions/cp/DataGenCPInstruction.java  |  2 +-
 .../apache/sysds/runtime/lineage/LineageItem.java  |  5 ++
 .../sysds/runtime/lineage/LineageItemUtils.java| 55 --
 src/main/java/org/apache/sysds/utils/Explain.java  | 49 +++
 .../test/functions/lineage/LineageReuseAlg.java| 37 ++-
 .../functions/lineage/LineageTraceParforSteplm.dml |  4 +-
 7 files changed, 134 insertions(+), 38 deletions(-)

diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml
index 01f35ba..800c2ca 100644
--- a/scripts/builtin/steplm.dml
+++ b/scripts/builtin/steplm.dml
@@ -98,7 +98,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, 
Integer icpt = 0,
 
   # First pass to examine single features
   AICs = matrix(0, 1, m_orig);
-  parfor (i in 1:m_orig, check = 0) {
+  parfor (i in 1:m_orig) {
 [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, 
maxi, verbose);
 AICs[1, i] = AIC_1;
 beta_out_all[1:nrow(beta_out_i), i] = beta_out_i;
@@ -129,25 +129,25 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, 
Integer icpt = 0,
 while (continue) {
   # Subsequent passes over the features
   beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
-  AICs = matrix(0, 1, m_orig); # full overwrite
-  parfor (i in 1:m_orig, check = 0) {
+  AICs_2 = matrix(0, 1, m_orig); # full overwrite
+  parfor (i in 1:m_orig) {
 if (as.scalar(columns_fixed[1, i]) == 0) {
   # Construct the feature matrix
-  X = cbind(X_global, X_orig[, i]);
-  [AIC_2, beta_out_i] = linear_regression(X, y, icpt, reg, tol, maxi, 
verbose);
-  AICs[1, i] = AIC_2;
-  beta_out_all_2[1:nrow(beta_out_i), i] = beta_out_i;
+  Xi = cbind(X_global, X_orig[, i]);
+  [AIC_2, beta_out_i2] = linear_regression(Xi, y, icpt, reg, tol, 
maxi, verbose);
+  AICs_2[1, i] = AIC_2;
+  beta_out_all_2[1:nrow(beta_out_i2), i] = beta_out_i2;
 }
 else {
-  AICs[1,i] = Inf;
+  AICs_2[1,i] = Inf;
 }
   }
 
   # Determine the best AIC
   AIC_best_orig = AIC_best;
-  AIC_best = min(min(AICs), AIC_best_orig);
+  AIC_best = min(min(AICs_2), AIC_best_orig);
   AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
-  column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 
column_best);
+  column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs_2)), 
column_best);
 
   # have the best beta store in the matrix
   beta_best = beta_out_all_2[, column_best];
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
index baacca6..8d688b8 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
@@ -402,7 +402,7 @@ public class DataGenCPInstruction extends 
UnaryCPInstruction {
tmpInstStr, position, 
String.valueOf(runtimeSeed)) : tmpInstStr;
}
//replace output variable name with a 
placeholder
-   //tmpInstStr = 
InstructionUtils.replaceOperandName(tmpInstStr);
+   tmpInstStr = 
InstructionUtils.replaceOperandName(tmpInstStr);
tmpInstStr = replaceNonLiteral(tmpInstStr, 
rows, 2, ec);
tmpInstStr = replaceNonLiteral(tmpInstStr, 
cols, 3, ec);
break;
diff --git

[systemml] branch master updated: [SYSTEMDS-412] Robustness lineage DAG ops (non-recursive resetVisit)

2020-06-07 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 86f7b1f  [SYSTEMDS-412] Robustness lineage DAG ops (non-recursive 
resetVisit)
86f7b1f is described below

commit 86f7b1f47bc1f1f4291e97adc3c5d996b0dc67ba
Author: Matthias Boehm 
AuthorDate: Sun Jun 7 22:29:56 2020 +0200

[SYSTEMDS-412] Robustness lineage DAG ops (non-recursive resetVisit)

This patch is a first step toward making the lineage DAG more robust
with regard to stack overflow errors, which occur for example in default
JVM configuration when writing out lineage DAGs of a depth >10,000s of
nodes. We use simple non-recursive stacks to perform these operations,
but explain and similar operations require some additional queueing to
preserve the previous output format (no need to break backwards
compatibility to previous releases).
---
 .../apache/sysds/runtime/lineage/LineageCache.java |  2 +-
 .../apache/sysds/runtime/lineage/LineageItem.java  | 43 +++---
 .../sysds/runtime/lineage/LineageItemUtils.java| 14 +++
 src/main/java/org/apache/sysds/utils/Explain.java  | 12 +++---
 4 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index 9f53395..cb2d13b 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -255,7 +255,7 @@ public class LineageCache
String boundVarName = outputs.get(i).getName();
LineageItem boundLI = ec.getLineage().get(boundVarName);
if (boundLI != null)
-   boundLI.resetVisitStatus();
+   boundLI.resetVisitStatusNR();
if (boundLI == null || !LineageCache.probe(li) || 
!LineageCache.probe(boundLI)) {
AllOutputsCacheable = false;
}
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java
index e5345e8..38a4cb9 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageItem.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysds.runtime.lineage;
 
+import java.util.Stack;
+
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysds.runtime.util.UtilFunctions;
@@ -128,9 +130,9 @@ public class LineageItem {
if (!(o instanceof LineageItem))
return false;

-   resetVisitStatus();
+   resetVisitStatusNR();
boolean ret = equalsLI((LineageItem) o);
-   resetVisitStatus();
+   resetVisitStatusNR();
return ret;
}

@@ -180,16 +182,47 @@ public class LineageItem {
return !_opcode.isEmpty();
}

-   public LineageItem resetVisitStatus() {
+   /**
+* Non-recursive equivalent of {@link #resetVisitStatus()} 
+* for robustness with regard to stack overflow errors.
+*/
+   public void resetVisitStatusNR() {
+   Stack q = new Stack<>();
+   q.push(this);
+   while( !q.empty() ) {
+   LineageItem tmp = q.pop();
+   if( !tmp.isVisited() )
+   continue;
+   if (tmp.getInputs() != null)
+   for (LineageItem li : tmp.getInputs())
+   q.push(li);
+   tmp.setVisited(false);
+   }
+   }
+   
+   /**
+* Non-recursive equivalent of {@link #resetVisitStatus(LineageItem[])} 
+* for robustness with regard to stack overflow errors.
+* 
+* @param lis root lineage items
+*/
+   public static void resetVisitStatusNR(LineageItem[] lis) {
+   if (lis != null)
+   for (LineageItem liRoot : lis)
+   liRoot.resetVisitStatusNR();
+   }
+   
+   @Deprecated
+   public void resetVisitStatus() {
if (!isVisited())
-   return this;
+   return;
if (_inputs != null)
for (LineageItem li : getInputs())
li.resetVisitStatus();
setVisited(false);
-   return this;
}

+   @Deprecated
public st

[systemml] branch master updated: [MINOR] Fix reading dml scripts from dist fs / object store

2020-06-05 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 9d5999d  [MINOR] Fix reading dml scripts from dist fs / object store
9d5999d is described below

commit 9d5999dc91df37beaddbdd48fe3c7487188f52a7
Author: Matthias Boehm 
AuthorDate: Fri Jun 5 17:45:36 2020 +0200

[MINOR] Fix reading dml scripts from dist fs / object store
---
 src/main/java/org/apache/sysds/api/DMLScript.java | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java 
b/src/main/java/org/apache/sysds/api/DMLScript.java
index a069182..6854ebb 100644
--- a/src/main/java/org/apache/sysds/api/DMLScript.java
+++ b/src/main/java/org/apache/sysds/api/DMLScript.java
@@ -112,8 +112,6 @@ public class DMLScript
public static String _uuid = IDHandler.createDistributedUniqueID();
private static final Log LOG = 
LogFactory.getLog(DMLScript.class.getName());
 
-   private static FileSystem fs = null;
-
///
// public external interface

@@ -283,7 +281,7 @@ public class DMLScript
|| 
IOUtilFunctions.isObjectStoreFileScheme(new Path(fileName)) )
{ 
Path scriptPath = new Path(fileName);
-   fs = 
IOUtilFunctions.getFileSystem(scriptPath);
+   FileSystem fs = 
IOUtilFunctions.getFileSystem(scriptPath);
in = new BufferedReader(new 
InputStreamReader(fs.open(scriptPath)));
}
// from local file system
@@ -303,8 +301,6 @@ public class DMLScript
throw ex;
}
finally {
-   if(fs != null)
-   fs.close();
IOUtilFunctions.closeSilently(in);
}




[systemml] branch master updated: [MINOR] Fix BuiltinFunctionExpression

2020-06-03 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new d8c9495  [MINOR] Fix BuiltinFunctionExpression
d8c9495 is described below

commit d8c9495ae6d8e0507d113718b23dfd8fa5035e6d
Author: Sebastian 
AuthorDate: Wed Jun 3 18:57:30 2020 +0200

[MINOR] Fix BuiltinFunctionExpression

Closes #937.
Closes #944.
---
 .../java/org/apache/sysds/parser/BuiltinFunctionExpression.java   | 8 
 1 file changed, 8 insertions(+)

diff --git 
a/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java 
b/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java
index 96e2ebc..b358396 100644
--- a/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java
+++ b/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java
@@ -912,6 +912,14 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
case NROW:
case NCOL:
case LENGTH:
+   checkNumParameters(1);
+   checkDataTypeParam(getFirstExpr(), 
+   DataType.FRAME, DataType.LIST, DataType.MATRIX);
+   output.setDataType(DataType.SCALAR);
+   output.setDimensions(0, 0);
+   output.setBlocksize(0);
+   output.setValueType(ValueType.INT64);
+   break;
case COUNT_DISTINCT:
case COUNT_DISTINCT_APPROX:
checkNumParameters(1);



[systemml] branch master updated: [SYSTEMDS-397] Neural Collaborative Filtering (NCF) algorithm script

2020-06-02 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 5e0d5c4  [SYSTEMDS-397] Neural Collaborative Filtering (NCF) algorithm 
script
5e0d5c4 is described below

commit 5e0d5c45162c7f26e6003659d58f091a3a794f11
Author: Patrick Deutschmann 
AuthorDate: Tue Jun 2 23:29:57 2020 +0200

[SYSTEMDS-397] Neural Collaborative Filtering (NCF) algorithm script

AMLS project SS2020.
Closes #925.
---
 dev/docs/Tasks.txt |   1 +
 .../Example - Neural Collaborative Filtering.ipynb | 347 +
 scripts/nn/examples/README.md  |   7 +
 scripts/nn/examples/ncf-dummy-data.dml |  57 
 scripts/nn/examples/ncf-real-data.dml  |  65 
 scripts/staging/NCF.dml| 330 
 6 files changed, 807 insertions(+)

diff --git a/dev/docs/Tasks.txt b/dev/docs/Tasks.txt
index f3d4acd..8c6b306 100644
--- a/dev/docs/Tasks.txt
+++ b/dev/docs/Tasks.txt
@@ -311,6 +311,7 @@ SYSTEMDS-390 New Builtin Functions IV
  * 394 Builtin for one-hot encoding of matrix (not frame), see table  OK
  * 395 SVM rework and utils (confusionMatrix, msvmPredict)OK
  * 396 Builtin for counting number of distinct values OK
+ * 397 Algorithm for neural collaborative filtering (NCF) OK
 
 SYSTEMDS-400 Spark Backend Improvements
  * 401 Fix output block indexes of rdiag (diagM2V)OK
diff --git a/scripts/nn/examples/Example - Neural Collaborative Filtering.ipynb 
b/scripts/nn/examples/Example - Neural Collaborative Filtering.ipynb
new file mode 100644
index 000..5c047fd
--- /dev/null
+++ b/scripts/nn/examples/Example - Neural Collaborative Filtering.ipynb
@@ -0,0 +1,347 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Neural Collaborative Filtering (NCF)\n",
+"\n",
+"This examples trains a neural network on the MovieLens data set using the 
concept of [Neural Collaborative Filtering 
(NCF)](https://dl.acm.org/doi/abs/10.1145/3038912.3052569) that is aimed at 
approaching recommendation problems using deep neural networks as opposed to 
common matrix factorization approaches."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## Setup and Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+"import numpy as np\n",
+"import pandas as pd\n",
+"import matplotlib.pyplot as plt\n",
+"from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## Download Data - MovieLens"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"The MovieLens data set is provided by the Unniversity of Minnesota and 
the GroupLens Research Group:\n",
+"\n",
+"> This dataset (ml-latest-small) describes 5-star rating and free-text 
tagging activity from [MovieLens](http://movielens.org/), a movie 
recommendation service. It contains 100836 ratings and 3683 tag applications 
across 9742 movies. These data were created by 610 users between March 29, 1996 
and September 24, 2018. This dataset was generated on September 26, 
2018.\n",
+"Users were selected at random for inclusion. All selected users had rated 
at least 20 movies. No demographic information is included. Each user is 
represented by an id, and no other information is provided.\n",
+"The data are contained in the files links.csv, movies.csv, ratings.csv 
and tags.csv. More details about the contents and use of all these files 
follows.\n",
+"This is a development dataset. As such, it may change over time and is 
not an appropriate dataset for shared research results. See available benchmark 
datasets if that is your intent.\n",
+"This and other GroupLens data sets are publicly available for download at 
http://grouplens.org/datasets/.;
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Archive:  ml-latest-small.zip\n",
+  "   creating: ml-latest-small/\n",

[systemml] branch master updated: [SYSTEMDS-396] Distinct values count/estimation functions

2020-06-02 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 02e5c6d  [SYSTEMDS-396] Distinct values count/estimation functions
02e5c6d is described below

commit 02e5c6db0dd5d02416e45874253c59db04151605
Author: Sebastian 
AuthorDate: Tue Jun 2 22:07:34 2020 +0200

[SYSTEMDS-396] Distinct values count/estimation functions

New function for counting the number of distinct values in a
MatrixBlock. It is using the builtin AggregateInstructions to parse
through hop lop. It can be called to execute with different types of
estimators:

- count : The default implementation that counts by adding to an
hashmap.
  Not memory efficient, but returns exact counts.
- KMV : An estimation algorithm K Minimum Values
- HLL : An estimation algorithm Hyper Log Log (Not finished)

Closes #909.
---
 .github/workflows/functionsTests.yml   |   1 +
 dev/docs/Tasks.txt |   1 +
 .../java/org/apache/sysds/common/Builtins.java |   2 +
 src/main/java/org/apache/sysds/common/Types.java   |   4 +-
 .../org/apache/sysds/lops/PartialAggregate.java|  21 +-
 .../sysds/parser/BuiltinFunctionExpression.java|   5 +-
 .../org/apache/sysds/parser/DMLTranslator.java |   2 +
 .../sysds/runtime/functionobjects/Builtin.java |   3 +-
 .../runtime/instructions/CPInstructionParser.java  |   2 +
 .../cp/AggregateUnaryCPInstruction.java|  24 +-
 .../matrix/data/LibMatrixCountDistinct.java| 277 +
 .../matrix/operators/CountDistinctOperator.java|  64 +
 .../apache/sysds/runtime/util/DataConverter.java   |  20 ++
 src/main/java/org/apache/sysds/utils/Hash.java | 133 ++
 src/test/java/org/apache/sysds/test/TestUtils.java |  38 ++-
 .../test/component/matrix/CountDistinctTest.java   | 195 +++
 .../apache/sysds/test/component/misc/UtilHash.java | 106 
 .../builtin/BuiltinFactorizationTest.java  |   2 +-
 .../functions/countDistinct/CountDistinct.java |  49 
 .../countDistinct/CountDistinctApprox.java |  56 +
 .../functions/countDistinct/CountDistinctBase.java | 109 
 .../functions/countDistinct/countDistinct.dml  |  24 ++
 .../countDistinct/countDistinctApprox.dml  |  24 ++
 23 files changed, 1150 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/functionsTests.yml 
b/.github/workflows/functionsTests.yml
index b983018..fb1a5bb 100644
--- a/.github/workflows/functionsTests.yml
+++ b/.github/workflows/functionsTests.yml
@@ -49,6 +49,7 @@ jobs:
   codegen,
   codegenalg.partone,
   codegenalg.parttwo,
+  countDistinct,
   data.misc,
   data.rand,
   data.tensor,
diff --git a/dev/docs/Tasks.txt b/dev/docs/Tasks.txt
index 9a51eb5..f3d4acd 100644
--- a/dev/docs/Tasks.txt
+++ b/dev/docs/Tasks.txt
@@ -310,6 +310,7 @@ SYSTEMDS-390 New Builtin Functions IV
  * 393 Builtin to find Connected Components of a graphOK
  * 394 Builtin for one-hot encoding of matrix (not frame), see table  OK
  * 395 SVM rework and utils (confusionMatrix, msvmPredict)OK
+ * 396 Builtin for counting number of distinct values OK
 
 SYSTEMDS-400 Spark Backend Improvements
  * 401 Fix output block indexes of rdiag (diagM2V)OK
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 7345077..5ee7a79 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -178,6 +178,8 @@ public enum Builtins {
TRACE("trace", false),
TO_ONE_HOT("toOneHot", true),
TYPEOF("typeOf", false),
+   COUNT_DISTINCT("countDistinct",false),
+   COUNT_DISTINCT_APPROX("countDistinctApprox",false),
VAR("var", false),
XOR("xor", false),
WINSORIZE("winsorize", true, false), //TODO parameterize w/ prob, 
min/max val
diff --git a/src/main/java/org/apache/sysds/common/Types.java 
b/src/main/java/org/apache/sysds/common/Types.java
index 2d66e81..996132f 100644
--- a/src/main/java/org/apache/sysds/common/Types.java
+++ b/src/main/java/org/apache/sysds/common/Types.java
@@ -175,7 +175,9 @@ public class Types
PROD, SUM_PROD,
MIN, MAX,
TRACE, MEAN, VAR,
-   MAXINDEX, MININDEX;
+   MAXINDEX, MININDEX,
+   COUNT_DISTINCT,
+   COUNT_DISTINCT_APPROX;

@Override
public String toString() {
diff --git a/src/main/java/org/apache/sysds/lops/PartialAggregate.java 
b/src/mai

[systemml] branch master updated: [MINOR] Fix multi-threaded federated MV multiply, and test issues

2020-05-31 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 0118a3e  [MINOR] Fix multi-threaded federated MV multiply, and test 
issues
0118a3e is described below

commit 0118a3eef317826cf79bf01471f07a67631cee64
Author: Matthias Boehm 
AuthorDate: Sun May 31 22:41:59 2020 +0200

[MINOR] Fix multi-threaded federated MV multiply, and test issues

So far, the federated matrix-vector multiplications were always executed
in a single-threaded manner; now we execute them according to the local
parallelism configuration at the federated worker.

Also, it seems I introduced a bug of privacy handling during the merge,
which this patch also fixes (e.g., on scalar casts of non-cacheable data
objects).
---
 .../federated/FederatedWorkerHandler.java  |  8 +++-
 .../cp/AggregateBinaryCPInstruction.java   | 23 ++
 .../instructions/cp/VariableCPInstruction.java |  3 ---
 .../gpu/AggregateBinaryGPUInstruction.java |  4 +---
 .../instructions/spark/CpmmSPInstruction.java  | 12 +++
 .../instructions/spark/MapmmSPInstruction.java | 21 ++--
 .../instructions/spark/PMapmmSPInstruction.java| 17 
 .../instructions/spark/PmmSPInstruction.java   |  6 +-
 .../instructions/spark/ZipmmSPInstruction.java |  3 +--
 .../sysds/runtime/privacy/PrivacyMonitor.java  |  2 +-
 .../compress/ParCompressedMatrixTest.java  | 10 +++---
 11 files changed, 29 insertions(+), 80 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java
 
b/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java
index bba731c..6fe814a 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/federated/FederatedWorkerHandler.java
@@ -38,15 +38,13 @@ import 
org.apache.sysds.runtime.controlprogram.caching.FrameObject;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.caching.TensorObject;
 import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
-import org.apache.sysds.runtime.functionobjects.Multiply;
-import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.Data;
 import org.apache.sysds.runtime.instructions.cp.ListObject;
 import org.apache.sysds.runtime.io.IOUtilFunctions;
 import org.apache.sysds.runtime.matrix.data.LibMatrixAgg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator;
-import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
 import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.runtime.meta.MatrixCharacteristics;
@@ -187,8 +185,8 @@ public class FederatedWorkerHandler extends 
ChannelInboundHandlerAdapter {
matTo = PrivacyMonitor.handlePrivacy(matTo);
MatrixBlock matBlock1 = matTo.acquireReadAndRelease();
// TODO other datatypes
-   AggregateBinaryOperator ab_op = new AggregateBinaryOperator(
-   Multiply.getMultiplyFnObject(), new 
AggregateOperator(0, Plus.getPlusFnObject()));
+   AggregateBinaryOperator ab_op = InstructionUtils
+   
.getMatMultOperator(OptimizerUtils.getConstrainedNumThreads(0));
MatrixBlock result = isMatVecMult ?
matBlock1.aggregateBinaryOperations(matBlock1, vector, 
new MatrixBlock(), ab_op) :
vector.aggregateBinaryOperations(vector, matBlock1, new 
MatrixBlock(), ab_op);
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java
index 1e3186d..0df8108 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateBinaryCPInstruction.java
@@ -19,17 +19,12 @@
 
 package org.apache.sysds.runtime.instructions.cp;
 
-import org.apache.sysds.common.Types.DataType;
-import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysds.runtime.functionobjects.Multiply;
-import

[systemml] branch master updated: [SYSTEMDS-362] Federated runtime propagation of privacy constraints

2020-05-31 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 98cb93d  [SYSTEMDS-362] Federated runtime propagation of privacy 
constraints
98cb93d is described below

commit 98cb93d0dbbc2f1fabc3796bbc21aca1874eed5f
Author: sebwrede 
AuthorDate: Sun May 31 19:41:45 2020 +0200

[SYSTEMDS-362] Federated runtime propagation of privacy constraints

* Runtime propagation of privacy constraints
* Privacy level as Enum with three levels: Private, PrivateAggregate,
and None
* Privacy handling in FederatedWorkerHandler preventing private data
from being included in federated response
* Test of privacy handling of different federated request types
* Test of different privacy levels and combinations for Federated L2SVM

Closes #919.
---
 src/main/java/org/apache/sysds/hops/Hop.java   |   2 +-
 .../org/apache/sysds/parser/BinaryExpression.java  |   2 +-
 .../org/apache/sysds/parser/DataExpression.java|  11 +-
 .../java/org/apache/sysds/parser/Identifier.java   |   5 +-
 .../federated/FederatedWorkerHandler.java  |   7 +
 .../sysds/runtime/instructions/Instruction.java|   4 +
 .../instructions/cp/BuiltinNaryCPInstruction.java  |   8 +
 .../runtime/instructions/cp/CPInstruction.java |   3 +
 .../instructions/cp/QuaternaryCPInstruction.java   |   3 +
 .../instructions/cp/VariableCPInstruction.java | 518 -
 .../runtime/instructions/fed/FEDInstruction.java   |   5 +-
 .../instructions/spark/ReblockSPInstruction.java   |   2 +-
 ...acyConstraint.java => DMLPrivacyException.java} |  38 +-
 .../sysds/runtime/privacy/PrivacyConstraint.java   |  30 +-
 .../sysds/runtime/privacy/PrivacyMonitor.java  |  96 
 .../sysds/runtime/privacy/PrivacyPropagator.java   | 315 -
 .../org/apache/sysds/runtime/util/HDFSTool.java|   7 +-
 .../test/functions/privacy/FederatedL2SVMTest.java | 384 +++
 .../privacy/FederatedWorkerHandlerTest.java| 339 ++
 .../MatrixMultiplicationPropagationTest.java   |  53 ++-
 .../privacy/MatrixRuntimePropagationTest.java  | 123 +
 .../privacy/MatrixRuntimePropagationTest.dml   |  28 ++
 22 files changed, 1695 insertions(+), 288 deletions(-)

diff --git a/src/main/java/org/apache/sysds/hops/Hop.java 
b/src/main/java/org/apache/sysds/hops/Hop.java
index f0ef363..24aade1 100644
--- a/src/main/java/org/apache/sysds/hops/Hop.java
+++ b/src/main/java/org/apache/sysds/hops/Hop.java
@@ -73,7 +73,7 @@ public abstract class Hop implements ParseInfo
protected ValueType _valueType;
protected boolean _visited = false;
protected DataCharacteristics _dc = new MatrixCharacteristics();
-   protected PrivacyConstraint _privacyConstraint = new 
PrivacyConstraint();
+   protected PrivacyConstraint _privacyConstraint = null;
protected UpdateType _updateType = UpdateType.COPY;
 
protected ArrayList _parent = new ArrayList<>();
diff --git a/src/main/java/org/apache/sysds/parser/BinaryExpression.java 
b/src/main/java/org/apache/sysds/parser/BinaryExpression.java
index 6c177e2..acccb66 100644
--- a/src/main/java/org/apache/sysds/parser/BinaryExpression.java
+++ b/src/main/java/org/apache/sysds/parser/BinaryExpression.java
@@ -146,7 +146,7 @@ public class BinaryExpression extends Expression
}
 
// Set privacy of output
-   output.setPrivacy(PrivacyPropagator.MergeBinary(
+   output.setPrivacy(PrivacyPropagator.mergeBinary(
getLeft().getOutput().getPrivacy(), 
getRight().getOutput().getPrivacy()));
 
this.setOutput(output);
diff --git a/src/main/java/org/apache/sysds/parser/DataExpression.java 
b/src/main/java/org/apache/sysds/parser/DataExpression.java
index c94532d..779f788 100644
--- a/src/main/java/org/apache/sysds/parser/DataExpression.java
+++ b/src/main/java/org/apache/sysds/parser/DataExpression.java
@@ -37,6 +37,7 @@ import org.apache.sysds.runtime.DMLRuntimeException;
 import 
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.io.FileFormatPropertiesMM;
 import org.apache.sysds.runtime.io.IOUtilFunctions;
+import org.apache.sysds.runtime.privacy.PrivacyConstraint.PrivacyLevel;
 import org.apache.sysds.runtime.util.HDFSTool;
 import org.apache.sysds.runtime.util.UtilFunctions;
 import org.apache.sysds.utils.JSONHelper;
@@ -1097,10 +1098,8 @@ public class DataExpression extends DataIdentifier

// set privacy
Expression eprivacy = getVarParam("privacy");
-   boolean privacy = false;
-   if

[systemml] branch master updated: [SYSTEMDS-274] Fix compressed colMins/colMaxs w/ shared dictionary

2020-05-31 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new ea2d971  [SYSTEMDS-274] Fix compressed colMins/colMaxs w/ shared 
dictionary
ea2d971 is described below

commit ea2d971ec4ad3a0cf93fe78224a6f14176a6235b
Author: Matthias Boehm 
AuthorDate: Sun May 31 18:26:55 2020 +0200

[SYSTEMDS-274] Fix compressed colMins/colMaxs w/ shared dictionary

This patch fixes remaining issues of incorrect results for colMins and
colMaxs over compressed matrix blocks with shared DDC1 dictionaries.
Specifically, if the individual column groups have only partial overlap,
the shared dictionary contains a superset of column group distinct
values. Since aggregation functions like min and max are executed only
over the dictionary (without touching the compressed data), it led to
incorrect results as we find extreme values that do not actually exist
in the column group.

Three alternative approaches could solve this: (1) drop shared
dictionaries, (2) execute colMins and colMax over the compressed data,
or (3) refactor the double array dictionary into a proper class
hierarchy and maintain additional meta data for shared dictionaries. We
decided for (3) in order to keep predictable performance, irrespective
of shared dictionaries and because this class hierarchy allows for
further improvements of shared dictionaries between any subsets of
column groups.

Additionally, this fix also cleans up incorrect estimates of the
individual column groups (because getValueSize was used in the estimates
as a number of values, although it gave the size in bytes) as well as
some of the Class-layout size estimation tests.

Closes #927.
---
 dev/docs/Tasks.txt |   2 +-
 .../runtime/compress/CompressedMatrixBlock.java|   5 +-
 .../compress/CompressedMatrixBlockFactory.java |  59 ++
 .../sysds/runtime/compress/colgroup/ColGroup.java  |   7 --
 .../runtime/compress/colgroup/ColGroupDDC.java |   7 +-
 .../runtime/compress/colgroup/ColGroupDDC1.java|  34 +++---
 .../runtime/compress/colgroup/ColGroupDDC2.java|  39 ---
 .../runtime/compress/colgroup/ColGroupOLE.java |  33 --
 .../runtime/compress/colgroup/ColGroupOffset.java  |  43 ---
 .../runtime/compress/colgroup/ColGroupRLE.java |  30 +++--
 .../runtime/compress/colgroup/ColGroupSizes.java   |   3 +-
 .../runtime/compress/colgroup/ColGroupValue.java   | 126 -
 .../runtime/compress/colgroup/Dictionary.java  |  96 
 .../compress/colgroup/DictionaryShared.java|  79 +
 .../component/compress/CompressedMatrixTest.java   |  10 +-
 .../component/compress/CompressedTestBase.java |   7 +-
 .../compress/colgroup/JolEstimateTest.java |   8 +-
 .../compress/colgroup/JolEstimateTestEmpty.java|   6 +-
 18 files changed, 398 insertions(+), 196 deletions(-)

diff --git a/dev/docs/Tasks.txt b/dev/docs/Tasks.txt
index cd90a66..6b5dbb0 100644
--- a/dev/docs/Tasks.txt
+++ b/dev/docs/Tasks.txt
@@ -223,7 +223,7 @@ SYSTEMDS-270 Compressed Matrix Blocks
  * 272 Simplify and speedup compression tests OK
  * 273 Refactor compressed Matrix Block to simplify responsibilities  OK
  * 273a Redesign allocation of ColGroups in ColGroupFactory
- * 274 Make the DDC Compression dictionary share correctly
+ * 274 Make the DDC Compression dictionary share correctlyOK
  * 275 Include compressionSettings in DMLConfiguration
  * 276 Allow Uncompressed Columns to be in sparse formats
  * 277 Sampling based estimators fix
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 3ad65c5..1085afc 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -243,9 +243,10 @@ public class CompressedMatrixBlock extends 
AbstractCompressedMatrixBlock {
if(_sharedDDC1Dict) {
boolean seenDDC1 = false;
for(ColGroup grp : _colGroups)
-   if(grp.getNumCols() == 1 && grp.getCompType() 
== CompressionType.DDC) {
+   if(grp.getNumCols() == 1 && grp instanceof 
ColGroupDDC1) {
+   ColGroupDDC1 grpDDC = (ColGroupDDC1) 
grp;
if(seenDDC1)
-   total -= grp.getValuesSize();
+   total -= 
grpDDC.getDictionarySize();
   

[systemml] branch master updated: [MINOR] Fix invalid consistency checks of spark append_aligned rbind

2020-05-29 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 405462e  [MINOR] Fix invalid consistency checks of spark 
append_aligned rbind
405462e is described below

commit 405462e84ad1192e447bb09c03fe20d112bf6afb
Author: Matthias Boehm 
AuthorDate: Sat May 30 00:39:07 2020 +0200

[MINOR] Fix invalid consistency checks of spark append_aligned rbind
---
 .../apache/sysds/runtime/instructions/spark/BinarySPInstruction.java| 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java
index dc4e09b..3c3e5b6 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/spark/BinarySPInstruction.java
@@ -459,7 +459,7 @@ public abstract class BinarySPInstruction extends 
ComputationSPInstruction {
}

if( checkAligned ) {
-   if( mc1.getCols() % mc1.getBlocksize() != 0 )
+   if( (cbind ? mc1.getCols() : mc1.getRows()) % 
mc1.getBlocksize() != 0 )
throw new DMLRuntimeException("Input matrices 
are not aligned to blocksize boundaries. Wrong append selected");
}
}



[systemml] branch master updated: [SYSTEMDS-393] Performance distributed connected components

2020-05-29 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new be7191c  [SYSTEMDS-393] Performance distributed connected components
be7191c is described below

commit be7191c2502ad7f5445891ceb671f335e88e51c9
Author: Matthias Boehm 
AuthorDate: Sat May 30 00:15:11 2020 +0200

[SYSTEMDS-393] Performance distributed connected components

This patch makes a few tweaks to significantly improve the performance
of the new connected components builtin function where the graph G does
not fit in the driver memory and thus, spawns distributed spark
operations.

The test case was a 1M x 1M graph with 1G edges, ran with driver memory
of 10GB and 9 executors 80GB each. The baseline runtime of 10 calls to
connected components (each requiring 4 iterations until convergence) was
pretty bad with 1,512s due to excessive shuffle and GC overhead.

1) Modified Script: Removed the unnecessary removal of self-edges as the
chosen update rule is robust enough to handle both cases. This removed
the excessive shuffling overhead for matrix-matrix binary operations
without existing hash partitioning. This change alone reduced the total
runtime of 10 calls to 760s.

2) Handling of approximately known sparsity: The large GC overhead was
due to not converting the MCSR representation into read-optimized CSR
during checkpointing (spark caching). We now compute these conditions
with the upper bound information that is available in cases where the
exact nnz is unknown. This further reduced the total runtime to 131s

With codegen the runtime is further slightly improved to 120s (including
spark context creation, and matrix creation) as we avoid materializing G
* t(c) in memory by fusing it with rowMaxs(G * t(c)). For 40 update rule
computations (and thus scans of the graph), this is fairly reasonable.
---
 scripts/builtin/components.dml   | 9 -
 src/main/java/org/apache/sysds/hops/OptimizerUtils.java  | 8 ++--
 .../sysds/runtime/instructions/spark/RandSPInstruction.java  | 3 +++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml
index 51d96db..5f37c07 100644
--- a/scripts/builtin/components.dml
+++ b/scripts/builtin/components.dml
@@ -27,11 +27,10 @@
 m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose = 
TRUE) 
   return (Matrix[Double] C) 
 {
-  # ensure there are no self-edges in the graph
-  if( trace(G) != 0 ) {
-G = G - diag(diag(G));
-if(verbose)
-  print("Connected Components: warning - removed self-edges from input 
graph");
+  # best effort check for symmetry (not exact but fast)
+  if( sum(rowSums(G) != t(colSums(G))) > 0 ) {
+stop("Connected Components: input graph needs to be "
+   + "symmetric but rowSums and colSums don't match up.");
   }
 
   # initialize state with vertex ids
diff --git a/src/main/java/org/apache/sysds/hops/OptimizerUtils.java 
b/src/main/java/org/apache/sysds/hops/OptimizerUtils.java
index ef2b5ff..213041f 100644
--- a/src/main/java/org/apache/sysds/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysds/hops/OptimizerUtils.java
@@ -477,8 +477,12 @@ public class OptimizerUtils
}
 
public static boolean checkSparseBlockCSRConversion( 
DataCharacteristics dcIn ) {
-   return Checkpoint.CHECKPOINT_SPARSE_CSR
-   && OptimizerUtils.getSparsity(dcIn) < 
MatrixBlock.SPARSITY_TURN_POINT;
+   //we use the non-zero bound to make the important csr decision 
in 
+   //an best effort manner (the precise non-zeros is irrelevant 
here)
+   double sp = OptimizerUtils.getSparsity(
+   dcIn.getRows(), dcIn.getCols(), 
dcIn.getNonZerosBound());
+   return Checkpoint.CHECKPOINT_SPARSE_CSR 
+   && sp < MatrixBlock.SPARSITY_TURN_POINT;
}

/**
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java
index ef40773..17315f0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/spark/RandSPInstruction.java
@@ -403,8 +403,11 @@ public class RandSPInstruction extends UnarySPInstruction {
if(!mcOut.dimsKnown(true)) {
//note: we cannot compute the nnz from sparsity because 
this would not reflect the
//actual number of non-zeros, except for extreme values 
of sp

[systemml] branch master updated: [SYSTEMDS-335] Updated weighted eviction scheme for lineage cache

2020-05-26 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 822b492  [SYSTEMDS-335] Updated weighted eviction scheme for lineage 
cache
822b492 is described below

commit 822b4922b938ece3a23204823f818545d471bae4
Author: arnabp 
AuthorDate: Tue May 26 21:01:28 2020 +0200

[SYSTEMDS-335] Updated weighted eviction scheme for lineage cache

This patch updates the weighted scheme by adding an elaborate scoring
function. The function has two components, a ratio of compute time,
in-memory size, and a last used timestamp. The components are associated
with weights, which can tune the eviction policies (e.g. weights 0 and 1
for time/size and timestamp respectively translate to LRU scheme). This
patch also replaces the earlier PriorityQueue by a TreeSet.

New eviction test, refactor LineageCacheConfig, eviction logic tuning.
This commit contains,
 1) Few updates in eviction logic. Thanks Matthias for catching an
 unneeded enqueue/dequeue.
 2) Refactoring of LineageCacheConfig class.
 3) A new test to compare the order of evicted items based on the
 specified policies.

Closes #915.
---
 docs/Tasks.txt |   2 +-
 .../sysds/runtime/lineage/LineageCacheConfig.java  | 154 +
 .../sysds/runtime/lineage/LineageCacheEntry.java   |  11 +-
 .../runtime/lineage/LineageCacheEviction.java  | 103 +-
 .../runtime/lineage/LineageCacheStatistics.java|   5 +-
 .../test/functions/dnn/Conv2DBackwardDataTest.java |   3 +-
 .../test/functions/dnn/Conv2DBackwardTest.java |   2 +-
 .../sysds/test/functions/dnn/Conv2DTest.java   |   2 +-
 .../sysds/test/functions/dnn/PoolBackwardTest.java |   2 +-
 .../apache/sysds/test/functions/dnn/PoolTest.java  |   2 +-
 .../sysds/test/functions/dnn/ReluBackwardTest.java |  44 ++
 .../test/functions/lineage/CacheEvictionTest.java  | 141 +++
 .../scripts/functions/lineage/CacheEviction1.dml   |  55 
 .../scripts/functions/lineage/LineageReuseAlg3.dml |   2 +-
 14 files changed, 357 insertions(+), 171 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 081a44b..91c966d 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -270,7 +270,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration
  * 332 Parfor integration with multi-level reuse  OK
  * 333 Improve cache eviction with actual compute timeOK
  * 334 Cache scalars only with atleast one matrix inputs
- * 335 Weighted eviction policy (function of size & computetime)  OK
+ * 335 Weighted eviction policy (function(size,computetime,LRU time)) OK
  * 336 Better use of cache status to handle multithreading
  * 337 Adjust disk I/O speed by recording actual time taken   OK
  * 338 Extended lineage tracing (rmEmpty, lists), partial rewritesOK
diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 888d27d..2a3c426 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -26,12 +26,14 @@ import org.apache.sysds.runtime.instructions.Instruction;
 import org.apache.sysds.runtime.instructions.cp.ComputationCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.ListIndexingCPInstruction;
 
-import java.util.ArrayList;
+import java.util.Comparator;
+
+public class LineageCacheConfig 
+{
+   //-CACHING LOGIC RELATED CONFIGURATIONS--//
 
-public class LineageCacheConfig {
-   
private static final String[] REUSE_OPCODES = new String[] {
-   "tsmm", "ba+*", "*", "/", "+", "nrow", "ncol",
+   "tsmm", "ba+*", "*", "/", "+", "nrow", "ncol", "round", "exp", 
"log",
"rightIndex", "leftIndex", "groupedagg", "r'", "solve", "spoof"
};

@@ -55,63 +57,81 @@ public class LineageCacheConfig {
|| DMLScript.LINEAGE_REUSE == NONE;
}
}
+
+   private static ReuseCacheType _cacheType = null;
+   private static CachedItemHead _itemH = null;
+   private static CachedItemTail _itemT = null;
+   private static boolean _compilerAssistedRW = false;
+
+   //-DISK SPILLING RELATED CONFIGURATIONS--//
+
+   private static boolean _allowSpill = false;
+   // Minimum reliable spilling estimate 

[systemml] branch master updated: [SYSTEMDS-209] Performance sparse matrix-colvector cell-wise multiply

2020-05-26 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 75648fe  [SYSTEMDS-209] Performance sparse matrix-colvector cell-wise 
multiply
75648fe is described below

commit 75648fe8a3817a4971480b09cce3ae0d694c7d06
Author: Matthias Boehm 
AuthorDate: Tue May 26 20:15:24 2020 +0200

[SYSTEMDS-209] Performance sparse matrix-colvector cell-wise multiply

While working on the new builtin function for connected components and
ultra-sparse graphs, we found that 'rowMaxs(G * t(c))' performed orders
of magnitude better than the semantically equivalent 't(colMaxs(G *
c))'. The reason was a missing handling of strict sparse-safe operations
for matrix-colvector operations, while this was already handled for
matrix-rowvector operations. In detail, we performed unnecessary
operations in the number of cells instead of in the number of non-zeros
leading to worse asymptotic behavior.

With the simple fix of this patch, now we have very similar performance.
For example, on a scenario of performing 100 times G*c where X is a
10Kx10K, sparsity=0.0001 matrix, total execution time (for 100
operations) improved from 4.2s to 167ms.
---
 docs/Tasks.txt   |  1 +
 .../apache/sysds/runtime/matrix/data/LibMatrixBincell.java   | 12 +---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 3c9782f..081a44b 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -171,6 +171,7 @@ SYSTEMDS-200 Various Fixes
  * 206 Fix codegen outer template compilation (tsmm)  OK
  * 207 Fix builtin function call hoisting from expressionsOK
  * 208 Fix bufferpool leak (live var analysis and createvar)  OK
+ * 209 Fix performance sparse M-CV elementwise multiply   OK
 
 SYSTEMDS-210 Extended lists Operations
  * 211 Cbind and Rbind over lists of matrices OK
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
index 44d6f6a..34464a6 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
@@ -393,10 +393,9 @@ public class LibMatrixBincell
int alen = a.size(i);
int[] aix = a.indexes(i);
double[] avals = a.values(i);
-   for( int j=apos; j

[systemml] branch master updated: [SYSTEMDS-393] Fix convergence condition of connected components builtin

2020-05-25 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new b66a3c0  [SYSTEMDS-393] Fix convergence condition of connected 
components builtin
b66a3c0 is described below

commit b66a3c006ce6a3a888653e2d1accec479cc756fd
Author: Matthias Boehm 
AuthorDate: Mon May 25 21:11:52 2020 +0200

[SYSTEMDS-393] Fix convergence condition of connected components builtin
---
 scripts/builtin/components.dml   |  2 +-
 .../test/functions/builtin/BuiltinComponentsTest.java| 16 +++-
 .../scripts/functions/builtin/ConnectedComponents.dml|  4 ++--
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml
index f760a49..51d96db 100644
--- a/scripts/builtin/components.dml
+++ b/scripts/builtin/components.dml
@@ -40,7 +40,7 @@ m_components = function(Matrix[Double] G, Integer maxi = 0, 
Boolean verbose = TR
   iter = 1;
 
   # iterative computation of connected components
-  while( diff > 0 & (maxi==0 | maxi<=iter) ) {
+  while( diff > 0 & (maxi==0 | iter<=maxi) ) {
 u = max(rowMaxs(G * t(c)), c);
 diff = sum(u != c)
 c = u; # update assignment
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
index e541f9d..8c1b05b 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
@@ -45,20 +45,25 @@ public class BuiltinComponentsTest extends 
AutomatedTestBase {
 
@Test
public void testConnectedComponents11CP() {
-   runConnectedComponentsTest(11, LopProperties.ExecType.CP);
+   runConnectedComponentsTest(11, 0, LopProperties.ExecType.CP);
}

@Test
public void testConnectedComponents201CP() {
-   runConnectedComponentsTest(201, LopProperties.ExecType.CP);
+   runConnectedComponentsTest(201, 0, LopProperties.ExecType.CP);
}

@Test
public void testConnectedComponents2001CP() {
-   runConnectedComponentsTest(2001, LopProperties.ExecType.CP);
+   runConnectedComponentsTest(2001, 0, LopProperties.ExecType.CP);
+   }
+   
+   @Test
+   public void testConnectedComponents11Maxi100CP() {
+   runConnectedComponentsTest(11, 100, LopProperties.ExecType.CP);
}
 
-   private void runConnectedComponentsTest(int numVertices, ExecType 
instType)
+   private void runConnectedComponentsTest(int numVertices, int maxi, 
ExecType instType)
{
Types.ExecMode platformOld = setExecMode(instType);
 
@@ -68,7 +73,8 @@ public class BuiltinComponentsTest extends AutomatedTestBase {
 
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
-   programArgs = new String[]{ "-args", input("X"), 
output("R")};
+   programArgs = new String[]{ "-args",
+   input("X"), String.valueOf(maxi), output("R")};
 
//generate actual dataset (3 components)
double[][] X = new double[numVertices-3][2];
diff --git a/src/test/scripts/functions/builtin/ConnectedComponents.dml 
b/src/test/scripts/functions/builtin/ConnectedComponents.dml
index 0c6fbe7..56403a8 100644
--- a/src/test/scripts/functions/builtin/ConnectedComponents.dml
+++ b/src/test/scripts/functions/builtin/ConnectedComponents.dml
@@ -23,6 +23,6 @@ X = read($1)
 n = max(X);
 G = table(X[,1], X[, 2], n, n)
 G = G + t(G); #symmetry
-C = components(G=G, verbose=FALSE)
+C = components(G=G, maxi=$2, verbose=FALSE)
 
-write(C, $2)
+write(C, $3)



[systemml] branch master updated: [MINOR] Integration of steplm builtin (avoid excessive test output)

2020-05-24 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new c61e7ac  [MINOR] Integration of steplm builtin (avoid excessive test 
output)
c61e7ac is described below

commit c61e7ac71a83df3d525b6131ca76cf8252c6802f
Author: Matthias Boehm 
AuthorDate: Sun May 24 17:19:39 2020 +0200

[MINOR] Integration of steplm builtin (avoid excessive test output)
---
 scripts/algorithms/StepLinearRegDS.dml | 324 +
 scripts/builtin/steplm.dml |  19 +-
 2 files changed, 16 insertions(+), 327 deletions(-)

diff --git a/scripts/algorithms/StepLinearRegDS.dml 
b/scripts/algorithms/StepLinearRegDS.dml
index 20b1777..a8740f5 100644
--- a/scripts/algorithms/StepLinearRegDS.dml
+++ b/scripts/algorithms/StepLinearRegDS.dml
@@ -79,331 +79,15 @@ fileX = $X;
 fileY = $Y;
 fileB = $B;
 fileS = $S;
-
 write_beta = ifdef($write_beta, TRUE);
-
-# currently only the forward selection strategy in supported: start from one 
feature and iteratively add 
-# features until AIC improves
-dir = "forward";
-
 fmt  = ifdef ($fmt, "text");
-intercept_status = ifdef ($icpt, 1);
+intercept = ifdef ($icpt, 1);
 thr = ifdef ($thr, 0.001);
 
-print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
-print ("Reading X and Y...");
 X_orig = read (fileX);
 y = read (fileY);
 
-n = nrow (X_orig);
-m_orig = ncol (X_orig);
-
-# BEGIN STEPWISE LINEAR REGRESSION
-
-if (dir == "forward") {
-  continue = TRUE;
-  columns_fixed = matrix (0, rows = 1, cols = m_orig);
-  columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
-
-  # X_global stores the best model found at each step
-  X_global = matrix (0, rows = n, cols = 1);
-
-  if (intercept_status == 1 | intercept_status == 2) {
-beta = mean (y);
-AIC_best = 2 + n * log(sum((beta - y)^2) / n);
-  } else {
-beta = 0.0;
-AIC_best = n * log(sum(y^2) / n);
-  }
-
-  AICs = matrix (AIC_best, rows = 1, cols = m_orig);
-  print ("Best AIC without any features: " + AIC_best);
-
-  boa_ncol = ncol(X_orig)
-  if (intercept_status != 0) {
-boa_ncol = boa_ncol + 1
-  }
-
-  beta_out_all = matrix(0, rows = boa_ncol, cols = m_orig * 1);
-
-  y_ncol = 1;
-
-  # First pass to examine single features
-  parfor (i in 1:m_orig, check = 0) {
-columns_fixed_ordered_1 = matrix(i, rows=1, cols=1);
-
-[AIC_1, beta_out_i] = linear_regression (X_orig[, i], y, m_orig, 
columns_fixed_ordered_1,
- write_beta, 0);
-
-AICs[1, i] = AIC_1;
-
-beta_out_all[1:nrow(beta_out_i), (i - 1) * y_ncol + 1 : i * y_ncol] = 
beta_out_i[, 1:1];
-
-  }
-
-  # Determine the best AIC
-  column_best = 0;
-  for (k in 1:m_orig) {
-AIC_cur = as.scalar (AICs[1, k]);
-if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) 
) {
-  column_best = k;
-  AIC_best = as.scalar(AICs[1, k]);
-}
-  }
-
-  # beta best so far
-  beta_best = beta_out_all[, (column_best-1) * y_ncol + 1: column_best * 
y_ncol];
-
-  if (column_best == 0) {
-print ("AIC of an empty model is " + AIC_best + " and adding no feature 
achieves more than " +
-   (thr * 100) + "% decrease in AIC!");
-Selected = matrix (0, rows = 1, cols = 1);
-if (intercept_status == 0) {
-  B = matrix (beta, rows = m_orig, cols = 1);
-} else {
-  B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
-  B_tmp[m_orig + 1, ] = beta;
-  B = B_tmp;
-}
-
-beta_out = B;
-
-write(Selected, fileS, format=fmt);
-write(beta_out, fileB, format=fmt);
-
-stop ("");
-  }
-  print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-  columns_fixed[1, column_best] = 1;
-  columns_fixed_ordered[1, 1] = column_best;
-  X_global = X_orig[, column_best];
-
-while (continue) {
-# Subsequent passes over the features
-beta_out_all_2 = matrix(0, rows = boa_ncol, cols = m_orig * 1);
-
-parfor (i in 1:m_orig, check = 0) {
-  if (as.scalar(columns_fixed[1, i]) == 0) {
-
-# Construct the feature matrix
-X = cbind (X_global, X_orig[, i]);
-
-tmp = matrix(0, rows=1, cols=1);
-tmp[1, 1] = i;
-columns_fixed_ordered_2 = append(columns_fixed_ordered, tmp )
-[AIC_2, beta_out_i] = linear_regression (X, y, m_orig, 
columns_fixed_ordered_2, write_beta, 0);
-beta_out_all_2[1:nrow(beta_out_i), (i - 1) * y_ncol + 1 : i * y_ncol] 
= beta_out_i[,1:1];
-
-AICs[1, i] = AIC_2;
-  }
-}
-
-# Determine the best AIC
-for (k in 1:m_orig) {
-  AIC_cur = as.scalar (AICs[1, k]);
-  if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * 
AIC_best)) &
-(as.scalar(columns_fixed[1, k]

[systemml] branch master updated: [MINOR] Fix l2svm algorithm and cleanup codegen/builtin tests

2020-05-24 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 407f736  [MINOR] Fix l2svm algorithm and cleanup codegen/builtin tests
407f736 is described below

commit 407f736720edc865d9547792c46f982675e292db
Author: Matthias Boehm 
AuthorDate: Sun May 24 16:52:06 2020 +0200

[MINOR] Fix l2svm algorithm and cleanup codegen/builtin tests

This patch makes some minor fixes to the algorithm integration of the
builtin l2svm function (which failed the codegen tests), and cleans up
the codegen and builtin tests via result caching, removed explain
output, and slightly smaller sizes to avoid spurious test failures (for
which reason failures are often ignored without double checking).
---
 scripts/algorithms/l2-svm.dml  |  8 --
 .../apache/sysds/test/applications/ArimaTest.java  |  2 +-
 .../test/applications/MDABivariateStatsTest.java   | 17 ++---
 .../test/functions/builtin/BuiltinCVLmTest.java|  2 --
 .../functions/builtin/BuiltinComponentsTest.java   |  2 +-
 .../builtin/BuiltinFactorizationTest.java  |  6 ++---
 .../test/functions/builtin/BuiltinGLMTest.java | 18 +++---
 .../functions/builtin/BuiltinGridSearchTest.java   |  2 +-
 .../builtin/BuiltinImageBrightnessTest.java|  2 +-
 .../functions/builtin/BuiltinImageCropTest.java|  2 +-
 .../functions/builtin/BuiltinImageMirrorTest.java  |  2 +-
 .../test/functions/builtin/BuiltinKmeansTest.java  |  2 +-
 .../test/functions/builtin/BuiltinL2SVMTest.java   |  4 +--
 .../functions/builtin/BuiltinLmPredictTest.java|  2 +-
 .../test/functions/builtin/BuiltinLmTest.java  |  2 +-
 .../builtin/BuiltinMultiLogRegPredictTest.java |  2 +-
 .../builtin/BuiltinMulticlassSVMTest.java  |  2 +-
 .../functions/builtin/BuiltinNaiveBayesTest.java   |  1 -
 .../functions/builtin/BuiltinNormalizeTest.java|  2 +-
 .../test/functions/builtin/BuiltinOutlierTest.java |  2 +-
 .../test/functions/builtin/BuiltinSTEPLmTest.java  |  2 +-
 .../test/functions/builtin/BuiltinScaleTest.java   | 29 +++---
 .../test/functions/builtin/BuiltinSigmoidTest.java |  2 +-
 .../functions/builtin/BuiltinSliceFinderTest.java  |  2 +-
 .../functions/builtin/BuiltinToOneHotTest.java |  2 +-
 .../functions/builtin/BuiltinWinsorizeTest.java|  2 +-
 .../functions/builtin/MultipleBuiltinsTest.java|  2 +-
 .../codegenalg/partone/AlgorithmKMeans.java|  2 +-
 .../codegenalg/partone/AlgorithmL2SVM.java | 14 +--
 .../codegenalg/parttwo/AlgorithmDatagen.java   |  2 +-
 .../functions/codegenalg/parttwo/AlgorithmGLM.java |  2 +-
 .../parttwo/AlgorithmStepwiseRegression.java   |  2 +-
 .../scripts/functions/codegenalg/Algorithm_L2SVM.R |  8 +++---
 33 files changed, 87 insertions(+), 66 deletions(-)

diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml
index 04d6524..1c2fb9d 100644
--- a/scripts/algorithms/l2-svm.dml
+++ b/scripts/algorithms/l2-svm.dml
@@ -57,15 +57,19 @@ verbose = ifdef($verbose, FALSE)
 X = read($X)
 Y = read($Y)
 
+positive_label = max(Y)
+negative_label = min(Y)
+dimensions = ncol(X)
+
 w = l2svm(X=X, Y=Y, intercept=intercept, 
-  epsilon=epsilon, lambda=labmda, 
+  epsilon=epsilon, lambda=lambda, 
   maxIterations=maxIterations,
   verbose=verbose)
 
 extra_model_params = matrix(0, 4, 1)
 extra_model_params[1,1] = positive_label
 extra_model_params[2,1] = negative_label
-extra_model_params[3,1] = intercept
+extra_model_params[3,1] = as.double(intercept)
 extra_model_params[4,1] = dimensions
 
 w = rbind(w, extra_model_params)
diff --git a/src/test/java/org/apache/sysds/test/applications/ArimaTest.java 
b/src/test/java/org/apache/sysds/test/applications/ArimaTest.java
index c9ab019..020ffb6 100644
--- a/src/test/java/org/apache/sysds/test/applications/ArimaTest.java
+++ b/src/test/java/org/apache/sysds/test/applications/ArimaTest.java
@@ -130,7 +130,7 @@ public class ArimaTest extends AutomatedTestBase {
rCmd = getRCmd(inputDir(), Integer.toString(max_func_invoc), 
Integer.toString(p), Integer.toString(d), Integer.toString(q), 
Integer.toString(P), 
Integer.toString(D), Integer.toString(Q), 
Integer.toString(s), Integer.toString(include_mean), 
Integer.toString(useJacobi), expectedDir());

-   int timeSeriesLength = 5000;
+   int timeSeriesLength = 3000;
double[][] timeSeries = getRandomMatrix(timeSeriesLength, 1, 1, 
5, 0.9, System.currentTimeMillis());

MatrixCharacteristics mc = new 
MatrixCharacteristics(timeSeriesLength,1,-1,-1);
diff --git 
a/src/test/java/org/apache/sysds/test/applications/MDABivariateStatsTest.java 
b/src/test/java/org/apache/sysds/test/applications

[systemml] branch master updated: [SYSTEMDS-395] Cleanup SVM scripts, new confusionMatrix, msvmPredict

2020-05-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 8e4f7d8  [SYSTEMDS-395] Cleanup SVM scripts, new confusionMatrix, 
msvmPredict
8e4f7d8 is described below

commit 8e4f7d82e0df9ce5c0634d7516de10fb262603ed
Author: Sebastian 
AuthorDate: Sun May 24 00:02:36 2020 +0200

[SYSTEMDS-395] Cleanup SVM scripts, new confusionMatrix, msvmPredict

- ConfusionMatrix
- msvmPredict

Make confusion matrixes, based on Predictions and labels.
It returns two matrixes:
- A count matrix, containing counts of each case in the matrix.
- An avg matrix, returning the accuracy of each class, and thereby the
  percentage distribution across labels, aka the percentage
confusion.

msvmPredict applies the trained msvm model and returns
- Y_hat, the raw output from the model
- Y, the row max of the raw output, which are the highest-value
predictions.

Furthermore some consistency changes in L2SVM and MSVM.

Closes #910.
---
 docs/Tasks.txt |   1 +
 scripts/algorithms/l2-svm.dml  | 114 ++--
 scripts/builtin/confusionMatrix.dml|  62 +++
 scripts/builtin/kmeans.dml |  22 +--
 scripts/builtin/l2svm.dml  |  97 +-
 scripts/builtin/msvm.dml   |  42 ++---
 scripts/builtin/msvmPredict.dml|  53 ++
 scripts/builtin/multiLogRegPredict.dml |  17 +-
 .../java/org/apache/sysds/common/Builtins.java |   2 +
 .../builtin/BuiltinConfusionMatrixTest.java| 195 +
 .../builtin/BuiltinMulticlassSVMPredictTest.java   | 186 
 .../builtin/BuiltinMulticlassSVMTest.java  |  54 +++---
 .../builtin/{l2svm.dml => confusionMatrix.dml} |   8 +-
 src/test/scripts/functions/builtin/l2svm.dml   |   2 +-
 src/test/scripts/functions/builtin/multisvm.R  |  10 +-
 src/test/scripts/functions/builtin/multisvm.dml|   4 +-
 .../builtin/{l2svm.dml => multisvmPredict.dml} |   7 +-
 .../functions/federated/FederatedL2SVMTest.dml |   2 +-
 .../federated/FederatedL2SVMTestReference.dml  |   2 +-
 19 files changed, 653 insertions(+), 227 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 1196566..3c9782f 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -304,6 +304,7 @@ SYSTEMDS-390 New Builtin Functions IV
  * 392 Builtin function for missing value imputation via FDs  OK
  * 393 Builtin to find Connected Components of a graphOK
  * 394 Builtin for one-hot encoding of matrix (not frame), see table  OK
+ * 395 SVM rework and utils (confusionMatrix, msvmPredict)OK
 
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml
index 4cbcdb5..04d6524 100644
--- a/scripts/algorithms/l2-svm.dml
+++ b/scripts/algorithms/l2-svm.dml
@@ -21,10 +21,6 @@
 
 # Implements binary-class SVM with squared slack variables
 #
-# Example Usage:
-# Assume L2SVM_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume epsilon = 0.001, lambda = 1, maxiterations = 100
 #
 # INPUT PARAMETERS:
 # 
-
@@ -40,111 +36,31 @@
 # maxiter   Int 100 Maximum number of conjugate gradient iterations
 # model String  --- Location to write model
 # fmt   String  "text"  The output format of the output, such as 
"text" or "csv"
-# Log   String  --- [OPTIONAL] Location to write the log file
 # 
-
 
-# hadoop jar SystemDS.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X 
Y=$INPUT_DIR/Y \
-#   icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log 
fmt="text"
-#
+# Example Execution:
+# systemds -f $SYSTEMDS_ROOT/scripts/algorithms/l2-svm.dml \
+#   -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y \
+#   icpt=FALSE tol=0.001 reg=1 maxiter=100 \
+#   model=$OUPUT_DIR/w fmt="text"
+
 # Note about inputs: 
 # Assumes that labels (entries in Y) are set to either -1 or +1 or 
non-negative integers
 
 fmt = ifdef($fmt, "text")
-intercept = ifdef($icpt, 0)
+intercept = ifdef($icpt, FALSE)
 epsilon = ifdef($tol, 0.001)
 lambda = ifdef($reg, 1.0)
-maxiterations = ifdef($maxiter, 100)
+maxIterations = ifdef($maxiter, 100)
+verbose = ifdef($verbose, FALSE)
 
 X = read($X)
 Y = read($Y)
 
-#check input parameter assertions
-if(nrow(X) < 2)
- 

[systemml] branch master updated: [SYSTEMDS-254] Fixes distributed slice finding implementation

2020-05-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new e78962b  [SYSTEMDS-254] Fixes distributed slice finding implementation
e78962b is described below

commit e78962b8db5b1c90fdb37ae6d9c6284f744cdbfc
Author: gilgenbergg 
AuthorDate: Sat May 23 23:24:30 2020 +0200

[SYSTEMDS-254] Fixes distributed slice finding implementation

Closes #908.
---
 docs/Tasks.txt |  1 +
 scripts/staging/slicing/base/Bucket.py |  2 +
 scripts/staging/slicing/base/SparkNode.py  | 70 +---
 scripts/staging/slicing/base/__init__.py   |  4 +-
 scripts/staging/slicing/base/node.py   | 74 --
 scripts/staging/slicing/base/slicer.py | 12 ++--
 scripts/staging/slicing/base/union_slicer.py   | 19 ++
 .../slicing/spark_modules/join_data_parallel.py| 24 +++
 .../staging/slicing/spark_modules/spark_slicer.py  |  9 +--
 .../slicing/spark_modules/spark_union_slicer.py|  9 +--
 .../staging/slicing/spark_modules/spark_utils.py   | 15 ++---
 .../slicing/spark_modules/union_data_parallel.py   | 22 ---
 .../slicing/tests/classification/__init__.py   |  4 +-
 .../staging/slicing/tests/regression/__init__.py   |  4 +-
 14 files changed, 117 insertions(+), 152 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 7b64145..1196566 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -209,6 +209,7 @@ SYSTEMDS-250 Extended Slice Finding
  * 251 Alternative slice enumeration approach OK
  * 252 Initial data slicing implementation Python OK
  * 253 Distributed slicing algorithms (task/data parallel)OK
+ * 254 Consolidation and fixes distributed slice finding  OK
 
 SYSTEMDS-260 Misc Tools
  * 261 Stable marriage algorithm  OK
diff --git a/scripts/staging/slicing/base/Bucket.py 
b/scripts/staging/slicing/base/Bucket.py
index 0277f6d..dc8402e 100644
--- a/scripts/staging/slicing/base/Bucket.py
+++ b/scripts/staging/slicing/base/Bucket.py
@@ -45,6 +45,8 @@ class Bucket:
 self.parents = []
 self.sum_error = 0
 self.size = 0
+self.s_upper = 0
+self.s_lower = 0
 self.score = 0
 self.error = 0
 self.max_tuple_error = 0
diff --git a/scripts/staging/slicing/base/SparkNode.py 
b/scripts/staging/slicing/base/SparkNode.py
index fbaa0bd..a123624 100644
--- a/scripts/staging/slicing/base/SparkNode.py
+++ b/scripts/staging/slicing/base/SparkNode.py
@@ -65,25 +65,16 @@ class SparkNode:
 print(mask)
 if loss_type == 0:
 self.calc_l2(mask)
-if loss_type == 1:
+elif loss_type == 1:
 self.calc_class(mask)
 
 def calc_class(self, mask):
 self.e_max = 1
-size = 0
-mistakes = 0
-for row in self.preds:
-flag = True
-for attr in mask:
-if attr not in row[0].indices:
-flag = False
-if flag:
-size = size + 1
-if row[1] == 0:
-mistakes += 1
-self.size = size
-if size != 0:
-self.loss = mistakes / size
+filtered = self.filter_by_mask(mask)
+self.size = len(filtered)
+mistakes = len(list(filter(lambda row: row[1] == 0, filtered)))
+if self.size != 0:
+self.loss = mistakes / self.size
 else:
 self.loss = 0
 self.e_upper = self.loss
@@ -92,25 +83,22 @@ class SparkNode:
 max_tuple_error = 0
 sum_error = 0
 size = 0
-for row in self.preds:
-flag = True
-for attr in mask:
-if attr not in row[0].indices:
-flag = False
-if flag:
-size = size + 1
-if row[1] > max_tuple_error:
-max_tuple_error = row[1]
-sum_error = sum_error + row[1]
+filtered = self.filter_by_mask(mask)
+self.size = len(filtered)
+for row in filtered:
+if row[1] > max_tuple_error:
+max_tuple_error = row[1]
+sum_error += row[1]
 self.e_max = max_tuple_error
 self.e_upper = max_tuple_error
 self.e_max_upper = max_tuple_error
-if size != 0:
-self.loss = sum_error/size
+if self.size != 0:
+self.loss = sum_error/self.size
 else:
 self.loss = 0
-self.size = size
-self.s_upper = size
+
+def filter_by_mask(self, mask):
+return list(filter(lambda row: all(attr in row[0].indices for attr in 
mask), self.preds))
 
 def calc_s_upper(self, c

[systemml] branch master updated: [SYSTEMDS-394] New builtin function toOneHot (one hot encoding)

2020-05-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 86fd7b3  [SYSTEMDS-394] New builtin function toOneHot (one hot 
encoding)
86fd7b3 is described below

commit 86fd7b3d4aae5dbca8090e2638e0abc4da696655
Author: Patrick Deutschmann 
AuthorDate: Sat May 23 22:57:17 2020 +0200

[SYSTEMDS-394] New builtin function toOneHot (one hot encoding)

Adds a builtin function toOneHot which transforms a vector containing
integers into a one-hot-encoded matrix (note transform works over frames
and reassigns the integer codes)

Closes #916.
---
 docs/Tasks.txt |   1 +
 docs/dml-language-reference.md |   2 +-
 scripts/builtin/toOneHot.dml   |  43 
 .../java/org/apache/sysds/common/Builtins.java |   1 +
 .../functions/builtin/BuiltinToOneHotTest.java | 113 +
 src/test/scripts/functions/builtin/toOneHot.dml|  25 +
 6 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 6539ff8..7b64145 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -302,6 +302,7 @@ SYSTEMDS-390 New Builtin Functions IV
  * 391 New GLM builtin function (from algorithms) OK
  * 392 Builtin function for missing value imputation via FDs  OK
  * 393 Builtin to find Connected Components of a graphOK
+ * 394 Builtin for one-hot encoding of matrix (not frame), see table  OK
 
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/docs/dml-language-reference.md b/docs/dml-language-reference.md
index 652a451..76f2656 100644
--- a/docs/dml-language-reference.md
+++ b/docs/dml-language-reference.md
@@ -699,7 +699,7 @@ cummin() | Column prefix-min (For row-prefix min, use 
cummin(t(X)) | Input: matr
 cummax() | Column prefix-max (For row-prefix min, use cummax(t(X)) | Input: 
matrix  Output: matrix of the same dimensions | A = matrix("3 4 1 6 5 2", 
rows=3, cols=2)  B = cummax(A)  The output matrix B = [[3, 4], [3, 
6], [5, 6]]
 sample(range, size, replacement, seed) | Sample returns a column vector of 
length size, containing uniform random numbers from [1, range] | Input:  
range: integer  size: integer  replacement: boolean (Optional, 
default: FALSE)  seed: integer (Optional)  Output: Matrix dimensions 
are size x 1 | sample(100, 5)  sample(100, 5, TRUE)  sample(100, 120, 
TRUE)  sample(100, 5, 1234) # 1234 is the seed  sample(100, 5, TRUE, 
1234)
 outer(vector1, vector2, "op") | Applies element wise binary operation "op" 
(for example: "", "==", "=", "*", "min") on the all combination of 
vector.  Note: Using "*", we get outer product of two vectors. | Input: 
vectors of same size d, string  Output: matrix of size d X d | A = 
matrix("1 4", rows = 2, cols = 1)  B = matrix("3 6", rows = 1, cols = 2) 
 C = outer(A, B, "")  D = outer(A, B, "*")  The output 
matrix C = [[1, 1], [0, 1]]  The out [...]
-
+toOneHot(X, num_classes)| Converts a vector containing integers to a 
one-hot-encoded matrix | Input: vector with N integer entries between 1 and 
num_classes, number of columns (must be >= largest value in X)Output: 
one-hot-encoded matrix with shape (N, num_classes) | X = round(rand(rows=10, 
cols=1, min=2, max=10)); num_classes = ​12; Y = toOneHot(X, 
num_classes); 
 
  Alternative forms of table()
 
diff --git a/scripts/builtin/toOneHot.dml b/scripts/builtin/toOneHot.dml
new file mode 100644
index 000..8134f5c
--- /dev/null
+++ b/scripts/builtin/toOneHot.dml
@@ -0,0 +1,43 @@
+#-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-
+
+# One-hot encodes a vector
+
+# INPUT PARAMETERS:
+# 

+# NAME   

[systemml] branch master updated: [MINOR] Update Dockerfile (fixes, new R dependency)

2020-05-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new b742443  [MINOR] Update Dockerfile (fixes, new R dependency)
b742443 is described below

commit b7424431f3f6131e3245c03ba8d40aa81d2cb245
Author: Sebastian 
AuthorDate: Sat May 23 22:40:54 2020 +0200

[MINOR] Update Dockerfile (fixes, new R dependency)

Closes #918.
---
 .github/action/Dockerfile  |  2 +-
 .github/workflows/componentTests.yml   | 17 -
 docker/build.sh|  6 +++---
 docker/entrypoint.sh   |  2 +-
 docker/sysds.Dockerfile|  4 ++--
 src/test/scripts/installDependencies.R |  1 +
 6 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/.github/action/Dockerfile b/.github/action/Dockerfile
index 5a23a8e..36420da 100644
--- a/.github/action/Dockerfile
+++ b/.github/action/Dockerfile
@@ -19,4 +19,4 @@
 #
 #-
 
-FROM sebaba/testingsysds:0.2
+FROM sebaba/testingsysds:2.0
diff --git a/.github/workflows/componentTests.yml 
b/.github/workflows/componentTests.yml
index 0cc934c..195fb5c 100644
--- a/.github/workflows/componentTests.yml
+++ b/.github/workflows/componentTests.yml
@@ -52,4 +52,19 @@ jobs:
   run: mvn clean compile test-compile
 
 - name: Component Tests
-  run: mvn surefire:test -DskipTests=false 
-Dtest=org.apache.sysds.test.component.*.**
+  run: |
+log="/tmp/sysdstest.log"
+echo "Starting Tests"
+mvn surefire:test -DskipTests=false 
-Dtest=org.apache.sysds.test.component.*.** 2>&1 > $log
+grep_args="SUCCESS"
+grepvals="$( tail -n 100 $log | grep $grep_args)"
+if [[ $grepvals == *"SUCCESS"* ]]; then
+   echo "- last 100 lines from test 
"
+   tail -n 100 $log
+   echo "-- last 100 lines from test end 
---"
+   exit 0
+else
+   echo "\n $(cat $log)"
+   exit 1
+fi
+
diff --git a/docker/build.sh b/docker/build.sh
index 73add3c..643813e 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -23,13 +23,13 @@
 # Build the docker containers
 
 # The first build is for running systemds through docker.
-docker image build -f docker/sysds.Dockerfile -t sebaba/sysds:0.2 .
+docker image build -f docker/sysds.Dockerfile -t sebaba/sysds:2.0 .
 
 # The second build is for testing systemds. This image installs the R 
dependencies needed to run the tests.
-docker image build -f docker/testsysds.Dockerfile -t sebaba/testingsysds:0.2 .
+docker image build -f docker/testsysds.Dockerfile -t sebaba/testingsysds:2.0 .
 
 # The third build is python docker for systemds. 
-docker image build -f docker/pythonsysds.Dockerfile -t sebaba/pythonsysds:0.2 .
+docker image build -f docker/pythonsysds.Dockerfile -t sebaba/pythonsysds:2.0 .
 
 # You might want to prune the docker system afterwards using
 # docker system prune
\ No newline at end of file
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 713e948..fd80cbe 100755
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -24,7 +24,7 @@
 
 cd /github/workspace
 
-build="$(mvn -T 2 clean compile test-compile surefire:test | grep 'BUILD')"
+build="$(mvn -T 2 clean compile test-compile | grep 'BUILD')"
 
 if [[ $build == *"SUCCESS"* ]]; then
   echo "Successfull build"
diff --git a/docker/sysds.Dockerfile b/docker/sysds.Dockerfile
index 01a0094..cf788a1 100644
--- a/docker/sysds.Dockerfile
+++ b/docker/sysds.Dockerfile
@@ -39,11 +39,11 @@ RUN wget 
http://archive.apache.org/dist/maven/maven-3/$MAVEN_VERSION/binaries/ap
 # Install Extras
 RUN apk add --no-cache git bash
 
-RUN git clone https://github.com/apache/systemml.git
+RUN git clone https://github.com/apache/systemml.git systemds
 
 WORKDIR /usr/src/systemds/
 
-RUN mvn package
+RUN mvn clean package -P distribution
 
 # Remove Maven since it is not needed for running the system
 RUN rm -r /usr/lib/mvn
diff --git a/src/test/scripts/installDependencies.R 
b/src/test/scripts/installDependencies.R
index 9689f63..4696361 100644
--- a/src/test/scripts/installDependencies.R
+++ b/src/test/scripts/installDependencies.R
@@ -55,6 +55,7 @@ custom_install("caret");
 custom_install("sigmoid");
 custom_install("DescTools");
 custom_install("mice");
+custom_install("mclust");
 
 print("Installation Done")
 



[systemml] branch master updated: [SYSTEMDS-393] Builtin function for connected components, tests

2020-05-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 50fac03  [SYSTEMDS-393] Builtin function for connected components, 
tests
50fac03 is described below

commit 50fac03eed584c2019c95a4702367ddc442e7bf2
Author: Matthias Boehm 
AuthorDate: Sat May 23 22:09:04 2020 +0200

[SYSTEMDS-393] Builtin function for connected components, tests

This patch adds a new built-in function for finding the connected
components in an undirected graph, represented as a symmetric 0/1 matrix.

On a scenario of finding the connected components of the DBLP co-author
graph (for selected DB venues and >=2011 -> 35632x35632, 310582
non-zeros), the algorithm terminated in 12 iterations, found 2443
connected components (w/ 837 single-author components), and took only
4.1s including I/O, transform recoding, and graph construction.
---
 docs/Tasks.txt |  1 +
 scripts/builtin/components.dml | 53 
 .../java/org/apache/sysds/common/Builtins.java |  1 +
 .../functions/builtin/BuiltinComponentsTest.java   | 94 ++
 .../functions/builtin/ConnectedComponents.dml  | 28 +++
 5 files changed, 177 insertions(+)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 66d0901..6539ff8 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -301,6 +301,7 @@ SYSTEMDS-380 Memory Footprint
 SYSTEMDS-390 New Builtin Functions IV
  * 391 New GLM builtin function (from algorithms) OK
  * 392 Builtin function for missing value imputation via FDs  OK
+ * 393 Builtin to find Connected Components of a graphOK
 
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml
new file mode 100644
index 000..f760a49
--- /dev/null
+++ b/scripts/builtin/components.dml
@@ -0,0 +1,53 @@
+#-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-
+
+# Computes the connected components of a graph and returns a
+# vector indicating the assignment of vertices to components,
+# where each component is identified by the maximum vertex ID
+# (i.e., row/column position of the input graph) 
+
+m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose = 
TRUE) 
+  return (Matrix[Double] C) 
+{
+  # ensure there are no self-edges in the graph
+  if( trace(G) != 0 ) {
+G = G - diag(diag(G));
+if(verbose)
+  print("Connected Components: warning - removed self-edges from input 
graph");
+  }
+
+  # initialize state with vertex ids
+  c = seq(1,nrow(G));
+  diff = Inf;
+  iter = 1;
+
+  # iterative computation of connected components
+  while( diff > 0 & (maxi==0 | maxi<=iter) ) {
+u = max(rowMaxs(G * t(c)), c);
+diff = sum(u != c)
+c = u; # update assignment
+if( verbose )
+  print("Connected components: iter = "+iter+", #diff = "+diff);
+iter = iter + 1;
+  }
+
+  C = c;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index b738d40..6c53692 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -68,6 +68,7 @@ public enum Builtins {
COLSD("colSds", false),
COLSUM("colSums", false),
COLVAR("colVars", false),
+   COMPONENTS("components", true),
CONV2D("conv2d", false),
CONV2D_BACKWARD_FILTER("conv2d_backward_filter", false),
CONV2D_BACKWARD_DATA("conv2d_backward_data", false),
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinComponentsTest.java
new file mode 100644
index 000..ca54528

[systemml] branch master updated: [MINOR] Performance lineage tracing of literal operands

2020-05-22 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 6f44461  [MINOR] Performance lineage tracing of literal operands
6f44461 is described below

commit 6f44461692b5e3eea2a1554425ce6228ae4ddea2
Author: Matthias Boehm 
AuthorDate: Sat May 23 00:02:00 2020 +0200

[MINOR] Performance lineage tracing of literal operands

This patch makes a minor performance improvement reuse thread-local
string builders (as done for instructions) for the construction for
lineage literals as well.

On the following reduced example script, this patch improved the total
execution time from 56s to 50.3s due to partially removed garbage
collection overhead:

X = rand(rows=10, cols=10, seed=1);
for(i in 1:1e6) {
  tmp1 = ((X + 1) * 2) / 3
  tmp2 = (tmp1 - 1) * tmp1
  X = tmp2;
  if( i%%1e5==0 )
print("Iteration "+i);
}
print(sum(X));

Notice that this script creates over one million lineage items for
literals to cover the 1e6 distinct values of the loop variable i.
---
 .../runtime/controlprogram/IfProgramBlock.java | 12 ---
 .../runtime/instructions/InstructionUtils.java | 19 +-
 .../sysds/runtime/instructions/cp/CPOperand.java   | 23 +++---
 3 files changed, 21 insertions(+), 33 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java
index 9d0d58e..94ec8e1 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/IfProgramBlock.java
@@ -95,10 +95,8 @@ public class IfProgramBlock extends ProgramBlock

ec.getLineagePath().setBranchPredicateValue(predResult.getBooleanValue());

//execute if statement
-   if(predResult.getBooleanValue())
-   {   
-   try 
-   {   
+   if(predResult.getBooleanValue()) {
+   try  {
for (int i=0 ; i < _childBlocksIfBody.size() ; 
i++) {
_childBlocksIfBody.get(i).execute(ec);
}
@@ -106,13 +104,11 @@ public class IfProgramBlock extends ProgramBlock
catch(DMLScriptException e) {
throw e;
}
-   catch(Exception e)
-   {
+   catch(Exception e) {
throw new 
DMLRuntimeException(this.printBlockErrorLocation() + "Error evaluating if 
statement body ", e);
}
}
-   else
-   {
+   else {
try {
for (int i=0 ; i < _childBlocksElseBody.size() 
; i++) {
_childBlocksElseBody.get(i).execute(ec);
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java 
b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java
index 740d821..1401bfa 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java
@@ -955,24 +955,25 @@ public class InstructionUtils
if( operand >= parts.length )
throw new DMLRuntimeException("Operand position "
+ operand + " exceeds the length of the 
instruction.");
-   
//replace and reconstruct string
parts[operand] = newValue;
-   StringBuilder sb = new StringBuilder(instStr.length());
-   sb.append(parts[0]);
-   for( int i=1; i

[systemml] branch master updated: [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part II

2020-05-22 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new f29ae42  [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part II
f29ae42 is described below

commit f29ae426be1722fba9468609976709068e6e5d7d
Author: Matthias Boehm 
AuthorDate: Fri May 22 22:38:54 2020 +0200

[SYSTEMDS-339] Fix robustness lineage tracing/parsing, part II

This patch fixes many additional issues in lineage tracing and parsing
in order to support the round-trip for steplm and kmeans.

1) Lineage tracing with default arguments of function call parameters
(so far missing arguments where traces as literal variable name)

2) Lineage Tracing: rshape with parameters, ctable w/ dimensions,
rand/seq w/ variable rows/cols, from/to/incr inputs

3) Lineage Parsing: rshape, rdiag, nrow, ncol, all casts ops, ifelse
with scalar/matrix inputs (so far block size wrong), ctable /w
dimensions, gappend spark ops

4) New lineage parfor algorithm tests: steplm, kmeans
---
 scripts/builtin/kmeans.dml |  6 +--
 src/main/java/org/apache/sysds/common/Types.java   | 35 ++---
 .../apache/sysds/hops/recompile/Recompiler.java|  8 +--
 .../apache/sysds/hops/rewrite/HopRewriteUtils.java | 37 --
 .../RewriteAlgebraicSimplificationDynamic.java |  6 +--
 .../runtime/instructions/InstructionUtils.java | 41 ++-
 .../instructions/cp/CtableCPInstruction.java   | 27 ++
 .../instructions/cp/DataGenCPInstruction.java  | 56 ++--
 .../instructions/cp/FunctionCallCPInstruction.java |  7 ++-
 .../instructions/cp/ReshapeCPInstruction.java  |  9 
 .../instructions/spark/RandSPInstruction.java  | 10 +++-
 .../sysds/runtime/lineage/LineageItemUtils.java| 59 +-
 .../functions/lineage/LineageTraceParforTest.java  | 34 -
 ...aceParfor4.dml => LineageTraceParforKmeans.dml} |  3 +-
 ...aceParfor4.dml => LineageTraceParforSteplm.dml} |  0
 15 files changed, 211 insertions(+), 127 deletions(-)

diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index 23482da..96591c6 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -60,8 +60,8 @@ m_kmeans = function(Matrix[Double] X, Integer k = 0, Integer 
runs = 10, Integer
 
   print ("Taking data samples for initialization...");
 
-  [sample_maps, samples_vs_runs_map, sample_block_size] =
-  get_sample_maps (num_records, num_runs, num_centroids * 
avg_sample_size_per_centroid);
+  [sample_maps, samples_vs_runs_map, sample_block_size] = get_sample_maps(
+num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
 
   is_row_in_samples = rowSums (sample_maps);
   X_samples = sample_maps %*% X;
@@ -230,7 +230,7 @@ get_sample_maps = function (int num_records, int 
num_samples, int approx_sample_
 # Replace all sample record ids over "num_records" (i.e. out of range) by 
"num_records + 1":
 is_sample_rec_id_within_range = (sample_rec_ids <= num_records);
 sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range
-+ (num_records + 1) * (1 - is_sample_rec_id_within_range);
+  + (num_records + 1) * (1 - is_sample_rec_id_within_range);
 
 # Rearrange all samples (and their out-of-range indicators) into one 
column-vector:
 sample_rec_ids = matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow 
= FALSE);
diff --git a/src/main/java/org/apache/sysds/common/Types.java 
b/src/main/java/org/apache/sysds/common/Types.java
index d693b7f..2d66e81 100644
--- a/src/main/java/org/apache/sysds/common/Types.java
+++ b/src/main/java/org/apache/sysds/common/Types.java
@@ -206,6 +206,15 @@ public class Types
MULT2, MINUS1_MULT, MINUS_RIGHT, 
POW2, SUBTRACT_NZ;

+
+   //Indicates whether this unary operator always produces a scalar
+   //output (cast-as-scalar, nrow/ncol/length, exists, iqm, lineage,
+   //median), independent of its input's data type.
+   public boolean isScalarOutput() {
+   return this == CAST_AS_SCALAR
+   || this == NROW || this == NCOL
+   || this == LENGTH || this == EXISTS
+   || this == IQM || this == LINEAGE
+   || this == MEDIAN;
+   }
+   
@Override
public String toString() {
switch(this) {
@@ -244,7 +253,7 @@ public class Types
case "ucumk+":  return CUMSUM;
case "ucumk+*": return CUMSUMPROD;
case "*2":  return MULT2;
-   case "!":   return OpOp1.NOT;
+   case "!":   return NOT;
cas

[systemml] branch master updated: [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part I

2020-05-20 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 498aef6  [SYSTEMDS-339] Fix robustness lineage tracing/parsing, part I
498aef6 is described below

commit 498aef6c5915c8256da44a041049cc4a59a84d41
Author: Matthias Boehm 
AuthorDate: Wed May 20 23:37:12 2020 +0200

[SYSTEMDS-339] Fix robustness lineage tracing/parsing, part I

This patch adds steplm as a new parfor lineage tracing/parsing test case
and fixes many related lineage tracing/parsing issues:

1) Parfor Lineage Merge: robustness against empty first worker (no
lineage for result variable, e.g., due to conditional control flow)

2) Lineage Tracing: support for replace, rexpand

3) Lineage Parsing: support for seq, ifelse, log, log_nz, groupedagg,
rmempty, replace, rexpand

4) Steplm: Improved initialization of parfor result variables to avoid
cycles and stackoverflow errors in overwrite scenarios.

5) Minor: parsing of ternary operator codes

However, additional fixes are required for lineage tracing wrt default
handling in functions, literal replacement during dynamic recompilation,
and better overwrite support in matrix indexing (currently this creates
lineage cycles).
---
 docs/Tasks.txt |  1 +
 scripts/builtin/steplm.dml |  1 +
 src/main/java/org/apache/sysds/common/Types.java   |  2 +-
 .../apache/sysds/hops/rewrite/HopRewriteUtils.java |  9 ++-
 .../runtime/controlprogram/ParForProgramBlock.java |  6 +-
 .../instructions/cp/DataGenCPInstruction.java  | 12 
 .../cp/ParameterizedBuiltinCPInstruction.java  | 57 +---
 .../sysds/runtime/lineage/LineageItemUtils.java| 75 ++
 .../functions/lineage/LineageTraceParforTest.java  | 19 +-
 .../functions/lineage/LineageTraceParfor1.dml  |  4 +-
 .../functions/lineage/LineageTraceParfor2.dml  |  4 +-
 .../functions/lineage/LineageTraceParfor3.dml  |  4 +-
 ...ageTraceParfor3.dml => LineageTraceParfor4.dml} | 13 ++--
 13 files changed, 161 insertions(+), 46 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 6d5ff80..66d0901 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -272,6 +272,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration
  * 336 Better use of cache status to handle multithreading
  * 337 Adjust disk I/O speed by recording actual time taken   OK
  * 338 Extended lineage tracing (rmEmpty, lists), partial rewritesOK
+ * 339 Lineage tracing robustness (indexed updates, algorithms)
  
 SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
  * 341 Finalize unmarking of loop dependent operations
diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml
index fd0018d..28208c8 100644
--- a/scripts/builtin/steplm.dml
+++ b/scripts/builtin/steplm.dml
@@ -126,6 +126,7 @@ m_steplm = function(Matrix[Double] X, Matrix[Double] y, 
Integer icpt = 0,
 while (continue) {
   # Subsequent passes over the features
   beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
+  AICs = matrix(0, 1, m_orig); # full overwrite
   parfor (i in 1:m_orig, check = 0) {
 if (as.scalar(columns_fixed[1, i]) == 0) {
   # Construct the feature matrix
diff --git a/src/main/java/org/apache/sysds/common/Types.java 
b/src/main/java/org/apache/sysds/common/Types.java
index 11e597e..d693b7f 100644
--- a/src/main/java/org/apache/sysds/common/Types.java
+++ b/src/main/java/org/apache/sysds/common/Types.java
@@ -359,7 +359,7 @@ public class Types
case "cm": return OpOp3.MOMENT;
case "+*": return OpOp3.PLUS_MULT;
case "-*": return OpOp3.MINUS_MULT;
-   default:   return OpOp3.valueOf(code);
+   default:   return 
OpOp3.valueOf(code.toUpperCase());
}
}
}
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java 
b/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java
index 9d86b4d..9e73fcc 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/HopRewriteUtils.java
@@ -779,8 +779,13 @@ public class HopRewriteUtils
}

public static TernaryOp createTernaryOp(Hop mleft, Hop smid, Hop 
mright, OpOp3 op) {
-   TernaryOp ternOp = new TernaryOp("tmp", DataType.MATRIX, 
ValueType.FP64, op, mleft, smid, mright);
-   ternOp.setBlocksize(mleft.getBlocksize());
+   //NOTe: for ifelse it's sufficient to check mright as 
smid==mright
+   System.out.print

[systemml] branch master updated: [SYSTEMDS-344] New IPA pass for marking deterministic functions/SBs

2020-05-17 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new f1bffeb  [SYSTEMDS-344] New IPA pass for marking deterministic 
functions/SBs
f1bffeb is described below

commit f1bffeb299eec6a57d5290fd12c81ba92c9f03e2
Author: arnabp 
AuthorDate: Sun May 17 22:24:22 2020 +0200

[SYSTEMDS-344] New IPA pass for marking deterministic functions/SBs

This patch moves the fragile and less efficient non-determinism check
in runtime to compile time. This adds a new IPA rewrite to unmark the
functions and StatementBlocks containing direct or transitive
nondeterministic calls (e.g. rand with UNSPECIFIED_SEED) for lineage
caching.

AMLS project SS 2020.
Closes #911.
---
 docs/Tasks.txt |   2 +-
 src/main/java/org/apache/sysds/hops/DataGenOp.java |   8 +
 .../sysds/hops/ipa/IPAPassFlagNonDeterminism.java  | 201 +
 .../sysds/hops/ipa/InterProceduralAnalysis.java|   2 +
 .../apache/sysds/hops/rewrite/HopRewriteUtils.java |   7 +
 .../org/apache/sysds/parser/DMLTranslator.java |   1 +
 .../sysds/parser/FunctionStatementBlock.java   |   9 +
 .../org/apache/sysds/parser/StatementBlock.java|  10 +
 .../runtime/controlprogram/BasicProgramBlock.java  |   4 +-
 .../controlprogram/FunctionProgramBlock.java   |   9 +
 .../instructions/cp/FunctionCallCPInstruction.java |   6 +-
 .../apache/sysds/runtime/lineage/LineageCache.java |   8 +-
 .../sysds/runtime/util/ProgramConverter.java   |   4 +
 .../functions/lineage/FunctionFullReuseTest.java   |   7 +-
 .../functions/lineage/FunctionFullReuse8.dml   |  57 ++
 .../scripts/functions/lineage/LineageReuseAlg2.dml |   4 +-
 16 files changed, 322 insertions(+), 17 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 8163e9f..6d5ff80 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -277,7 +277,7 @@ SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
  * 341 Finalize unmarking of loop dependent operations
  * 342 Mark functions as last-use to enable early eviction
  * 343 Identify equal last level HOPs to ensure SB-level reuse
- * 344 Unmark functions/SBs containing non-determinism for caching
+ * 344 Unmark functions/SBs containing non-determinism for cachingOK
  * 345 Compiler assisted cache configuration
 
 SYSTEMDS-350 Data Cleaning Framework
diff --git a/src/main/java/org/apache/sysds/hops/DataGenOp.java 
b/src/main/java/org/apache/sysds/hops/DataGenOp.java
index edcb448..8fdf98d 100644
--- a/src/main/java/org/apache/sysds/hops/DataGenOp.java
+++ b/src/main/java/org/apache/sysds/hops/DataGenOp.java
@@ -468,6 +468,14 @@ public class DataGenOp extends MultiThreadedHop
return ret;
}

+   //Returns true if this datagen op is a rand/sinit with an unspecified
+   //seed literal, i.e., a system-chosen seed that makes the op
+   //non-deterministic (used by the non-determinism IPA pass).
+   public boolean hasUnspecifiedSeed() {
+   if (_op == OpOpDG.RAND || _op == OpOpDG.SINIT) {
+   Hop seed = 
+   getInput().get(_paramIndexMap.get(DataExpression.RAND_SEED));
+   return 
+   seed.getName().equals(String.valueOf(DataGenOp.UNSPECIFIED_SEED));
+   }
+   return false;
+   }
+   
public Hop getConstantValue() {
return 
getInput().get(_paramIndexMap.get(DataExpression.RAND_MIN));
}
diff --git 
a/src/main/java/org/apache/sysds/hops/ipa/IPAPassFlagNonDeterminism.java 
b/src/main/java/org/apache/sysds/hops/ipa/IPAPassFlagNonDeterminism.java
new file mode 100644
index 000..a96
--- /dev/null
+++ b/src/main/java/org/apache/sysds/hops/ipa/IPAPassFlagNonDeterminism.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.hops.ipa;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.sysds.hops.FunctionOp;
+import org.apache.sysds.hops.Hop;
+import org.apache.sysds.hops.HopsException;
+import org.apache.sysds.hops.rewrite.HopRewriteUtils;
+import org.apache.sysds.parser.DMLProgram;
+import org.apache.sysds.parser.F

[systemml] branch master updated: [SYSTEMDS-74] Cleanup lineage tracing (unnecessary variable names)

2020-05-16 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new bd0b319  [SYSTEMDS-74] Cleanup lineage tracing (unnecessary variable 
names)
bd0b319 is described below

commit bd0b319df52215b359c04590ed4091ad136ea4f9
Author: Matthias Boehm 
AuthorDate: Sat May 16 17:00:59 2020 +0200

[SYSTEMDS-74] Cleanup lineage tracing (unnecessary variable names)

This patch removes unnecessary attributes from lineage items in order to
reduce the size (and GC overhead) for long lineage traces. So far, each
lineage item kept the variable name to which it was bound. As lineage
information should be independent of such properties, this information
was already ignored for lineage hashing and comparisons. In a few places,
however, we use it to rewire placeholders, which is now cleaned up.
---
 .../cp/AggregateUnaryCPInstruction.java|  9 +--
 .../instructions/cp/ComputationCPInstruction.java  |  8 ++-
 .../instructions/cp/DataGenCPInstruction.java  |  6 +-
 .../instructions/cp/ListIndexingCPInstruction.java |  7 +-
 .../instructions/cp/MatrixAppendCPInstruction.java |  7 +-
 .../cp/MatrixBuiltinNaryCPInstruction.java |  7 +-
 .../cp/MatrixIndexingCPInstruction.java|  7 +-
 .../cp/MultiReturnBuiltinCPInstruction.java| 15 ++--
 .../cp/ParameterizedBuiltinCPInstruction.java  | 16 ++---
 .../cp/ScalarBuiltinNaryCPInstruction.java |  8 +--
 .../instructions/cp/SpoofCPInstruction.java|  9 +--
 .../instructions/cp/VariableCPInstruction.java | 41 +--
 .../fed/ComputationFEDInstruction.java |  7 +-
 .../spark/BuiltinNarySPInstruction.java|  7 +-
 .../spark/ComputationSPInstruction.java|  7 +-
 .../spark/MatrixIndexingSPInstruction.java |  7 +-
 .../instructions/spark/RandSPInstruction.java  |  5 +-
 .../instructions/spark/WriteSPInstruction.java |  5 +-
 .../apache/sysds/runtime/lineage/LineageCache.java |  6 +-
 .../apache/sysds/runtime/lineage/LineageItem.java  | 81 --
 .../sysds/runtime/lineage/LineageItemUtils.java| 37 ++
 .../apache/sysds/runtime/lineage/LineageMap.java   | 71 ++-
 .../sysds/runtime/lineage/LineageParser.java   | 15 ++--
 .../sysds/runtime/lineage/LineageRewriteReuse.java | 58 
 .../sysds/runtime/lineage/LineageTraceable.java| 32 -
 .../test/functions/lineage/LineageReadTest.java|  2 +-
 .../test/functions/lineage/LineageRewriteTest.java |  4 +-
 27 files changed, 251 insertions(+), 233 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java
index 7c52737..5f053e9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/AggregateUnaryCPInstruction.java
@@ -147,12 +147,9 @@ public class AggregateUnaryCPInstruction extends 
UnaryCPInstruction
throw new DMLRuntimeException("Lineage 
trace "
+ "for variable 
"+input1.getName()+" unavailable.");

-   LineageItem li = DMLScript.LINEAGE_DEDUP ?
-   
LineageItemUtils.rDecompress(ec.getLineageItem(input1)) :
-   ec.getLineageItem(input1);
-   
-   ec.setScalarOutput(output_name, new 
StringObject(
-   Explain.explain(li)));
+   LineageItem li = !DMLScript.LINEAGE_DEDUP ? 
ec.getLineageItem(input1):
+   
LineageItemUtils.rDecompress(ec.getLineageItem(input1));
+   ec.setScalarOutput(output_name, new 
StringObject(Explain.explain(li)));
break;
}
default: {
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java
index 3eecb80..a1c3568 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/ComputationCPInstruction.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysds.runtime.instructions.cp;
 
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Types.ExecMode;
 import org.ap

[systemml] branch master updated: [MINOR] Cache Python pip and apt dependencies

2020-05-15 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 9bc5232  [MINOR] Cache Python pip and apt dependencies
9bc5232 is described below

commit 9bc52328f5535492d50cca811a67bd81829220ce
Author: Sebastian 
AuthorDate: Fri May 15 22:53:58 2020 +0200

[MINOR] Cache Python pip and apt dependencies

Closes #913.
---
 .github/workflows/python.yml | 26 ++
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 156d843..c0002b4 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -53,9 +53,19 @@ jobs:
   with:
 path: ~/.m2/repository
 key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
-restore-keys: |
-  ${{ runner.os }}-maven-
-  
+
+- name: Cache Pip Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.cache/pip
+key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ 
hashFiles('src/main/python/setup.py') }}
+
+- name: Cache Deb Dependencies
+  uses: actions/cache@v1
+  with:
+path: /var/cache/apt/archives
+key: ${{ runner.os }}-${{ hashFiles('.github/workflows/python.yml') }}
+
 - name: Maven clean & package
   run: mvn clean package -P distribution
 
@@ -65,15 +75,7 @@ jobs:
 python-version: ${{ matrix.python-version }}
 architecture: 'x64'
 
-- name: Cache Pip Dependencies
-  uses: actions/cache@v1
-  with:
-path: ~/.cache/pip
-key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ 
hashFiles('src/main/python/setup.py') }}
-restore-keys: |
-  ${{ runner.os }}-pip-${{ matrix.python-version }}-
-
-- name: Install protobuf
+- name: Install Protobuf
   run: sudo apt-get install protobuf-compiler libprotoc-dev 
   
 - name: Install pip Dependencies



[systemml] branch master updated: [MINOR] Fix missing licenses and build rat check

2020-05-15 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 99a7271  [MINOR] Fix missing licenses and build rat check
99a7271 is described below

commit 99a7271a12a8d30e9f604cb46db5c6c6ee20d241
Author: Sebastian 
AuthorDate: Fri May 15 22:49:56 2020 +0200

[MINOR] Fix missing licenses and build rat check

ignore __pycache__ folders for rat
add license missing in compression tests
add license missing in scripts - __init__.py files
rat check for build workflow

Closes #912.
---
 .github/workflows/build.yml  |  2 +-
 pom.xml  |  1 +
 scripts/staging/slicing/__init__.py  | 20 
 scripts/staging/slicing/base/__init__.py | 20 
 scripts/staging/slicing/tests/__init__.py| 20 
 .../staging/slicing/tests/classification/__init__.py | 20 
 scripts/staging/slicing/tests/regression/__init__.py | 20 
 .../compress/colgroup/JolEstimateDDCTest.java| 19 +++
 .../component/compress/colgroup/JolEstimateTest.java | 19 +++
 9 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0ae7f82..ac762cd 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -48,4 +48,4 @@ jobs:
   ${{ runner.os }}-maven-
 
 - name: Build
-  run: mvn package
+  run: mvn package -P rat
diff --git a/pom.xml b/pom.xml
index 6d6ab8d..bbc3088 100644
--- a/pom.xml
+++ b/pom.xml
@@ -509,6 +509,7 @@

**/*.mtx

**/*.mtd

**/*.out
+   
**/__pycache__/**

**/part-*

**/*.keep

**/target/**
diff --git a/scripts/staging/slicing/__init__.py 
b/scripts/staging/slicing/__init__.py
index e69de29..e66abb4 100644
--- a/scripts/staging/slicing/__init__.py
+++ b/scripts/staging/slicing/__init__.py
@@ -0,0 +1,20 @@
+# -
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -
diff --git a/scripts/staging/slicing/base/__init__.py 
b/scripts/staging/slicing/base/__init__.py
index e69de29..e66abb4 100644
--- a/scripts/staging/slicing/base/__init__.py
+++ b/scripts/staging/slicing/base/__init__.py
@@ -0,0 +1,20 @@
+# -
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -
diff --git a/scripts/staging/slicing/tests/__init__.py 
b/scripts/staging/slicing/tests/__init__.py
index e69de29..e66abb4 100644
--- a/scripts/staging/slicing/tests/__init__.py
+++ b/scripts/staging

[systemml] branch master updated: [SYSTEMDS-263] ONNX graph importer (Python API, docs, tests)

2020-05-14 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 0ac0c25  [SYSTEMDS-263] ONNX graph importer (Python API, docs, tests)
0ac0c25 is described below

commit 0ac0c2571b39e96f7a117fd317d73443632f6f26
Author: Lukas Timpl 
AuthorDate: Thu May 14 23:39:04 2020 +0200

[SYSTEMDS-263] ONNX graph importer (Python API, docs, tests)

This PR implements a first poc-implementation for an ONNX importer.

It adds support for the following operators: Add, Sub, MatMul, Neg, Xor,
Or, And, Relu, Tanh, Sigmoid, Softmax, Dropout, MaxPool, Conv, If; as
well as the logic for nested sub-graphs.

AMLS project SS 2020
Closes #904.
---
 .github/workflows/python.yml   |  17 +-
 .gitignore |   3 +
 docs/Tasks.txt |   3 +-
 docs/onnx-systemds-design.md   |  46 --
 pom.xml|   1 +
 .../python/docs/source/assets/sample_graph.png | Bin 0 -> 35508 bytes
 src/main/python/docs/source/index.rst  |   8 +
 src/main/python/docs/source/onnx_systemds.rst  |  59 +++
 .../python/docs/source/onnx_systemds_design.rst| 217 ++
 src/main/python/systemds/__init__.py   |   2 +-
 src/main/python/systemds/onnx_systemds/README.md   |  22 +
 src/main/python/systemds/onnx_systemds/__init__.py |  14 +
 src/main/python/systemds/onnx_systemds/convert.py  |  53 +++
 .../python/systemds/onnx_systemds/onnx_helper.py   | 218 ++
 .../python/systemds/onnx_systemds/operator_gen.py  | 465 +
 src/main/python/systemds/onnx_systemds/render.py   | 215 ++
 .../templates/graph_function.dml.jinja |  54 +++
 .../onnx_systemds/templates/graph_header.dml.jinja |  22 +
 .../onnx_systemds/templates/main.dml.jinja |  26 ++
 .../templates/matrix_initialize.dml.jinja  |  24 ++
 .../onnx_systemds/templates/model_header.dml.jinja |  36 ++
 .../templates/module_import.dml.jinja  |  17 +
 .../operators/2input_1output_operator.dml.jinja|  18 +
 .../templates/operators/function_call.dml.jinja|  31 ++
 .../templates/operators/if_operator.dml.jinja  |  19 +
 .../templates/operators/neg.dml.jinja  |  18 +
 .../onnx_systemds/templates/util.dml.jinja |  42 ++
 src/main/python/systemds/onnx_systemds/util.py |  40 ++
 src/main/python/{systemds => tests}/__init__.py|   5 -
 .../python/{systemds => tests/onnx}/__init__.py|   4 -
 .../dml_wrapper/simple_conv_layer_2_wrapper.dml|  27 ++
 .../onnx/dml_wrapper/simple_conv_layer_wrapper.dml |  25 ++
 .../dml_wrapper/simple_dropout_layer_wrapper.dml   |  22 +
 .../onnx/dml_wrapper/simple_if_graph_wrapper.dml   |  27 ++
 .../dml_wrapper/simple_mat_add_mul_sub_wrapper.dml |  24 ++
 .../onnx/dml_wrapper/simple_mat_add_wrapper.dml|  24 ++
 .../dml_wrapper/simple_mat_initialized_wrapper.dml |  21 +
 .../dml_wrapper/simple_maxpool_layer_wrapper.dml   |  22 +
 .../simple_relu_tanh_sigmoid_softmax_wrapper.dml   |  27 ++
 .../simple_conv_layer_2_reference.out  |   5 +
 .../simple_conv_layer_reference.out|  25 ++
 .../output_reference/simple_if_graph_reference.out |   5 +
 .../simple_mat_add_mul_sub_reference.out   |   4 +
 .../output_reference/simple_mat_add_reference.out  |   4 +
 .../simple_mat_initialized_reference.out   |   9 +
 .../simple_maxpool_layer_reference.out |  25 ++
 .../simple_relu_tanh_sigmoid_softmax_reference.out |  11 +
 .../tests/onnx/test_models/model_generate.py   | 388 +
 src/main/python/tests/onnx/test_simple.py  |  65 +++
 src/main/python/tests/onnx/util.py |  84 
 50 files changed, 2485 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 27c12ec..156d843 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -72,9 +72,12 @@ jobs:
 key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ 
hashFiles('src/main/python/setup.py') }}
 restore-keys: |
   ${{ runner.os }}-pip-${{ matrix.python-version }}-
+
+- name: Install protobuf
+  run: sudo apt-get install protobuf-compiler libprotoc-dev 
   
 - name: Install pip Dependencies
-  run: pip install numpy py4j wheel scipy sklearn
+  run: pip install numpy py4j wheel scipy sklearn jinja2 onnx
 
 - name: Build Python Package
   run: |
@@ -97,3 +100,15 @@ jobs:
 cd src/main/python
 python -m unittest tests/lineage/*.py
 echo "Exit Status: " $?
+
+- name: Run onnx-systemds python tests
+  run: |
+export SYSTEMDS_ROOT=$(pwd)
+export 

[systemml] branch master updated: [MINOR] Avoid unnecessary overhead in createvar instructions

2020-05-14 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 30d5c40  [MINOR] Avoid unnecessary overhead in createvar instructions
30d5c40 is described below

commit 30d5c408b900b1aa4c8ddeb2f1264b830f460a05
Author: Matthias Boehm 
AuthorDate: Thu May 14 18:20:17 2020 +0200

[MINOR] Avoid unnecessary overhead in createvar instructions

This patch makes a minor performance improvement to the createvar
instruction execution (which happens for every non-scalar operator). In
detail, the need for creating unique file names (from one instruction),
led to unnecessary string concatenation and thus object allocation. We
now reuse the existing thread-local string builders as used for
instruction generation.

On a special-case scenario with ~1M loop iterations over tiny data (100
values), this patch improved the createvar overhead from 22.1s to 5.6s
(and overall from 49s to 33s).
---
 .../sysds/runtime/instructions/InstructionUtils.java |  8 
 .../runtime/instructions/cp/VariableCPInstruction.java   | 16 ++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java 
b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java
index a47d6de..f1c8dc6 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/InstructionUtils.java
@@ -1008,4 +1008,12 @@ public class InstructionUtils
sb.append(inputs[inputs.length-1]);
return sb.toString();
}
+   
+   public static String concatStrings(String... inputs) {
+   StringBuilder sb = _strBuilders.get();
+   sb.setLength(0); //reuse allocated space
+   for( int i=0; i obj = new 
TensorObject(getInput1().getValueType(), fname);
//clone meta data because it is updated on 
copy-on-write, otherwise there
//is potential for hidden side effects between 
variables.
@@ -560,6 +558,8 @@ public class VariableCPInstruction extends CPInstruction 
implements LineageTrace
}
else if( getInput1().getDataType() == DataType.FRAME ) {
String fname = getInput2().getName();
+   if( Boolean.parseBoolean(getInput3().getName()) 
)
+   fname = getUniqueFileName(fname);
FrameObject fobj = new FrameObject(fname);
fobj.setMetaData((MetaData)metadata.clone());
fobj.setFileFormatProperties(_formatProperties);
@@ -1257,4 +1257,8 @@ public class VariableCPInstruction extends CPInstruction 
implements LineageTrace
|| opcode == VariableOperationCode.CastAsDoubleVariable
|| opcode == 
VariableOperationCode.CastAsBooleanVariable;
}
+   
+   public static String getUniqueFileName(String fname) {
+   return InstructionUtils.concatStrings(fname, "_", 
String.valueOf(_uniqueVarID.getNextID()));
+   }
 }



[systemml] 02/02: [SYSTEMDS] Fix and cleanup steplm feature selection built-in

2020-05-12 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit 996f61281c45a428986a89bc14c982ad41af0382
Author: Matthias Boehm 
AuthorDate: Tue May 12 22:35:13 2020 +0200

[SYSTEMDS] Fix and cleanup steplm feature selection built-in

This patch makes several improvements to the existing steplm built-in
function (correctness and performance):

1) So far, the lm parameters were not correctly passed through to the
actual lm call, which for example rendered tol, reg, and icpt parameters
ineffective (except for icpt=1 which was the only one tested).

2) Cleanup of unnecessary operations and control flow

3) Converted the main two for loops of greedy model building to parfor
loops (which required a slightly different analysis of the best model).

On a scenario of a dense 10K x 1K input matrix (with convergence after
20 iterations -> ~21000 lm training calls), this patch improved
performance from 103.9s to 14.4s due to much better utilization (with
fewer barriers) of the available 24 virtual cores.
---
 scripts/builtin/steplm.dml | 254 -
 1 file changed, 113 insertions(+), 141 deletions(-)

diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml
index 608a477..fd0018d 100644
--- a/scripts/builtin/steplm.dml
+++ b/scripts/builtin/steplm.dml
@@ -60,170 +60,138 @@
 # STDEV_TOT_Y   Standard Deviation of the response value Y
 # AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual 
bias
 
-m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, 
Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
-return(Matrix[Double] C, Matrix[Double] S) {
-
-  # currently only the forward selection strategy in supported: start
-  # from one feature and iteratively add features until AIC improves
-  dir = "forward";
+m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, 
+  Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = 
TRUE)
+  return(Matrix[Double] B, Matrix[Double] S)
+{
+  if( icpt!=0 & icpt!=1 & icpt!=2 )
+stop("Invalid steplm invocation with icpt="+icpt+" (valid values: 
0,1,2).");
+
+  # NOTE: currently only the forward selection strategy in supported:
+  # start from one feature and iteratively add features until AIC improves
   thr = 0.001;
 
   print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
-  print("Reading X and Y...");
   X_orig = X;
   n = nrow(X_orig);
   m_orig = ncol(X_orig);
 
   # BEGIN STEPWISE LINEAR REGRESSION
-  if (dir == "forward") {
+  columns_fixed = matrix(0, 1, m_orig);
+  columns_fixed_ordered = matrix(0, 1, 1);
+  
+  # X_global stores the best model found at each step
+  X_global = matrix(0, n, 1);
+  
+  if (icpt == 1 | icpt == 2) {
+beta = mean(y);
+AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
+  } else {
+beta = 0.0;
+AIC_best_orig = n * log(sum(y ^ 2) / n);
+  }
+  print("Best AIC without any features: " + AIC_best_orig);
+  boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
+  beta_out_all = matrix(0, boa_ncol, m_orig);
+
+  # First pass to examine single features
+  AICs = matrix(0, 1, m_orig);
+  parfor (i in 1:m_orig, check = 0) {
+[AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, 
maxi, verbose);
+AICs[1, i] = AIC_1;
+beta_out_all[1:nrow(beta_out_i), i] = beta_out_i;
+  }
+  AIC_best = min(min(AICs), AIC_best_orig);
+  AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
+  column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 0);
+
+  # beta best so far
+  beta_best = beta_out_all[, column_best];
+  if (column_best == 0) {
+print("AIC of an empty model is " + AIC_best + " and adding no feature 
achieves more than " + (thr * 100) + "% decrease in AIC!");
+B = matrix(0, m_orig, 1);
+if (icpt != 0)
+  B = rbind(B, as.matrix(beta));
+S = matrix(0, 1, 1);
+  }
+  else {
+
+print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+
+columns_fixed[1, column_best] = 1;
+columns_fixed_ordered[1, 1] = column_best;
+X_global = X_orig[, column_best];
+
 continue = TRUE
-columns_fixed = matrix(0, 1, m_orig);
-columns_fixed_ordered = matrix(0, 1, 1);
-
-   # X_global stores the best model found at each step
-X_global = matrix(0, n, 1);
-
-   if (icpt == 1 | icpt == 2) {
-  beta = mean(y);
-  AIC_best = 2 + n * log(sum((beta - y) ^ 2) / n);
-} else {
-  beta = 0.0;
-  AIC_best = n * log(sum(y ^ 2) / n);
-}
-AICs = matrix(AIC_best, 1, m_orig);
-print("Best AIC without any features: " + AIC_best);

[systemml] branch master updated (5e726cf -> 996f612)

2020-05-12 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git.


from 5e726cf  [SYSTEMDS-55] Fix file format handling, docs, and github test 
config
 new 96a719b  [SYSTEMDS-238] Fix lineage merge on parfor w/ conditional 
control flow
 new 996f612  [SYSTEMDS] Fix and cleanup steplm feature selection built-in

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 scripts/builtin/steplm.dml | 254 +
 .../runtime/controlprogram/ParForProgramBlock.java |   3 +-
 2 files changed, 115 insertions(+), 142 deletions(-)



[systemml] 01/02: [SYSTEMDS-238] Fix lineage merge on parfor w/ conditional control flow

2020-05-12 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit 96a719bc384f0c60dc1994be49d72d91d2031dea
Author: Matthias Boehm 
AuthorDate: Tue May 12 22:26:14 2020 +0200

[SYSTEMDS-238] Fix lineage merge on parfor w/ conditional control flow

This patch makes a minor robustness fix to the parfor lineage merge for
the case that certain workers did not make any updates of result
variables due to conditional control flow in the parfor body.
---
 .../org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java| 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java
index 1c98c34..812cf2c 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/ParForProgramBlock.java
@@ -1351,7 +1351,8 @@ public class ParForProgramBlock extends ForProgramBlock
LineageItem current = lineages[0].get(var._name);
for( int i=1; i

[systemml] branch master updated: [SYSTEMDS-253] Distributed slice finding (task/data parallel, fixes)

2020-05-03 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 8cbc85a  [SYSTEMDS-253] Distributed slice finding (task/data parallel, 
fixes)
8cbc85a is described below

commit 8cbc85a949b3699cde8ed3cf3e3abec6a27fbc60
Author: gilgenbergg 
AuthorDate: Sun May 3 17:17:57 2020 +0200

[SYSTEMDS-253] Distributed slice finding (task/data parallel, fixes)

Closes #881.
---
 docs/Tasks.txt |   3 +-
 scripts/staging/hmm/HMM.py |   2 -
 scripts/staging/slicing/__init__.py|   0
 scripts/staging/slicing/base/Bucket.py | 168 +
 .../staging/slicing/base/{node.py => SparkNode.py} |  79 ++
 scripts/staging/slicing/base/__init__.py   |   0
 scripts/staging/slicing/base/node.py   |  28 +++-
 scripts/staging/slicing/base/slicer.py | 135 +
 .../base/tests/classification/test_adult.py| 101 -
 .../slicing/base/tests/classification/test_iris.py |  88 ---
 .../base/tests/regression/test_insurance.py|  81 --
 .../slicing/base/tests/regression/test_salary.py   |  87 ---
 scripts/staging/slicing/base/top_k.py  |   7 +-
 scripts/staging/slicing/base/union_slicer.py   |  78 --
 .../slicing/spark_modules/join_data_parallel.py| 120 +++
 .../staging/slicing/spark_modules/spark_slicer.py  | 100 
 .../slicing/spark_modules/spark_union_slicer.py|  70 +
 .../staging/slicing/spark_modules/spark_utils.py   | 141 +
 .../slicing/spark_modules/union_data_parallel.py   | 119 +++
 scripts/staging/slicing/tests/__init__.py  |   0
 .../slicing/tests/classification/__init__.py   |   0
 .../slicing/tests/classification/sparked_adults.py | 118 +++
 .../slicing/tests/classification/test_adult.py | 121 +++
 .../slicing/tests/classification/test_iris.py  | 109 +
 .../staging/slicing/tests/regression/__init__.py   |   0
 .../slicing/tests/regression/bd_spark_salary.py| 131 
 .../slicing/tests/regression/spark_salary.py   | 123 +++
 .../slicing/tests/regression/test_insurance.py | 103 +
 .../slicing/tests/regression/test_salary.py| 104 +
 29 files changed, 1717 insertions(+), 499 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 9d2dba4..9fa9a6f 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -203,7 +203,8 @@ SYSTEMDS-240 GPU Backend Improvements
 
 SYSTEMDS-250 Extended Slice Finding
  * 251 Alternative slice enumeration approach OK
- * 252 Initial data slicing implementation Python
+ * 252 Initial data slicing implementation Python OK
+ * 253 Distributed slicing algorithms (task/data parallel)OK
 
 SYSTEMDS-260 Misc Tools
  * 261 Stable marriage algorithm  OK
diff --git a/scripts/staging/hmm/HMM.py b/scripts/staging/hmm/HMM.py
index 61fa0d0..d9eb187 100644
--- a/scripts/staging/hmm/HMM.py
+++ b/scripts/staging/hmm/HMM.py
@@ -19,8 +19,6 @@
 #
 #-
 
-#Author: Afan Secic
-
 from bs4 import BeautifulSoup,SoupStrainer
 import nltk
 from nltk.tokenize import sent_tokenize, word_tokenize
diff --git a/scripts/staging/slicing/__init__.py 
b/scripts/staging/slicing/__init__.py
new file mode 100644
index 000..e69de29
diff --git a/scripts/staging/slicing/base/Bucket.py 
b/scripts/staging/slicing/base/Bucket.py
new file mode 100644
index 000..0277f6d
--- /dev/null
+++ b/scripts/staging/slicing/base/Bucket.py
@@ -0,0 +1,168 @@
+#-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-
+
+class Bucket:
+
+key: []
+attributes: []
+name: ""
+error: float
+  

[systemml] branch master updated: [SYSTEMDS-335] Weighted eviction policy in lineage cache

2020-05-03 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 6e811d7  [SYSTEMDS-335] Weighted eviction policy in lineage cache
6e811d7 is described below

commit 6e811d75facf0a6cbff0ee9ff93c15beedc1302f
Author: arnabp 
AuthorDate: Sun May 3 16:26:05 2020 +0200

[SYSTEMDS-335] Weighted eviction policy in lineage cache

This patch contains a new eviction policy for lineage cache. A min-heap
based priority queue over a function of computation time and
size is maintained to define the order of evictions. The idea is to evict
large matrices, which take little time to recompute. This weighted
scheme significantly reduces the number of evictions (including disk
spilling). This patch also refactors the LineageCache class to hide the
eviction policy related maintenance.

Closes #905.
---
 docs/Tasks.txt |   2 +-
 .../instructions/cp/FunctionCallCPInstruction.java |   2 +
 .../apache/sysds/runtime/lineage/LineageCache.java | 430 ++---
 .../sysds/runtime/lineage/LineageCacheConfig.java  |  20 +
 .../sysds/runtime/lineage/LineageCacheEntry.java   | 112 ++
 .../runtime/lineage/LineageCacheEviction.java  | 371 ++
 6 files changed, 544 insertions(+), 393 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 42b2b3e..9d2dba4 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -262,7 +262,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration
  * 332 Parfor integration with multi-level reuse  OK
  * 333 Improve cache eviction with actual compute timeOK
  * 334 Cache scalars only with atleast one matrix inputs
- * 335 Weighted eviction policy (function of size & computetime)
+ * 335 Weighted eviction policy (function of size & computetime)  OK
  * 336 Better use of cache status to handle multithreading
  * 337 Adjust disk I/O speed by recording actual time taken   OK
  * 338 Extended lineage tracing (rmEmpty, lists), partial rewritesOK
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java
index def4859..3c4e1a9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -231,6 +231,8 @@ public class FunctionCallCPInstruction extends 
CPInstruction {
if( DMLScript.LINEAGE && LineageCacheConfig.isMultiLevelReuse() 
) {
LineageCache.putValue(fpb.getOutputParams(), liInputs, 
getCacheFunctionName(_functionName, 
fpb), ec, t1-t0);
+   //FIXME: send _boundOutputNames instead of 
fpb.getOutputParams as 
+   //those are already replaced by boundoutput names in 
the lineage map.
}
}
 
diff --git a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
index a42a376..32e5585 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCache.java
@@ -38,15 +38,12 @@ import org.apache.sysds.runtime.instructions.cp.Data;
 import org.apache.sysds.runtime.instructions.cp.MMTSJCPInstruction;
 import 
org.apache.sysds.runtime.instructions.cp.ParameterizedBuiltinCPInstruction;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCacheStatus;
 import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
 import org.apache.sysds.runtime.matrix.data.InputInfo;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.data.OutputInfo;
 import org.apache.sysds.runtime.meta.MetaDataFormat;
-import org.apache.sysds.runtime.util.LocalFileUtils;
 
-import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -55,20 +52,13 @@ import java.util.Map;
 
 public class LineageCache
 {
-   private static final Map _cache = new HashMap<>();
-   private static final Map _spillList = new 
HashMap<>();
-   private static final HashSet _removelist = new HashSet<>();
+   private static final Map _cache = new 
HashMap<>();
private static final double CACHE_FRAC = 0.05; // 5% of JVM heap size
-   private static final long CACHE_LIMIT; //limit in bytes
-   private static final boolean DEBUG = false;
-   private static String _outdir = null;
-   private static long _cachesize

[systemml] branch master updated: [MINOR] Fix unnecessary date handling in MLContext API

2020-05-03 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 1c40be6  [MINOR] Fix unnecessary date handling in MLContext API
1c40be6 is described below

commit 1c40be6e31975ddc8e734b6613fdb32997ba0439
Author: bd2019us 
AuthorDate: Sun May 3 16:15:33 2020 +0200

[MINOR] Fix unnecessary date handling in MLContext API

Closes #862.
---
 src/main/java/org/apache/sysds/api/mlcontext/MLContext.java | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java 
b/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java
index bfcc491..9d0b55b 100644
--- a/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java
+++ b/src/main/java/org/apache/sysds/api/mlcontext/MLContext.java
@@ -19,7 +19,6 @@
  
 package org.apache.sysds.api.mlcontext;
 
-import java.util.Date;
 import java.util.Set;
 
 import org.apache.log4j.Logger;
@@ -330,9 +329,8 @@ public class MLContext implements ConfigurableAPI
try {
executionScript = script;
 
-   Long time = new Long((new Date()).getTime());
if ((script.getName() == null) || 
(script.getName().equals(""))) {
-   script.setName(time.toString());
+   
script.setName(String.valueOf(System.currentTimeMillis()));
}
 
MLResults results = scriptExecutor.execute(script);



[systemml] branch master updated: [MINOR] Various improvements of data cleaning built-in primitives

2020-05-03 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 8fbcd75  [MINOR] Various improvements of data cleaning built-in 
primitives
8fbcd75 is described below

commit 8fbcd758674a07fa0a0f41be2ecea110b53691cc
Author: Shafaq Siddiqi 
AuthorDate: Sun May 3 14:50:43 2020 +0200

[MINOR] Various improvements of data cleaning built-in primitives

Closes #901.
---
 scripts/builtin/mice.dml   | 27 ---
 scripts/builtin/multiLogReg.dml|  3 +-
 scripts/builtin/outlierByIQR.dml   | 20 +++--
 scripts/builtin/outlierBySd.dml| 20 +++--
 scripts/builtin/winsorize.dml  |  7 ++
 .../test/functions/builtin/BuiltinMiceTest.java| 91 +++---
 .../functions/builtin/BuiltinOutlierByIQRTest.java |  9 +++
 .../functions/builtin/BuiltinOutlierBySDTest.java  | 21 +++--
 .../functions/builtin/BuiltinWinsorizeTest.java|  4 +-
 src/test/scripts/functions/builtin/mice.R  | 85 +---
 src/test/scripts/functions/builtin/mice.dml| 28 +--
 .../scripts/functions/builtin/outlier_by_IQR.dml   |  2 +-
 .../scripts/functions/builtin/outlier_by_sd.dml|  2 +-
 src/test/scripts/functions/builtin/winsorize.R |  4 +-
 .../scripts/functions/caching/BufferpoolLeak.dml   |  2 +-
 15 files changed, 220 insertions(+), 105 deletions(-)

diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 99d2be2..b00d542 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -26,6 +26,7 @@
 # NAMETYPEDEFAULT MEANING
 # 
-
 # F   String---Data Frame
+# cMask   Double---A 0/1 row vector for identifying 
numeric (0) adn categorical features (1)
 # iterInteger3 Number of iteration for multiple 
imputations 
 # completeInteger3 A complete dataset generated though a 
specific iteration
 # 
-
@@ -40,17 +41,21 @@
 
 # Assumption missing value are represented with empty string i.e ",," in csv 
file  
 # variables with suffix n are storing continous/numeric data and variables 
with suffix c are storing categorical data
-s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, 
Integer complete = 3)
+s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, 
Integer complete = 3, Boolean verbose = FALSE)
 return(Frame[String] dataset, Frame[String] singleSet)
 {
 
   if(ncol(F) == 1)
 stop("invalid aregument: can not apply mice on single column")
+
+  if(complete > iter)
+complete = iter
 
-  # adding a temporary categorical feature (in-case all attributes are 
continous)
+
+  # adding a temporary  feature (in-case all attributes are of same type)
   F = cbind(F,  as.frame(matrix(1,nrow(F), 1)))
   cMask = cbind(cMask, matrix(1,1,1))
-  
+
   n = nrow(F)
   row = n*complete;
   col = ncol(F) 
@@ -58,6 +63,10 @@ return(Frame[String] dataset, Frame[String] singleSet)
   Mask_Result = matrix(0, rows=1, cols=col)
   scat = seq(1, ncol(cMask))
   cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
+
+  if(nrow(cat) == ncol(F))
+cMask[1,ncol(cMask)] = 0
+  
   s=""
   for(i in 1: nrow(cat), check =0)
 s = s+as.integer(as.scalar(cat[i, 1]))+",";
@@ -168,7 +177,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
 in_n = in_n + 1;
   }
  
-  if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0))
+  if( (as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0) )
   {
 j = (i + as.scalar(dist[1,in_c])) - 1
 
@@ -199,8 +208,8 @@ return(Frame[String] dataset, Frame[String] singleSet)
   Mask_Filled_c[,in_c] = table(R, 1, pred, n, 1);
 i = as.integer(j)
   }
-  
-  in_c = in_c + 1
+  if(in_c < col)
+in_c = in_c + 1
   i = i+1;
 }
 
@@ -214,7 +223,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   Result = Result[2: n*iter+1, ]
   Mask_Result = Mask_Result[2: n*iter+1, ]
   index = (((complete*n)-n)+1)
-  #voting for aggregation of categorical imputations
+  # voting for aggregation of categorical imputations
   agg = cAggregate(Mask_Result*cMask, iter, n)
   
   # aggregating the results
@@ -229,11 +238,11 @@ return(Frame[String] dataset, Frame[String] singleSet)
   dataset =   XO + Agg_Matrix
   singleSet = Result[index:row, ]

-  # # decoding nominal columns 
+  # decoding nominal columns 
   dataset = transformdecode(target=dataset, spec=jspecR, meta=M);
   s

[systemml] branch master updated: [MINOR] Fix failing component tests (due to excessive log output)

2020-05-01 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 97cf552  [MINOR] Fix failing component tests (due to excessive log 
output)
97cf552 is described below

commit 97cf5523cf2443e9a462b4fa1321735dc8c60285
Author: Matthias Boehm 
AuthorDate: Sat May 2 00:53:16 2020 +0200

[MINOR] Fix failing component tests (due to excessive log output)

Closes #906.
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 4fad885..4c25f07 100644
--- a/pom.xml
+++ b/pom.xml
@@ -262,7 +262,7 @@
1C
-Xms4g -Xmx4g
false
-   plain
+   brief
true





[systemml] branch master updated: [SYSTEMDS-391] New built-in GLM function (Generalized Linear Model)

2020-04-30 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new c7ce2ff  [SYSTEMDS-391] New built-in GLM function (Generalized Linear 
Model)
c7ce2ff is described below

commit c7ce2ff95f306284363faf64321b1d2f36bbbeb4
Author: Shafaq Siddiqi 
AuthorDate: Thu Apr 30 22:30:11 2020 +0200

[SYSTEMDS-391] New built-in GLM function (Generalized Linear Model)

Closes #888.
---
 docs/Tasks.txt |3 +
 scripts/algorithms/StepGLM.dml |   16 +-
 scripts/builtin/glm.dml| 1118 
 .../java/org/apache/sysds/common/Builtins.java |1 +
 src/test/java/org/apache/sysds/test/TestUtils.java |  383 +++
 .../apache/sysds/test/applications/GLMTest.java|  417 +---
 .../test/functions/builtin/BuiltinGLMTest.java |  269 +
 src/test/scripts/functions/builtin/glmTest.R   |  139 +++
 src/test/scripts/functions/builtin/glmTest.dml |   25 +
 9 files changed, 1955 insertions(+), 416 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 35d07e6..86ab8fe 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -290,5 +290,8 @@ SYSTEMDS-370 Lossy Compression Blocks
 SYSTEMDS-380 Memory Footprint
  * 371 Matrix Block Memory footprint update
 
+SYSTEMDS-390 New Builtin Functions IV
+ * 391 New GLM builtin-in function (from algorithms)  OK
+
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/scripts/algorithms/StepGLM.dml b/scripts/algorithms/StepGLM.dml
index 2ce5a1b..213f373 100644
--- a/scripts/algorithms/StepGLM.dml
+++ b/scripts/algorithms/StepGLM.dml
@@ -127,18 +127,18 @@ if (dir == "forward") {
  
if (intercept_status == 0) {
# Compute AIC of an empty model with no features and no 
intercept (all Ys are zero)
-   [AIC_best] = glm (X_global, Y, 0, num_features, 
columns_fixed_ordered, " ");
+   [AIC_best] = glm_fit (X_global, Y, 0, num_features, 
columns_fixed_ordered, " ");
} else {
# compute AIC of an empty model with only intercept (all Ys are 
constant)
all_ones = matrix (1, rows = num_records, cols = 1);
-   [AIC_best] = glm (all_ones, Y, 0, num_features, 
columns_fixed_ordered, " ");
+   [AIC_best] = glm_fit (all_ones, Y, 0, num_features, 
columns_fixed_ordered, " ");
}
print ("Best AIC without any features: " + AIC_best);
   
# First pass to examine single features
AICs = matrix (AIC_best, rows = 1, cols = num_features);
parfor (i in 1:num_features) {  
-   [AIC_1] = glm (X_orig[,i], Y, intercept_status, num_features, 
columns_fixed_ordered, " ");
+   [AIC_1] = glm_fit (X_orig[,i], Y, intercept_status, 
num_features, columns_fixed_ordered, " ");
AICs[1,i] = AIC_1;
}
   
@@ -156,11 +156,11 @@ if (dir == "forward") {
print ("AIC of an empty model is " + AIC_best + " and adding no 
feature achieves more than " + (thr * 100) + "% decrease in AIC!");
if (intercept_status == 0) {
# Compute AIC of an empty model with no features and no 
intercept (all Ys are zero)
-   [AIC_best] = glm (X_global, Y, 0, num_features, 
columns_fixed_ordered, fileB);
+   [AIC_best] = glm_fit (X_global, Y, 0, num_features, 
columns_fixed_ordered, fileB);
} else {
# compute AIC of an empty model with only intercept 
(all Ys are constant)
###all_ones = matrix (1, rows = num_records, cols = 1);
-   [AIC_best] = glm (all_ones, Y, 0, num_features, 
columns_fixed_ordered, fileB);
+   [AIC_best] = glm_fit (all_ones, Y, 0, num_features, 
columns_fixed_ordered, fileB);
}
};
   
@@ -177,7 +177,7 @@ if (dir == "forward") {
# Construct the feature matrix
X = cbind (X_global, X_orig[,i]);
 
-   [AIC_2] = glm (X, Y, intercept_status, 
num_features, columns_fixed_ordered, " ");
+   [AIC_2] = glm_fit (X, Y, intercept_status, 
num_features, columns_fixed_ordered, " ");
AICs[1,i] = AIC_2;
}   
}
@@ -209,7 +209,7 @@ if (dir == "forward") {
   
# run GLM with selected set of features
print ("Running GLM with selected features...");
-   [AIC] = glm (X_global, Y, intercept_status, num_

[systemml] branch master updated: [SYSTEMDS-338] Extended lineage tracing and partial reuse

2020-04-30 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new dc01c4d  [SYSTEMDS-338] Extended lineage tracing and partial reuse
dc01c4d is described below

commit dc01c4db8b46bc413ceabf4998e0b8d44969db73
Author: arnabp 
AuthorDate: Thu Apr 30 22:01:51 2020 +0200

[SYSTEMDS-338] Extended lineage tracing and partial reuse

This patch contains
- two new partial rewrites which are specializations of existing
rewrites,
- bug fixes and optimizations in partial rewrites,
- lineage tracing for removeEmpty,
- a new test class to test algorithms and builtins with reuse,
- extension of lineage tracing of list objects.
Note that lineage doesn't work with most of the list handling methods
today. Due to that the current generalized grid search builtin is far
from working with lineage framework.

Closes #897.
---
 docs/Tasks.txt |   1 +
 .../instructions/cp/DataGenCPInstruction.java  |   4 +
 .../instructions/cp/ListIndexingCPInstruction.java |   7 +
 .../cp/ParameterizedBuiltinCPInstruction.java  |   8 +
 .../apache/sysds/runtime/lineage/LineageCache.java |   4 +-
 .../sysds/runtime/lineage/LineageCacheConfig.java  |   9 +-
 .../apache/sysds/runtime/lineage/LineageMap.java   |   3 +-
 .../sysds/runtime/lineage/LineageRewriteReuse.java | 244 +
 .../functions/lineage/FunctionFullReuseTest.java   |   5 -
 ...tionFullReuseTest.java => LineageReuseAlg.java} |  60 ++---
 .../test/functions/lineage/LineageRewriteTest.java |  10 +-
 ...FunctionFullReuse5.dml => LineageReuseAlg1.dml} |   2 -
 .../scripts/functions/lineage/LineageReuseAlg2.dml |  60 +
 .../{FunctionFullReuse5.dml => RewriteTest13.dml}  |  24 +-
 14 files changed, 333 insertions(+), 108 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 97fa914..35d07e6 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -264,6 +264,7 @@ SYSTEMDS-330 Lineage Tracing, Reuse and Integration
  * 335 Weighted eviction policy (function of size & computetime)
  * 336 Better use of cache status to handle multithreading
  * 337 Adjust disk I/O speed by recording actual time taken   OK
+ * 338 Extended lineage tracing (rmEmpty, lists), partial rewrites   OK
  
 SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
  * 341 Finalize unmarking of loop dependent operations
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
index c29c539..7a37608 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/DataGenCPInstruction.java
@@ -164,6 +164,10 @@ public class DataGenCPInstruction extends 
UnaryCPInstruction {
public long getSeed() {
return seed;
}
+   
+   public boolean isOnesCol() {
+   return minValue == maxValue && minValue == 1 && sparsity == 1 
&& getCols() == 1;
+   }
 
public static DataGenCPInstruction parseInstruction(String str)
{
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java
index f15ad65..523eceb 100644
--- 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java
+++ 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/ListIndexingCPInstruction.java
@@ -25,6 +25,8 @@ import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.lineage.LineageItem;
+import org.apache.sysds.runtime.lineage.LineageItemUtils;
 
 public final class ListIndexingCPInstruction extends IndexingCPInstruction {
 
@@ -93,4 +95,9 @@ public final class ListIndexingCPInstruction extends 
IndexingCPInstruction {
else
throw new DMLRuntimeException("Invalid opcode (" + 
opcode +") encountered in ListIndexingCPInstruction.");
}
+   @Override
+   public LineageItem[] getLineageItems(ExecutionContext ec) {
+   return new LineageItem[]{new LineageItem(output.getName(), 
getOpcode(),
+   LineageItemUtils.getLineage(ec, 
input1,input2,input3,rowLower,rowUpper))};
+   }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java
 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/Parameteriz

[systemml] branch master updated: [SYSTEMDS-316] Fix Python lm/rand tests (tolerance, workflow)

2020-04-29 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new a7f17b3  [SYSTEMDS-316] Fix Python lm/rand tests (tolerance, workflow)
a7f17b3 is described below

commit a7f17b3d17176ea8339cb5b5bcdd3c5854763761
Author: Julia Le 
AuthorDate: Wed Apr 29 23:48:33 2020 +0200

[SYSTEMDS-316] Fix Python lm/rand tests (tolerance, workflow)

Just a few changes to the lm test case (increasing tolerance) so that the
tc doesn't fail randomly. Remove multiple definitions of run in python
workflow file.

AMLS project SS 2020, part 2.
Closes #902.
---
 .github/workflows/python.yml  |  4 +---
 src/main/python/tests/test_lm.py  |  4 ++--
 src/main/python/tests/test_matrix_rand.py | 30 +-
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 31af9d5..27c12ec 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -74,9 +74,7 @@ jobs:
   ${{ runner.os }}-pip-${{ matrix.python-version }}-
   
 - name: Install pip Dependencies
-  run: pip install numpy py4j wheel
-  run: pip install scipy
-  run: pip install sklearn
+  run: pip install numpy py4j wheel scipy sklearn
 
 - name: Build Python Package
   run: |
diff --git a/src/main/python/tests/test_lm.py b/src/main/python/tests/test_lm.py
index 24abd5c..1bd9ad7 100644
--- a/src/main/python/tests/test_lm.py
+++ b/src/main/python/tests/test_lm.py
@@ -38,7 +38,7 @@ sds = SystemDSContext()
 
 regressor = LinearRegression(fit_intercept=False)
 shape = (random.randrange(1, 30), random.randrange(1, 30))
-eps = 1e-05
+eps = 1e-03
 
 class TestLm(unittest.TestCase):
 def setUp(self):
@@ -60,8 +60,8 @@ class TestLm(unittest.TestCase):
 model.coef_ = model.coef_.reshape(sds_model_weights.shape)
 self.assertTrue(np.allclose(sds_model_weights, model.coef_, eps))
 except Exception as e:
-self.assertTrue(False, "This should not raise an exception!")
 print(e)
+self.assertTrue(False, "This should not raise an exception!")
 
 def test_lm_invalid_shape(self):
 X = np.random.rand(shape[0], 0)
diff --git a/src/main/python/tests/test_matrix_rand.py 
b/src/main/python/tests/test_matrix_rand.py
index d267bca..b1f964b 100644
--- a/src/main/python/tests/test_matrix_rand.py
+++ b/src/main/python/tests/test_matrix_rand.py
@@ -27,14 +27,16 @@ import unittest
 import numpy as np
 import scipy.stats as st
 import random
+import math
 
 path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../")
 sys.path.insert(0, path)
 from systemds.context import SystemDSContext
 
-shape = (random.randrange(1, 50), random.randrange(1, 50))
+shape = (random.randrange(1, 25), random.randrange(1, 25))
+dist_shape = (10, 15)
 min_max = (0, 1)
-sparsity = 0.2
+sparsity = random.uniform(0.0, 1.0)
 seed = 123
 distributions = ["norm", "uniform"]
 
@@ -58,37 +60,31 @@ class TestRand(unittest.TestCase):
 self.assertTrue((m.min() >= min_max[0]) and (m.max() <= min_max[1]))
 
 def test_rand_sparsity(self):
-m = sds.rand(rows=shape[0], cols=shape[1], sparsity=sparsity, 
seed=seed).compute()
-count, bins = np.histogram(m.flatten("F"))
-non_zero_value_percent = sum(count[1:]) * 100 / sum(count)
-e = 0.05
+m = sds.rand(rows=shape[0], cols=shape[1], sparsity=sparsity, 
seed=0).compute()
+non_zero_value_percent = np.count_nonzero(m) * 100 /np.prod(m.shape)
 
-self.assertTrue(
-sum(count) == (shape[0] * shape[1])
-and (non_zero_value_percent >= (sparsity - e) * 100)
-and (non_zero_value_percent <= (sparsity + e) * 100)
-)
+self.assertTrue(math.isclose(non_zero_value_percent, sparsity*100, 
rel_tol=5))
 
 def test_rand_uniform_distribution(self):
 m = sds.rand(
-rows=shape[0],
-cols=shape[1],
+rows=dist_shape[0],
+cols=dist_shape[1],
 pdf="uniform",
 min=min_max[0],
 max=min_max[1],
-seed=seed).compute()
+seed=0).compute()
 
 dist = find_best_fit_distribution(m.flatten("F"), distributions)
 self.assertTrue(dist == "uniform")
 
 def test_rand_normal_distribution(self):
 m = sds.rand(
-rows=shape[0],
-cols=shape[1],
+rows=dist_shape[0],
+cols=dist_shape[1],
 pdf="normal",
 min=min_max[0],
 max=min_max[1],
-seed=seed).compute()
+seed=0).compute()
 
 dist = find_best_fit_distribution(m.flatten("F"), distributions)
 self.assertTrue(dist == "norm")



[systemml] branch master updated: [MINOR] Fix readme badges with links to master branch

2020-04-29 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 9377496  [MINOR] Fix readme badges with links to master branch
9377496 is described below

commit 937749621bf4d79238f52b0baae4d3308a9318fd
Author: Sebastian 
AuthorDate: Wed Apr 29 23:24:10 2020 +0200

[MINOR] Fix readme badges with links to master branch

- Fix badges to only reflect status on push to master branch
- Make badges link to the tests conducted.

Closes #903.
---
 README.md | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 49a7eb2..117b02a 100644
--- a/README.md
+++ b/README.md
@@ -40,10 +40,10 @@ programs over matrices, while replacing the underlying data 
model and compiler,
 supported functionalities. Until the first release, you can build your own 
snapshot via Apache Maven:
  `mvn clean package -P distribution`.
   
-![Build](https://github.com/apache/systemml/workflows/Build/badge.svg)
-![Documentation](https://github.com/apache/systemml/workflows/Documentation/badge.svg)
-![Component 
Test](https://github.com/apache/systemml/workflows/Component%20Test/badge.svg)
-![Application 
Test](https://github.com/apache/systemml/workflows/Application%20Test/badge.svg)
-![Function 
Test](https://github.com/apache/systemml/workflows/Function%20Test/badge.svg)
-![Python 
Test](https://github.com/apache/systemml/workflows/Python%20Test/badge.svg)
-![Federated Python 
Test](https://github.com/apache/systemml/workflows/Federated%20Python%20Test/badge.svg)
+[![Build](https://github.com/apache/systemml/workflows/Build/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Build%22+branch%3Amaster+event%3Apush)
+[![Documentation](https://github.com/apache/systemml/workflows/Documentation/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3ADocumentation+branch%3Amaster+event%3Apush)
+[![Component 
Test](https://github.com/apache/systemml/workflows/Component%20Test/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Component+Test%22+branch%3Amaster+event%3Apush)
+[![Application 
Test](https://github.com/apache/systemml/workflows/Application%20Test/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Application+Test%22+branch%3Amaster+event%3Apush)
+[![Function 
Test](https://github.com/apache/systemml/workflows/Function%20Test/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Function+Test%22+branch%3Amaster+event%3Apush)
+[![Python 
Test](https://github.com/apache/systemml/workflows/Python%20Test/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Python+Test%22+branch%3Amaster+event%3Apush)
+[![Federated Python 
Test](https://github.com/apache/systemml/workflows/Federated%20Python%20Test/badge.svg?branch=master&event=push)](https://github.com/apache/systemml/actions?query=workflow%3A%22Federated+Python+Test%22+branch%3Amaster+event%3Apush)



[systemml] branch master updated: [SYSTEMML-2121] AutoEncoder test for codegenalg suite

2020-04-27 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new bdf78e4  [SYSTEMML-2121] AutoEncoder test for codegenalg suite
bdf78e4 is described below

commit bdf78e462506ef8ef7fc9e6b23a6520e4155eca0
Author: Janardhan 
AuthorDate: Tue Apr 28 00:15:58 2020 +0200

[SYSTEMML-2121] AutoEncoder test for codegenalg suite

This patch adds a test case for AutoEncoder with codegen
enabled against a corresponding R script.

Closes #890.
---
 .../codegenalg/partone/AlgorithmAutoEncoder.java   |  57 -
 .../functions/codegenalg/Algorithm_AutoEncoder.R   | 239 
 .../functions/codegenalg/Algorithm_AutoEncoder.dml | 251 +
 3 files changed, 542 insertions(+), 5 deletions(-)

diff --git 
a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
 
b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
index 90d7ff8..fca850c 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
@@ -20,7 +20,9 @@
 package org.apache.sysds.test.functions.codegenalg.partone;
 
 import java.io.File;
+import java.util.HashMap;
 
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
 import org.junit.Assert;
 import org.junit.Test;
 import org.apache.sysds.api.DMLScript;
@@ -37,11 +39,12 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase
private final static String TEST_DIR = "functions/codegenalg/";
private final static String TEST_CLASS_DIR = TEST_DIR + 
AlgorithmAutoEncoder.class.getSimpleName() + "/";

-   private final static int rows = 2468;
+   private final static int rows = 1068;
private final static int cols = 784;

private final static double sparsity1 = 0.7; //dense
private final static double sparsity2 = 0.1; //sparse
+   private final static double eps   = 1e-5;

private final static int H1 = 500;
private final static int H2 = 2;
@@ -179,22 +182,66 @@ public class AlgorithmAutoEncoder extends 
AutomatedTestBase
TestConfiguration config = 
getTestConfiguration(TEST_NAME);
loadTestConfiguration(config);

-   fullDMLScriptName = 
"scripts/staging/autoencoder-2layer.dml";
+   fullDMLScriptName = SCRIPT_DIR + TEST_DIR + 
"/Algorithm_AutoEncoder.dml";
+   //"scripts/staging/autoencoder-2layer.dml";
programArgs = new String[]{ "-stats", "-nvargs", 
"X="+input("X"),
-   "H1="+H1, "H2="+H2, "EPOCH="+epochs, 
"BATCH="+batchsize, 
+   "H1="+H1, "H2="+H2, "EPOCH="+epochs, 
"BATCH="+batchsize,
+   
"W1_rand="+input("W1_rand"),"W2_rand="+input("W2_rand"),
+   "W3_rand="+input("W3_rand"), 
"W4_rand="+input("W4_rand"),
+   "order_rand="+input("order_rand"),
"W1_out="+output("W1"), "b1_out="+output("b1"),
"W2_out="+output("W2"), "b2_out="+output("b2"),
"W3_out="+output("W3"), "b3_out="+output("b3"),
"W4_out="+output("W4"), "b4_out="+output("b4")};
+
+   rCmd = getRCmd(inputDir(), String.valueOf(H1), 
String.valueOf(H2),
+   String.valueOf(epochs), 
String.valueOf(batchsize), expectedDir());
OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = 
rewrites;

//generate actual datasets
double[][] X = getRandomMatrix(rows, cols, 0, 1, 
sparse?sparsity2:sparsity1, 714);
writeInputMatrixWithMTD("X", X, true);
-   
+
+   //generate rand matrices for W1, W2, W3, W4 here itself 
for passing onto both DML and R scripts
+   double[][] W1_rand = getRandomMatrix(H1, cols, 0, 1, 
sparse?sparsity2:sparsity1, 800);
+   writeInputMatrixWithMTD("W1_rand", W1_rand, true);
+   

[systemml] branch master updated: [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication)

2020-04-26 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 608b9e5  [SYSTEMDS-316] Extended Python API (rand, lm, matrix 
multiplication)
608b9e5 is described below

commit 608b9e5bfb6c612134fde25249beca10c467160e
Author: Julia Le 
AuthorDate: Sun Apr 26 20:33:27 2020 +0200

[SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication)

Add rand(), lm and matrix multiplication to Python API
Adapt rand testcases and add exception handling to rand function
Add testcase for LM, update testcase for rand() and add rand testcase to
python.yml
Update python.yml and add simple example of lm to the documentation

AMLS project SS 2020.
Closes #892.
---
 .github/workflows/python.yml   |   4 +-
 docs/Tasks.txt |   1 +
 src/main/python/docs/source/matrix.rst |   1 +
 src/main/python/docs/source/simple_examples.rst|  39 ++
 .../python/systemds/context/systemds_context.py|  30 -
 src/main/python/systemds/matrix/matrix.py  |  31 -
 src/main/python/systemds/matrix/operation_node.py  |  19 +++
 src/main/python/systemds/utils/consts.py   |   2 +-
 src/main/python/tests/test_lm.py   |  79 
 src/main/python/tests/test_matrix_binary_op.py |   3 +
 src/main/python/tests/test_matrix_rand.py  | 140 +
 11 files changed, 345 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 84933ac..31af9d5 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -75,6 +75,8 @@ jobs:
   
 - name: Install pip Dependencies
   run: pip install numpy py4j wheel
+  run: pip install scipy
+  run: pip install sklearn
 
 - name: Build Python Package
   run: |
@@ -96,4 +98,4 @@ jobs:
 export SYSDS_QUIET=1
 cd src/main/python
 python -m unittest tests/lineage/*.py
-echo "Exit Status: " $?
\ No newline at end of file
+echo "Exit Status: " $?
diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 3a7abe7..97fa914 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -249,6 +249,7 @@ SYSTEMDS-310 Python Bindings
  * 313 Python Documentation upload via Github Actions OK
  * 314 Python SystemDS context managerOK
  * 315 Python Federated Matrices TestsOK
+ * 316 Extended Python API (rand, lm, mm) OK
 
 SYSTEMDS-320 Merge SystemDS into Apache SystemML  OK
  * 321 Merge histories of SystemDS and SystemML   OK
diff --git a/src/main/python/docs/source/matrix.rst 
b/src/main/python/docs/source/matrix.rst
index dd88c7c..e75eff4 100644
--- a/src/main/python/docs/source/matrix.rst
+++ b/src/main/python/docs/source/matrix.rst
@@ -98,3 +98,4 @@ the recommended way is to use the methods defined on 
``SystemDSContext``.
 
 .. autofunction:: systemds.matrix.seq
 
+.. autofunction:: systemds.matrix.rand
\ No newline at end of file
diff --git a/src/main/python/docs/source/simple_examples.rst 
b/src/main/python/docs/source/simple_examples.rst
index 2175fd4..e92cfed 100644
--- a/src/main/python/docs/source/simple_examples.rst
+++ b/src/main/python/docs/source/simple_examples.rst
@@ -122,3 +122,42 @@ The output should be similar to::
[-0.0011352 ]
[-0.01686351]
[-0.03839821]]
+
+SystemDS includes a built-in function lm, which solves linear regression. The 
lm function takes as input a matrix of
+feature vectors and a vector of response values y. The output of the function 
is a vector of weights.
+
+.. code-block:: python
+
+  # Import numpy and SystemDS matrix
+  import numpy as np
+  from systemds.context import SystemDSContext
+
+  # Set a seed
+  np.random.seed(0)
+  # Generate matrix of feature vectors
+  features = np.random.rand(10, 15)
+  # Generate a 1-column matrix of response values
+  y = np.random.rand(10, 1)
+
+  # compute the weights
+  with SystemDSContext() as sds:
+weights = sds.matrix(features).lm(sds.matrix(y)).compute()
+print(weights)
+
+The output should be similar to::
+
+  [[-0.11538199]
+  [-0.20386541]
+  [-0.39956035]
+  [ 1.04078623]
+  [ 0.4327084 ]
+  [ 0.18954599]
+  [ 0.49858968]
+  [-0.26812763]
+  [ 0.09961844]
+  [-0.57000751]
+  [-0.43386048]
+  [ 0.55358873]
+  [-0.54638565]
+  [ 0.2205885 ]
+  [ 0.37957689]]
diff --git a/src/main/python/systemds/context/systemds_context.py 
b/src/main/python/systemds/context/systemds_context.py
index d5bdeb8..01f31a6 100644
--- a/src/main/python/systemds/context/systemds_context.py
+++ b/src/main/python/systemds/context/systemds_context.py
@@ -30,7 +30,7 @@ import numpy as np
 from py4j

[systemml] branch master updated: [MINOR] Cleanup codegen algorithm tests (config setup redundancy)

2020-04-25 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 592c44b  [MINOR] Cleanup codegen algorithm tests (config setup 
redundancy)
592c44b is described below

commit 592c44b7325e36b45e51f8510d352c05a0156b42
Author: Matthias Boehm 
AuthorDate: Sat Apr 25 23:07:46 2020 +0200

[MINOR] Cleanup codegen algorithm tests (config setup redundancy)
---
 .../org/apache/sysds/test/AutomatedTestBase.java   |  24 
 .../codegenalg/partone/AlgorithmAutoEncoder.java   |  68 -
 .../codegenalg/partone/AlgorithmKMeans.java| 124 +++-
 .../codegenalg/partone/AlgorithmL2SVM.java |  60 +++-
 .../codegenalg/partone/AlgorithmLinregCG.java  | 123 +++-
 .../codegenalg/partone/AlgorithmMDABivar.java  |  33 +
 .../codegenalg/partone/AlgorithmMLogreg.java   | 156 +
 .../codegenalg/partone/AlgorithmMSVM.java  |  72 --
 .../functions/codegenalg/partone/AlgorithmPCA.java |  58 +++-
 .../codegenalg/parttwo/AlgorithmARIMA.java |  32 +
 .../codegenalg/parttwo/AlgorithmDatagen.java   |  92 +---
 .../functions/codegenalg/parttwo/AlgorithmGLM.java | 112 ++-
 .../codegenalg/parttwo/AlgorithmPNMF.java  |  40 ++
 .../codegenalg/parttwo/AlgorithmPageRank.java  |  51 +++
 .../parttwo/AlgorithmStepwiseRegression.java   |  60 +++-
 15 files changed, 427 insertions(+), 678 deletions(-)

diff --git a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java 
b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
index 5217d0c..6d1b396 100644
--- a/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysds/test/AutomatedTestBase.java
@@ -116,6 +116,23 @@ public abstract class AutomatedTestBase {
 */
private static final File CONFIG_TEMPLATE_FILE = new File(CONFIG_DIR, 
"SystemDS-config.xml");
 
+   protected enum CodegenTestType {
+   DEFAULT, FUSE_ALL, FUSE_NO_REDUNDANCY;
+   
+   public String getCodgenConfig() {
+   switch(this) {
+   case DEFAULT:
+   return "SystemDS-config-codegen.xml";
+   case FUSE_ALL:
+   return 
"SystemDS-config-codegen-fuse-all.xml";
+   case FUSE_NO_REDUNDANCY:
+   return 
"SystemDS-config-codegen-fuse-no-redundancy.xml";
+   default: 
+   throw new RuntimeException("Unsupported 
codegen test config: "+this.name());
+   }
+   }
+   }
+   
/**
 * Location under which we create local temporary directories for test 
cases. To adjust where testTemp is located,
 * use -Dsystemds.testTemp.root.dir=. This is necessary 
if any parent directories are
@@ -289,6 +306,13 @@ public abstract class AutomatedTestBase {
return CONFIG_TEMPLATE_FILE;
}
 
+   protected File getCodegenConfigFile(String parent, CodegenTestType 
type) {
+   // Instrumentation in this test's output log to show custom 
configuration file used for template.
+   File tmp = new File(parent, type.getCodgenConfig());
+   System.out.println("This test case overrides default 
configuration with " + tmp.getPath());
+   return tmp;
+   }
+   
protected ExecMode setExecMode(ExecType instType) {
switch(instType) {
case SPARK: return setExecMode(ExecMode.SPARK);
diff --git 
a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
 
b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
index 6fef59a..90d7ff8 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
@@ -36,15 +36,6 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase
private final static String TEST_NAME1 = "Algorithm_AutoEncoder";
private final static String TEST_DIR = "functions/codegenalg/";
private final static String TEST_CLASS_DIR = TEST_DIR + 
AlgorithmAutoEncoder.class.getSimpleName() + "/";
-   private final static String TEST_CONF_DEFAULT = 
"SystemDS-config-codegen.xml";
-   private final static File TEST_CONF_FILE_DEFAULT = new File(SCRIPT_DIR 
+ TEST_DIR, TEST_CONF_DEFAULT);
-   private final static St

[systemml] branch master updated: [SYSTEMML-2121] PCA test for codegenalg suite

2020-04-25 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 955365c  [SYSTEMML-2121] PCA test for codegenalg suite
955365c is described below

commit 955365c5da1a916541d734a4e9494ab61c932503
Author: Janardhan Pulivarthi 
AuthorDate: Sat Apr 25 22:15:06 2020 +0200

[SYSTEMML-2121] PCA test for codegenalg suite

This patch adds a test case for algorithm test with codegen
  enabled against an R script.

  The test matrix is as follows:

  | Rewrite | Sparse | FuseAll | FuseNoRedundancy |
  | ------- | ------ | ------- | ---------------- |
  |    1    |   0    |    0    |        0         |
  |    1    |   1    |    0    |        0         |
  |    0    |   0    |    0    |        0         |
  |    0    |   1    |    0    |        0         |
  |    0    |   0    |    1    |        0         |
  |    0    |   1    |    1    |        0         |
  |    0    |   0    |    0    |        1         |
  |    0    |   1    |    0    |        1         |
  (each configuration is run in both Spark and CP execution modes)

Closes #889.
---
 scripts/algorithms/PCA.dml |  14 +-
 .../functions/codegenalg/partone/AlgorithmPCA.java | 213 +
 .../scripts/functions/codegenalg/Algorithm_PCA.R   |  87 +
 3 files changed, 301 insertions(+), 13 deletions(-)

diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml
index d165351..ea7afd7 100644
--- a/scripts/algorithms/PCA.dml
+++ b/scripts/algorithms/PCA.dml
@@ -62,19 +62,7 @@ if (model != "") {
D = ncol(A);
 
# perform z-scoring (centering and scaling)
-   if (center == 1) {
-   cm = colMeans(A);
-   A = A - cm;
-   }
-   if (scale == 1) {
-   cvars = (colSums (A^2));
-   if (center == 1){
-   cm = colMeans(A);
-   cvars = (cvars - N*(cm^2))/(N-1);   
-   }
-   Azscored = (A)/sqrt(cvars);
-A = Azscored;
-   }   
+   A = scale(A, center==1, scale==1);
 
# co-variance matrix 
mu = colSums(A)/N;
diff --git 
a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
 
b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
new file mode 100644
index 000..e0a1906
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.codegenalg.partone;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.junit.Test;
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+
+public class AlgorithmPCA extends AutomatedTestBase
+{
+   private final static String TEST_NAME1 = "Algorithm_PCA";
+   private final static String TEST_DIR = "functions/codegenalg/";
+   private final static String TEST_CLASS_DIR = TEST_DIR + 
AlgorithmPCA.class.getSimpleName() + "/";
+   private final static String TEST_CONF_DEFAULT = 
"SystemDS-config-codegen.xml";
+   private final static File TEST_CONF_FILE_DEFAULT = new File(SCRIPT_DIR 
+ TEST_DIR, TEST_CONF_DEFAULT);
+   private final static String TEST_CONF_FUSE_ALL = 
"SystemDS-config-codegen-fuse-all.xml";
+   private final static File TEST_CONF_FILE_FUSE_ALL = new File(SCRIPT_DIR 
+ TEST_DIR, TEST_CONF_FUSE_ALL);
+   private final static String TEST_CONF_FUSE_NO_REDUNDANCY = 
"SystemDS-config-codegen-fuse-no-redundancy.xml";

[systemml] branch master updated: [MINOR] Script-level improvements mice builtin function

2020-04-25 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new f450ead  [MINOR] Script-level improvements mice builtin function
f450ead is described below

commit f450ead5506d1615b5979bee85b39891e0f0fc00
Author: Matthias Boehm 
AuthorDate: Sat Apr 25 19:40:58 2020 +0200

[MINOR] Script-level improvements mice builtin function

* Loop vectorization of scalar assignment
* Removed unnecessary branch for table padding
* Minor modifications of rmEmpty use to increase common subexpression
elimination
---
 scripts/builtin/mice.dml | 44 +++-
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 3f3c325..99d2be2 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -56,12 +56,12 @@ return(Frame[String] dataset, Frame[String] singleSet)
   col = ncol(F) 
   Result = matrix(0, rows=1, cols = col)
   Mask_Result = matrix(0, rows=1, cols=col)
-  cat = t(cMask) * seq(1, ncol(cMask))
-  cat = removeEmpty(target = cat, margin = "rows")
+  scat = seq(1, ncol(cMask))
+  cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
   s=""
   for(i in 1: nrow(cat), check =0)
-s = s+as.integer(as.scalar(cat[i, 1]))+",";  
-  
+s = s+as.integer(as.scalar(cat[i, 1]))+",";
+  
   
   # encoding categorical columns using recode transformation
   jspecR = "{ids:true, recode:["+s+"]}";
@@ -70,7 +70,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   XO = replace(target=X, pattern=NaN, replacement=0);
 
   # remove categorical features and impute continous features with mean
-  eX_n = removeEmpty(target=X, margin="cols", select=(1-cMask))
+  eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0))
   col_n = ncol(eX_n);
   # storing the mask/address of missing values
   Mask_n = is.na(eX_n);
@@ -80,7 +80,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   # filling the missing data with their means
   X2_n = eX_n+(Mask_n*colMeans(eX_n))
   # matrices for computing actul data
-  p_n = table( (seq(1, ncol(eX_n))) , (removeEmpty(target = t(cMask==0)*seq(1, 
ncol(cMask)), margin ="rows")) ,  1 )
+  p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", 
select=t(cMask==0)))
   if(ncol(p_n) < ncol(cMask))
 p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n)))
   q = XO * cMask
@@ -91,8 +91,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c 
!= 0)==col_c))
   colMod = matrix(0, 1, ncol(eX_c))
   # compute columnwise mode
-  parfor(i in 1: col_c)
-  {
+  parfor(i in 1: col_c) {
 f = eX_c2[, i] # adding one in data for dealing with zero category
 cat_counts = table(f, 1, n, 1);  # counts for each category
 mode = as.scalar(rowIndexMax(t(cat_counts)));
@@ -100,13 +99,10 @@ return(Frame[String] dataset, Frame[String] singleSet)
   }
   
   # find the mask of missing values 
-  tmpMask_c = (eX_c == 0);
-  tmpMask_c = (tmpMask_c * colMod) # fill missing values with mode
+  tmpMask_c = (eX_c==0) * colMod # fill missing values with mode
   
   # Generate a matrix of actual length
-  p_c = table((seq(1, ncol(tmpMask_c))) , (removeEmpty(target = 
t(cMask)*seq(1, ncol(cMask)), margin ="rows")), 1)
-  if(ncol(p_c) < ncol(cMask))
-p_c = cbind(p_c, matrix(0, nrow(p_c), ncol(cMask)-ncol(p_c)))
+  p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin 
="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask))
 
   Mask_c = tmpMask_c %*% p_c 
   inverseMask_c = Mask_c == 0
@@ -131,14 +127,13 @@ return(Frame[String] dataset, Frame[String] singleSet)
   dXMask = matrix(0, 1, ncol(dX))
   index = 1
   for(k in 1:col) {
-if(as.scalar(dcDistincts[1,k]) != 0) {
-  for(l in 1:as.scalar(dcDistincts[1,k])){
-dXMask[1,index] = 1
-index = index +1
-  }
+nDistk = as.scalar(dcDistincts[1,k]);
+if(nDistk != 0) {
+  dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk)
+  index += nDistk;
 }
 else
-  index = index +1
+  index += 1
   }
   
   #multiple imputations
@@ -149,7 +144,6 @@ return(Frame[String] dataset, Frame[String] singleSet)
 in_n = 1; in_c = 1; i=1; j=1; # varibales for index selection
 while(i <= ncol(dX))
 {
-  
   if(as.scalar(dXMask[1,i]) == 0)
   {
 # construct column selector
@@ -175,7 +169,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   }
  
   if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0))
-  {  
+  {
 j = (i + as.scalar(dist[1,in_

[systemml] branch master updated: [SYSTEMDS-208] Fix buffer pool leak and cleanup robustness

2020-04-25 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new beb4840  [SYSTEMDS-208] Fix buffer pool leak and cleanup robustness
beb4840 is described below

commit beb4840439ce6ca027470ce0cf3d2c903c1fa40d
Author: Matthias Boehm 
AuthorDate: Sat Apr 25 16:49:45 2020 +0200

[SYSTEMDS-208] Fix buffer pool leak and cleanup robustness

This patch fixes a buffer pool eviction leak, where each call to mice
added 3 uncleaned objects to the buffer pool and thus eventually ran
into severe eviction (up to 'no space left on device').

A closer investigation revealed missing rmVar instructions in complex
control flow programs. Specifically, we now reintroduced the notion of
exit instructions for while/for/parfor/if and derive and add a packed
rmVar instruction if necessary (based on livein and liveout sets).

To make the mentioned exit instructions more effective, this patch also
introduces a best-effort cleanup of liveout variable sets, which are too
conservative for nested control flow. However, this cleanup is only done
where it is guaranteed to be safe, i.e., the top-level of statement
blocks at the main program and individual functions.

Finally, the memory leak was due to createvar instructions overwriting
existing objects in the symbol table without proper cleanup. This is a
consequence of missing rmvar instructions, but in order to guard against
all cases, we now check this condition and perform a proper cleanup
which guards against such unknown leaks.
---
 docs/Tasks.txt |  1 +
 .../org/apache/sysds/parser/DMLTranslator.java | 41 +--
 .../runtime/controlprogram/BasicProgramBlock.java  |  4 +-
 .../runtime/controlprogram/ForProgramBlock.java|  3 ++
 .../runtime/controlprogram/IfProgramBlock.java | 27 +++---
 .../runtime/controlprogram/ParForProgramBlock.java |  3 ++
 .../sysds/runtime/controlprogram/ProgramBlock.java | 27 +-
 .../runtime/controlprogram/WhileProgramBlock.java  |  3 ++
 .../controlprogram/caching/LazyWriteBuffer.java|  7 ++-
 .../instructions/cp/VariableCPInstruction.java |  4 ++
 .../sysds/runtime/util/ProgramConverter.java   |  5 ++
 src/main/java/org/apache/sysds/utils/Explain.java  | 33 +++-
 .../test/functions/caching/BufferpoolLeakTest.java | 60 ++
 .../scripts/functions/caching/BufferpoolLeak.dml   | 28 ++
 14 files changed, 204 insertions(+), 42 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index d1e30c0..e63544c 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -166,6 +166,7 @@ SYSTEMDS-200 Various Fixes
  * 205 Fix scoping of builtin dml-bodied functions (vs user-defined)
  * 206 Fix codegen outer template compilation (tsmm)  OK
  * 207 Fix builtin function call hoisting from expressionsOK
+ * 208 Fix bufferpool leak (live var analysis and createvar)  OK
 
 SYSTEMDS-210 Extended lists Operations
  * 211 Cbind and Rbind over lists of matrices OK
diff --git a/src/main/java/org/apache/sysds/parser/DMLTranslator.java 
b/src/main/java/org/apache/sysds/parser/DMLTranslator.java
index f1f64c1..789ea9d 100644
--- a/src/main/java/org/apache/sysds/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysds/parser/DMLTranslator.java
@@ -25,6 +25,7 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Set;
 import java.util.stream.Collectors;
 
 import org.apache.commons.logging.Log;
@@ -88,6 +89,7 @@ import org.apache.sysds.runtime.controlprogram.Program;
 import org.apache.sysds.runtime.controlprogram.ProgramBlock;
 import org.apache.sysds.runtime.controlprogram.WhileProgramBlock;
 import org.apache.sysds.runtime.instructions.Instruction;
+import org.apache.sysds.runtime.instructions.cp.VariableCPInstruction;
 
 
 public class DMLTranslator 
@@ -200,6 +202,8 @@ public class DMLTranslator
currentLiveOut = sb.analyze(currentLiveOut);
}
}
+   
+   cleanupLiveOutVariables(dmlp.getStatementBlocks(), new 
VariableSet());
}

public void liveVariableAnalysisFunction(DMLProgram dmlp, 
FunctionStatementBlock fsb) {
@@ -218,15 +222,32 @@ public class DMLTranslator
//STEP 2: backward direction
VariableSet currentLiveOut = new VariableSet();
VariableSet currentLiveIn = new VariableSet();
+   VariableSet unionLiveIn = new VariableSet();

for (DataIdentifier id : fstmt.getInputParams())
currentLiveIn.addVariable(id.getName(), id

[systemml] branch master updated: [SYSTEMDS-361] New privacy constraint meta data (compiler/runtime)

2020-04-24 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 013ca82  [SYSTEMDS-361] New privacy constraint meta data 
(compiler/runtime)
013ca82 is described below

commit 013ca8224c23b1d9f63e254162a56fb78bf74c96
Author: sebwrede 
AuthorDate: Fri Apr 24 20:00:13 2020 +0200

[SYSTEMDS-361] New privacy constraint meta data (compiler/runtime)

Closes #895.
---
 docs/Tasks.txt |   7 +
 .../java/org/apache/sysds/hops/AggBinaryOp.java|   3 +-
 src/main/java/org/apache/sysds/hops/DataOp.java|   1 +
 src/main/java/org/apache/sysds/hops/Hop.java   |  21 ++-
 src/main/java/org/apache/sysds/hops/LiteralOp.java |   1 +
 src/main/java/org/apache/sysds/lops/DataGen.java   |   4 +-
 src/main/java/org/apache/sysds/lops/Lop.java   |  18 +++
 .../java/org/apache/sysds/lops/compile/Dag.java|  34 +++-
 .../org/apache/sysds/parser/BinaryExpression.java  |  26 ++--
 .../org/apache/sysds/parser/DMLTranslator.java |   3 +
 .../org/apache/sysds/parser/DataExpression.java| 135 +++-
 .../java/org/apache/sysds/parser/Identifier.java   |  15 ++
 .../controlprogram/caching/CacheableData.java  |  16 +-
 .../sysds/runtime/instructions/Instruction.java|  12 ++
 .../instructions/cp/VariableCPInstruction.java |   2 +
 .../org/apache/sysds/runtime/io/MatrixReader.java  |   4 +-
 .../sysds/runtime/privacy/PrivacyConstraint.java   |  42 +
 .../sysds/runtime/privacy/PrivacyPropagator.java   |  38 +
 .../org/apache/sysds/runtime/util/HDFSTool.java|  38 -
 .../org/apache/sysds/test/AutomatedTestBase.java   |  43 +-
 src/test/java/org/apache/sysds/test/TestUtils.java | 129 ++--
 .../test/functions/data/misc/WriteMMTest.java  |   2 +-
 .../MatrixMultiplicationPropagationTest.java   | 171 +
 .../MatrixMultiplicationPropagationTest.dml|  27 
 24 files changed, 591 insertions(+), 201 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 2283d57..d1e30c0 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -260,5 +260,12 @@ SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
 SYSTEMDS-350 Data Cleaning Framework
  * 351 New builtin function for error correction by schemaOK
 
+SYSTEMDS-360 Privacy/Data Exchange Constraints
+ * 361 Initial privacy meta data (compiler/runtime)   OK
+ * 362 Runtime privacy propagation
+ * 363 Compile-time privacy propagation
+ * 364 Error handling violated privacy constraints
+ * 365 Extended privacy/data exchange constraints
+
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/src/main/java/org/apache/sysds/hops/AggBinaryOp.java 
b/src/main/java/org/apache/sysds/hops/AggBinaryOp.java
index b456cc8..a04d267 100644
--- a/src/main/java/org/apache/sysds/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/AggBinaryOp.java
@@ -627,7 +627,8 @@ public class AggBinaryOp extends MultiThreadedHop
setOutputDimensions(matmultCP);
}

-   setLineNumbers( matmultCP );
+   setLineNumbers(matmultCP);
+   setPrivacy(matmultCP);
setLops(matmultCP);
}
 
diff --git a/src/main/java/org/apache/sysds/hops/DataOp.java 
b/src/main/java/org/apache/sysds/hops/DataOp.java
index 7a22727..99cf91e 100644
--- a/src/main/java/org/apache/sysds/hops/DataOp.java
+++ b/src/main/java/org/apache/sysds/hops/DataOp.java
@@ -311,6 +311,7 @@ public class DataOp extends Hop
}

setLineNumbers(l);
+   setPrivacy(l);
setLops(l);

//add reblock/checkpoint lops if necessary
diff --git a/src/main/java/org/apache/sysds/hops/Hop.java 
b/src/main/java/org/apache/sysds/hops/Hop.java
index ba0dd03..79a251f 100644
--- a/src/main/java/org/apache/sysds/hops/Hop.java
+++ b/src/main/java/org/apache/sysds/hops/Hop.java
@@ -50,6 +50,7 @@ import 
org.apache.sysds.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.meta.DataCharacteristics;
 import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.runtime.privacy.PrivacyConstraint;
 import org.apache.sysds.runtime.util.UtilFunctions;
 
 import java.util.ArrayList;
@@ -72,6 +73,7 @@ public abstract class Hop implements ParseInfo
protected ValueType _valueType;
protected boolean _visited = false;
protected DataCharacteristics _dc = new MatrixCharacteristics();
+   protected PrivacyConstraint _privacyConstraint = new 
PrivacyConstraint();
protected UpdateType _updateType = UpdateType.COPY;
 
protected ArrayList _parent = new

[systemml] branch master updated: [MINOR] Fix Python lineage tracing tests

2020-04-24 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new bd537bb  [MINOR] Fix Python lineage tracing tests
bd537bb is described below

commit bd537bb39130484318216a91d27e8243f6166c54
Author: Sebastian 
AuthorDate: Fri Apr 24 19:40:09 2020 +0200

[MINOR] Fix Python lineage tracing tests

Changed the way lineage trace tests are executed, such that instead of
comparing against stored files, the Python trace is compared to a trace
generated by SystemDS directly.

Motivated by the fact that the previous tests were failing, because of
inconsistencies between new traces and old.

Furthermore this commit contains:

- A Badge for Federated Python tests
- A update to the Automated tests of Federated Python for new build
  instruction

Closes #896.
---
 .github/workflows/federatedPython.yml  |2 +-
 .github/workflows/python.yml   |9 +
 README.md  |1 +
 src/main/python/tests/lineage/README.md|   40 +
 src/main/python/tests/lineage/test_lineagetrace.py |  103 +
 src/main/python/tests/lt.txt   |1 -
 src/main/python/tests/lt2.txt  |4 -
 src/main/python/tests/lt_l2svm.txt | 2035 
 src/main/python/tests/test_l2svm_lineage.py|   88 -
 src/main/python/tests/test_lineagetrace.py |   75 -
 10 files changed, 154 insertions(+), 2204 deletions(-)

diff --git a/.github/workflows/federatedPython.yml 
b/.github/workflows/federatedPython.yml
index 9ec7b20..c07cde9 100644
--- a/.github/workflows/federatedPython.yml
+++ b/.github/workflows/federatedPython.yml
@@ -51,7 +51,7 @@ jobs:
   ${{ runner.os }}-maven-
   
 - name: Maven clean & package
-  run: mvn clean package
+  run: mvn clean package -P distribution
 
 - name: Setup Python
   uses: actions/setup-python@v1
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index cc3e1cb..84933ac 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -88,3 +88,12 @@ jobs:
 echo "Beginning tests"
 python -m unittest tests/*.py
 echo "Exit Status: " $?
+
+- name: Run all lineage python tests
+  run: |
+export SYSTEMDS_ROOT=$(pwd)
+export PATH=$SYSTEMDS_ROOT/bin:$PATH
+export SYSDS_QUIET=1
+cd src/main/python
+python -m unittest tests/lineage/*.py
+echo "Exit Status: " $?
\ No newline at end of file
diff --git a/README.md b/README.md
index ce1d574..86803e8 100644
--- a/README.md
+++ b/README.md
@@ -36,3 +36,4 @@ limitations under the License.
 ![Application 
Test](https://github.com/apache/systemml/workflows/Application%20Test/badge.svg)
 ![Function 
Test](https://github.com/apache/systemml/workflows/Function%20Test/badge.svg)
 ![Python 
Test](https://github.com/apache/systemml/workflows/Python%20Test/badge.svg)
+![Federated Python 
Test](https://github.com/apache/systemml/workflows/Federated%20Python%20Test/badge.svg)
diff --git a/src/main/python/tests/lineage/README.md 
b/src/main/python/tests/lineage/README.md
new file mode 100644
index 000..eb2eb4c
--- /dev/null
+++ b/src/main/python/tests/lineage/README.md
@@ -0,0 +1,40 @@
+
+
+# Python Lineage Tests
+
+To enable testing the lineage you have to setup your path environment.
+
+## Linux/bash
+
+From the root of the repository call:
+
+```bash
+# Do once in terminal
+export SYSTEMDS_ROOT=$(pwd)
+export PATH=$SYSTEMDS_ROOT/bin:$PATH
+export SYSDS_QUIET=1
+```
+
+Once the environment is setup, you can begin testing with the following:
+
+```bash
+cd src/main/python/
+python tests/lineage/*.py
+```
diff --git a/src/main/python/tests/lineage/test_lineagetrace.py 
b/src/main/python/tests/lineage/test_lineagetrace.py
new file mode 100644
index 000..e462c48
--- /dev/null
+++ b/src/main/python/tests/lineage/test_lineagetrace.py
@@ -0,0 +1,103 @@
+# -
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  

[systemml] branch master updated: [SYSTEMDS-333, 337] Improved lineage cache eviction

2020-04-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 12f69c7  [SYSTEMDS-333,337] Improved lineage cache eviction
12f69c7 is described below

commit 12f69c7c111cbe5e0ccc35d8bac58674b06480af
Author: arnabp 
AuthorDate: Thu Apr 23 22:12:25 2020 +0200

[SYSTEMDS-333,337] Improved lineage cache eviction

This patch improves lineage cache eviction by taking into account actual
execution time of instructions/functions. The ordering policy is still
LRU. Future commits will bring a better approach to estimating spilling
time and new eviction policies.

Closes #891.
---
 docs/Tasks.txt |   6 +-
 .../runtime/controlprogram/BasicProgramBlock.java  |   8 +-
 .../sysds/runtime/controlprogram/ProgramBlock.java |   4 +-
 .../instructions/cp/FunctionCallCPInstruction.java |   7 +-
 .../apache/sysds/runtime/lineage/LineageCache.java | 295 ++---
 .../sysds/runtime/lineage/LineageCacheConfig.java  |  23 +-
 .../runtime/lineage/LineageCacheStatistics.java|  10 +
 .../sysds/runtime/lineage/LineageRewriteReuse.java |   9 +-
 .../java/org/apache/sysds/utils/Statistics.java|   2 +-
 .../functions/lineage/.FunctionFullReuse5.dml.swp  | Bin 0 -> 4096 bytes
 10 files changed, 258 insertions(+), 106 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 6e6118c..2283d57 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -244,7 +244,11 @@ SYSTEMDS-320 Merge SystemDS into Apache SystemML   
   OK
 SYSTEMDS-330 Lineage Tracing, Reuse and Integration
  * 331 Cache and reuse scalar outputs (instruction and multi-level)   OK
  * 332 Parfor integration with multi-level reuse  OK
- * 333 Use exact execution time for cost based eviction
+ * 333 Improve cache eviction with actual compute timeOK
+ * 334 Cache scalars only with atleast one matrix inputs
+ * 335 Weighted eviction policy (function of size & computetime)
+ * 336 Better use of cache status to handle multithreading
+ * 337 Adjust disk I/O speed by recording actual time taken   OK
  
 SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
  * 341 Finalize unmarking of loop dependent operations
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
index 5f44ac3..4590f0e 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
@@ -108,14 +108,17 @@ public class BasicProgramBlock extends ProgramBlock

//statement-block-level, lineage-based reuse
LineageItem[] liInputs = null;
+   long t0 = 0;
if (_sb != null && LineageCacheConfig.isMultiLevelReuse()) {
liInputs = 
LineageItemUtils.getLineageItemInputstoSB(_sb.getInputstoSB(), ec);
List outNames = _sb.getOutputNamesofSB();
-   if( LineageCache.reuse(outNames, _sb.getOutputsofSB(), 
outNames.size(), liInputs, _sb.getName(), ec) ) {
+   if(liInputs != null && LineageCache.reuse(outNames, 
_sb.getOutputsofSB(), 
+   outNames.size(), liInputs, 
_sb.getName(), ec) ) {
if( DMLScript.STATISTICS )

LineageCacheStatistics.incrementSBHits();
return;
}
+   t0 = System.nanoTime();
}
 
//actual instruction execution
@@ -123,6 +126,7 @@ public class BasicProgramBlock extends ProgramBlock

//statement-block-level, lineage-based caching
if (_sb != null && liInputs != null)
-   LineageCache.putValue(_sb.getOutputsofSB(), liInputs, 
_sb.getName(), ec);
+   LineageCache.putValue(_sb.getOutputsofSB(), liInputs, 
_sb.getName(), 
+   ec, System.nanoTime()-t0);
}
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java
index 5cde84e..8859d39 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/ProgramBlock.java
@@ -43,6 +43,7 @@ import org.apache.sysds.runtime.instructions.cp.IntObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.instructions.cp.StringObject;
 import org.apache.sys

[systemml] branch master updated: [SYSTEMDS-315] Python Federated Matrices (test, docs, scripts)

2020-04-23 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 0fe05a9  [SYSTEMDS-315] Python Federated Matrices (test, docs, scripts)
0fe05a9 is described below

commit 0fe05a97840238c0130e58c1e4ec19b9195bb1a9
Author: Sebastian 
AuthorDate: Thu Apr 23 21:59:44 2020 +0200

[SYSTEMDS-315] Python Federated Matrices (test, docs, scripts)

- Easy start of federated worker in /bin/systemds.sh
- Setup of tests for the Python language bindings federated matrices
  - Tests of the basic federated operations
  - Out commented "advanced" functionality that is for later.
- Initial tutorial on Python federated matrices
- Minor :bug: fix in federated matrix, not allowing multiple sources
- Github workflow action for automated federated tests

Closes #871.
---
 .github/workflows/federatedPython.yml  |  85 +++
 README.md  |   8 +-
 bin/README.md  |  81 --
 bin/systemds.sh|  40 ++-
 docker/build.sh|   3 +
 docker/{build.sh => pythonsysds.Dockerfile}|  14 +-
 docker/sysds.Dockerfile|  13 +-
 docker/testsysds.Dockerfile|  10 +-
 docs/README.md |  25 +-
 docs/Tasks.txt |   1 +
 src/assembly/bin/README.md |   6 +-
 src/main/python/docs/source/federated.rst  | 126 +
 src/main/python/docs/source/index.rst  |   7 +
 src/main/python/systemds/matrix/matrix.py  |   3 +-
 src/main/python/tests/federated/runFedTest.sh  |  67 +
 .../tests/federated/test_federated_aggregations.py | 236 +
 .../python/tests/federated/test_federated_basic.py | 281 +
 17 files changed, 940 insertions(+), 66 deletions(-)

diff --git a/.github/workflows/federatedPython.yml 
b/.github/workflows/federatedPython.yml
new file mode 100644
index 000..9ec7b20
--- /dev/null
+++ b/.github/workflows/federatedPython.yml
@@ -0,0 +1,85 @@
+#-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-
+
+name: Federated Python Test
+
+on: [push, pull_request]
+
+jobs:
+  applicationsTests:
+runs-on: ${{ matrix.os }}
+strategy:
+  fail-fast: false
+  matrix:
+python-version: [3.6]
+os: [ubuntu-latest]
+java: [ 1.8 ]
+name:  Python Test
+steps:
+- name: Checkout Repository
+  uses: actions/checkout@v2
+
+- name: Setup Java
+  uses: actions/setup-java@v1
+  with:
+java-version: ${{ matrix.java }}
+
+- name: Cache Maven Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.m2/repository
+key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+restore-keys: |
+  ${{ runner.os }}-maven-
+  
+- name: Maven clean & package
+  run: mvn clean package
+
+- name: Setup Python
+  uses: actions/setup-python@v1
+  with:
+python-version: ${{ matrix.python-version }}
+architecture: 'x64'
+
+- name: Cache Pip Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.cache/pip
+key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ 
hashFiles('src/main/python/setup.py') }}
+restore-keys: |
+  ${{ runner.os }}-pip-${{ matrix.python-version }}-
+  
+- name: Install pip Dependencies
+  run: pip install numpy py4j wheel
+
+- name: Build Python Package
+  run: |
+cd src/main/python
+python create_python_dist.py
+
+- name: Run Federated Python Tests
+  run: |
+export SYSTEMDS_ROOT=$(pwd)
+export PATH=$SYSTEMDS_ROOT/bin:$PATH
+cd src/main/python
+./tests/federated/runFedTest.sh 
tests/fed

[systemml] branch master updated (9bd68ff -> cf74661)

2020-04-21 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git.


from 9bd68ff  [SYSTEMDS-233] Fix multi-level lineage caching (parfor, 
determinism)
 new 0426099  [MINOR] Removal of remaining pydml test files and tests
 new cf74661  [SYSTEMDS-207] Fix dml-builtin-function hoisting from 
expressions

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/Tasks.txt |  1 +
 .../java/org/apache/sysds/parser/DMLProgram.java   |  4 +-
 .../org/apache/sysds/parser/StatementBlock.java| 59 ++
 .../functions/misc/FunctionInExpressionTest.java   |  7 +++
 .../test/functions/mlcontext/MLContextTest.java| 59 --
 .../scripts/functions/misc/FunInExpression7.dml|  7 ++-
 .../scripts/functions/misc/PackageFunCall1.pydml   | 25 -
 .../scripts/functions/misc/PackageFunCall2.pydml   | 26 --
 .../scripts/functions/misc/PackageFunLib.pydml | 25 -
 9 files changed, 52 insertions(+), 161 deletions(-)
 copy scripts/nn/test/compare_backends/gen_softmax.dml => 
src/test/scripts/functions/misc/FunInExpression7.dml (86%)
 delete mode 100644 src/test/scripts/functions/misc/PackageFunCall1.pydml
 delete mode 100644 src/test/scripts/functions/misc/PackageFunCall2.pydml
 delete mode 100644 src/test/scripts/functions/misc/PackageFunLib.pydml



[systemml] 02/02: [SYSTEMDS-207] Fix dml-builtin-function hoisting from expressions

2020-04-21 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit cf74661016928e3413d693f939a67964f3256b19
Author: Matthias Boehm 
AuthorDate: Tue Apr 21 22:00:46 2020 +0200

[SYSTEMDS-207] Fix dml-builtin-function hoisting from expressions

Function calls to dml-bodied functions need to bind their outputs to
logical variable names and hence require a cut of the basic block for
correctness. To still allow such functions in expressions (which is very
common), we perform function call hoisting from expressions during
parsing in order to be able to cut after entire statements. This
automatically applied to the new dml-bodied builtin functions too, but
because these functions are loaded lazily, we ran into null pointer
exceptions during validation (thanks Arnab for catching this).

This fix extends the function hoisting by probing for dml-bodied builtin
functions and lazily loading, parsing, and adding the required functions
if needed. By reusing the recently added mechanics from lazy function
loading in eval functions, we keep the number of alternative entry
points very small.
---
 docs/Tasks.txt |  1 +
 .../java/org/apache/sysds/parser/DMLProgram.java   |  4 +-
 .../org/apache/sysds/parser/StatementBlock.java| 59 ++
 .../functions/misc/FunctionInExpressionTest.java   |  7 +++
 .../scripts/functions/misc/FunInExpression7.dml| 26 ++
 5 files changed, 73 insertions(+), 24 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 5ae71b1..7a61c05 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -165,6 +165,7 @@ SYSTEMDS-200 Various Fixes
  * 204 Fix rewrite simplify sequences of binary comparisons   OK
  * 205 Fix scoping of builtin dml-bodied functions (vs user-defined)
  * 206 Fix codegen outer template compilation (tsmm)  OK
+ * 207 Fix builtin function call hoisting from expressionsOK
 
 SYSTEMDS-210 Extended lists Operations
  * 211 Cbind and Rbind over lists of matrices OK
diff --git a/src/main/java/org/apache/sysds/parser/DMLProgram.java 
b/src/main/java/org/apache/sysds/parser/DMLProgram.java
index 4e5e229..2487aec 100644
--- a/src/main/java/org/apache/sysds/parser/DMLProgram.java
+++ b/src/main/java/org/apache/sysds/parser/DMLProgram.java
@@ -166,11 +166,11 @@ public class DMLProgram
try {
//handle statement blocks of all functions
for( FunctionStatementBlock fsb : 
getFunctionStatementBlocks() )
-   
StatementBlock.rHoistFunctionCallsFromExpressions(fsb);
+   
StatementBlock.rHoistFunctionCallsFromExpressions(fsb, this);
//handle statement blocks of main program
ArrayList tmp = new ArrayList<>();
for( StatementBlock sb : _blocks )
-   
tmp.addAll(StatementBlock.rHoistFunctionCallsFromExpressions(sb));
+   
tmp.addAll(StatementBlock.rHoistFunctionCallsFromExpressions(sb, this));
_blocks = tmp;
}
catch(LanguageException ex) {
diff --git a/src/main/java/org/apache/sysds/parser/StatementBlock.java 
b/src/main/java/org/apache/sysds/parser/StatementBlock.java
index f275a84..f6a8f72 100644
--- a/src/main/java/org/apache/sysds/parser/StatementBlock.java
+++ b/src/main/java/org/apache/sysds/parser/StatementBlock.java
@@ -23,6 +23,8 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -37,6 +39,7 @@ import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.parser.Expression.FormatType;
 import org.apache.sysds.parser.LanguageException.LanguageErrorCodes;
 import org.apache.sysds.parser.PrintStatement.PRINTTYPE;
+import org.apache.sysds.parser.dml.DmlSyntacticValidator;
 import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysds.utils.MLContextProxy;
 
@@ -460,13 +463,13 @@ public class StatementBlock extends LiveVariableAnalysis 
implements ParseInfo
 
}

-   public static List 
rHoistFunctionCallsFromExpressions(StatementBlock current) {
+   public static List 
rHoistFunctionCallsFromExpressions(StatementBlock current, DMLProgram prog) {
if (current instanceof FunctionStatementBlock) {
FunctionStatementBlock fsb = 
(FunctionStatementBlock)current;
FunctionStatement fstmt = 
(FunctionStatement)fsb.getStatement(0);
ArrayList tmp = new Arr

[systemml] 01/02: [MINOR] Removal of remaining pydml test files and tests

2020-04-21 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit 0426099b80fb451239e7d9a39bdacf752c79c80e
Author: Matthias Boehm 
AuthorDate: Tue Apr 21 21:43:08 2020 +0200

[MINOR] Removal of remaining pydml test files and tests
---
 .../test/functions/mlcontext/MLContextTest.java| 59 --
 .../scripts/functions/misc/PackageFunCall1.pydml   | 25 -
 .../scripts/functions/misc/PackageFunCall2.pydml   | 26 --
 .../scripts/functions/misc/PackageFunLib.pydml | 25 -
 4 files changed, 135 deletions(-)

diff --git 
a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java 
b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java
index ce7df49..ac7b3e7 100644
--- a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java
@@ -1063,22 +1063,6 @@ public class MLContextTest extends MLContextTestBase {
ml.execute(script);
}
 
-   @Test(expected = MLContextException.class)
-   public void testJavaRDDBadMetadataPYDML() {
-   System.out.println("MLContextTest - JavaRDD bad 
metadata PYML");
-
-   List list = new ArrayList<>();
-   list.add("1,2,3");
-   list.add("4,5,6");
-   list.add("7,8,9");
-   JavaRDD javaRDD = sc.parallelize(list);
-
-   MatrixMetadata mm = new MatrixMetadata(1, 1, 9);
-
-   Script script = dml("print('sum: ' + sum(M))").in("M", javaRDD, 
mm);
-   ml.execute(script);
-   }
-
@Test
public void testRDDGoodMetadataDML() {
System.out.println("MLContextTest - RDD good metadata 
DML");
@@ -1274,28 +1258,6 @@ public class MLContextTest extends MLContextTestBase {
}
 
@Test
-   public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
-   System.out.println("MLContextTest - DataFrame sum PYDML, vector 
with ID column, no format specified");
-
-   List> list = new ArrayList<>();
-   list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
-   list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
-   list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
-   JavaRDD> javaRddTuple = 
sc.parallelize(list);
-
-   JavaRDD javaRddRow = javaRddTuple.map(new 
DoubleVectorRow());
-   List fields = new ArrayList<>();
-   
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, 
DataTypes.DoubleType, true));
-   fields.add(DataTypes.createStructField("C1", new VectorUDT(), 
true));
-   StructType schema = DataTypes.createStructType(fields);
-   Dataset dataFrame = spark.createDataFrame(javaRddRow, 
schema);
-
-   Script script = dml("print('sum: ' + sum(M))").in("M", 
dataFrame);
-   setExpectedStdOut("sum: 45.0");
-   ml.execute(script);
-   }
-
-   @Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
System.out.println("MLContextTest - DataFrame sum DML, vector 
with no ID column, no format specified");
 
@@ -1317,27 +1279,6 @@ public class MLContextTest extends MLContextTestBase {
}
 
@Test
-   public void 
testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() {
-   System.out.println("MLContextTest - DataFrame sum PYDML, vector 
with no ID column, no format specified");
-
-   List list = new ArrayList<>();
-   list.add(Vectors.dense(1.0, 2.0, 3.0));
-   list.add(Vectors.dense(4.0, 5.0, 6.0));
-   list.add(Vectors.dense(7.0, 8.0, 9.0));
-   JavaRDD javaRddVector = sc.parallelize(list);
-
-   JavaRDD javaRddRow = javaRddVector.map(new VectorRow());
-   List fields = new ArrayList<>();
-   fields.add(DataTypes.createStructField("C1", new VectorUDT(), 
true));
-   StructType schema = DataTypes.createStructType(fields);
-   Dataset dataFrame = spark.createDataFrame(javaRddRow, 
schema);
-
-   Script script = dml("print('sum: ' + sum(M))").in("M", 
dataFrame);
-   setExpectedStdOut("sum: 45.0");
-   ml.execute(script);
-   }
-
-   @Test
public void testDisplayBooleanDML() {
System.out.println("MLContextTest - display boolean DML");
String s = "print(b);";
diff 

[systemml] branch master updated: [SYSTEMDS-233] Fix multi-level lineage caching (parfor, determinism)

2020-04-15 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 9bd68ff  [SYSTEMDS-233] Fix multi-level lineage caching (parfor, 
determinism)
9bd68ff is described below

commit 9bd68ffc5d211583a2ebcfe5be514abf4cc29b69
Author: Matthias Boehm 
AuthorDate: Wed Apr 15 21:46:16 2020 +0200

[SYSTEMDS-233] Fix multi-level lineage caching (parfor, determinism)

This patch fixes some issues with multi-level lineage caching in parfor,
specifically (1) to allow function reuse despite differently named
parfor worker functions, and (2) the check for deterministic function
results incorrectly probed too far and thus missed opportunities.

However, down the road we should add an IPA pass which determines once
for all functions if they are deterministic and pass this information
down to the runtime, in order to avoid scenarios where threads are
already blocking on placeholders that are later removed due to
non-deterministic functions.
---
 .../apache/sysds/hops/recompile/Recompiler.java| 10 +-
 src/main/java/org/apache/sysds/lops/Lop.java   |  2 +-
 .../sysds/runtime/controlprogram/ProgramBlock.java | 17 ++--
 .../instructions/cp/FunctionCallCPInstruction.java | 23 +-
 .../apache/sysds/runtime/lineage/LineageCache.java | 16 +--
 .../runtime/lineage/LineageCacheStatistics.java| 10 +-
 .../sysds/runtime/lineage/LineageItemUtils.java| 10 +++---
 .../java/org/apache/sysds/utils/Statistics.java|  2 +-
 .../functions/lineage/FunctionFullReuseTest.java   |  7 +++
 .../functions/lineage/FunctionFullReuse6.dml   |  4 ++--
 10 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java 
b/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java
index 2b11c73..d058c6a 100644
--- a/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java
+++ b/src/main/java/org/apache/sysds/hops/recompile/Recompiler.java
@@ -155,7 +155,7 @@ public class Recompiler
}

// replace thread ids in new instructions
-   if( tid != 0 ) //only in parfor context
+   if( ProgramBlock.isThreadID(tid) ) //only in parfor context
newInst = 
ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, 
null, false, false);

// remove writes if called through mlcontext or jmlc 
@@ -187,7 +187,7 @@ public class Recompiler
}

// replace thread ids in new instructions
-   if( tid != 0 ) //only in parfor context
+   if( ProgramBlock.isThreadID(tid) ) //only in parfor context
newInst = 
ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, 
null, false, false);

// explain recompiled instructions
@@ -209,7 +209,7 @@ public class Recompiler
}

// replace thread ids in new instructions
-   if( tid != 0 ) //only in parfor context
+   if( ProgramBlock.isThreadID(tid) ) //only in parfor context
newInst = 
ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, 
null, false, false);

// explain recompiled instructions
@@ -231,7 +231,7 @@ public class Recompiler
}

// replace thread ids in new instructions
-   if( tid != 0 ) //only in parfor context
+   if( ProgramBlock.isThreadID(tid) ) //only in parfor context
newInst = 
ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, 
null, false, false);

// explain recompiled hops / instructions
@@ -253,7 +253,7 @@ public class Recompiler
}
 
// replace thread ids in new instructions
-   if( tid != 0 ) //only in parfor context
+   if( ProgramBlock.isThreadID(tid) ) //only in parfor context
newInst = 
ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, 
null, false, false);

// explain recompiled hops / instructions
diff --git a/src/main/java/org/apache/sysds/lops/Lop.java 
b/src/main/java/org/apache/sysds/lops/Lop.java
index fa25000..8bb7e1a 100644
--- a/src/main/java/org/apache/sysds/lops/Lop.java
+++ b/src/main/java/org/apache/sysds/lops/Lop.java
@@ -82,7 +82,7 @@ public abstract class Lop
public static final String PROCESS_PREFIX = "_p";
public static final String CP_ROOT_THREAD

[systemml] branch master updated: [SYSTEMDS-118] New generic gridSearch builtin function

2020-04-13 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new acfe388  [SYSTEMDS-118] New generic gridSearch builtin function
acfe388 is described below

commit acfe3883a50b827e78db45d0db901a3f448add20
Author: Matthias Boehm 
AuthorDate: Mon Apr 13 22:05:52 2020 +0200

[SYSTEMDS-118] New generic gridSearch builtin function

This patch adds a new generic grid search function for hyper-parameter
optimization of arbitrary ML algorithms and parameter combinations. This
function takes train and eval functions by name as well as lists of
parameter names and vectors of their values, and returns the parameter
combination and model that gave the best results.

So far hyper-parameter optimization is working, but the core
training/scoring part needs additional features on list data types
(e.g., list-list append, and eval fcalls with lists of unnamed and named
parameters). Also, before it can be applied in practice it needs an
integration with cross validation.
---
 docs/Tasks.txt |  2 +-
 scripts/builtin/gridSearch.dml | 80 +
 .../java/org/apache/sysds/common/Builtins.java |  1 +
 .../functions/builtin/BuiltinGridSearchTest.java   | 82 ++
 .../scripts/functions/builtin/GridSearchLM.dml | 44 
 5 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index c4fa46f..5ae71b1 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -91,7 +91,7 @@ SYSTEMDS-110 New Builtin Functions
  * 115 Builtin function for model debugging (slice finder)OK
  * 116 Builtin function for kmeansOK
  * 117 Builtin function for lm cross validation   OK
- * 118 Builtin function for hyperparameter grid search with CVlm
+ * 118 Builtin function for hyperparameter grid search
  * 119 Builtin functions for l2svm and msvm   OK
 
 SYSTEMDS-120 Performance Features
diff --git a/scripts/builtin/gridSearch.dml b/scripts/builtin/gridSearch.dml
new file mode 100644
index 000..227b863
--- /dev/null
+++ b/scripts/builtin/gridSearch.dml
@@ -0,0 +1,80 @@
+#-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-
+
+m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, 
String predict,
+  List[String] params, List[Unknown] paramValues, Boolean verbose = TRUE) 
+  return (Matrix[Double] B, Frame[Unknown] opt) 
+{
+  # Step 0) preparation of parameters, lengths, and values in convenient form
+  numParams = length(params);
+  paramLens = matrix(0, numParams, 1);
+  for( j in 1:numParams ) {
+vect = as.matrix(paramValues[j,1]);
+paramLens[j,1] = nrow(vect);
+  }
+  paramVals = matrix(0, numParams, max(paramLens));
+  for( j in 1:numParams ) {
+vect = as.matrix(paramValues[j,1]);
+paramVals[j,1:nrow(vect)] = t(vect);
+  }
+   cumLens = rev(cumprod(rev(paramLens))/rev(paramLens));
+   numConfigs = prod(paramLens);
+  
+  # Step 1) materialize hyper-parameter combinations 
+  # (simplify debugging and compared to compute negligible)
+  HP = matrix(0, numConfigs, numParams);
+  parfor( i in 1:nrow(HP) ) {
+for( j in 1:numParams )
+  HP[i,j] = paramVals[j,as.scalar(((i-1)/cumLens[j,1])%%paramLens[j,1]+1)];
+  }
+
+  if( verbose )
+print("GridSeach: Hyper-parameter combinations: \n"+toString(HP));
+
+  # Step 2) training/scoring of parameter combinations
+  # TODO integrate cross validation
+  Rbeta = matrix(0, nrow(HP), ncol(X));
+  Rloss = matrix(0, nrow(HP), 1);
+  arguments = list(X=X, y=y);
+
+  parfor( i in 1:nrow(HP) ) {
+# a) prepare training arguments
+largs = arguments;
+for( j in 1:numParams ) {
+  key = as.scalar(params[j]);
+  value = as.scalar(HP[i,j]);
+  largs = append(largs, list(key=va

[systemml] branch master updated: [SYSTEMDS-15] Travis remove badge

2020-04-13 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 4bbba40  [SYSTEMDS-15] Travis remove badge
4bbba40 is described below

commit 4bbba4051e63e67a3a2366ee3f414f01cc7d0b93
Author: Sebastian 
AuthorDate: Mon Apr 13 18:44:24 2020 +0200

[SYSTEMDS-15] Travis remove badge

Missed that the badge still was in the README.
This is now removed; furthermore, the tasks associated with travis have
been modified to reflect that it is removed, and why.

Closes #886.
---
 README.md  | 2 --
 docs/Tasks.txt | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1ccb7b4..a9cb743 100644
--- a/README.md
+++ b/README.md
@@ -27,9 +27,7 @@ limitations under the License.
 
 ## Status
 
-[![Build 
Status](https://travis-ci.org/apache/systemml.svg?branch=master)](https://travis-ci.org/apache/systemml)
 
[![License](https://img.shields.io/badge/License-Apache%202.0-gre.svg)](https://opensource.org/licenses/Apache-2.0)
-
 ![Build](https://github.com/apache/systemml/workflows/Build/badge.svg)
 
![Documentation](https://github.com/apache/systemml/workflows/Documentation/badge.svg)
 ![Component 
Test](https://github.com/apache/systemml/workflows/Component%20Test/badge.svg)
diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 316971d..c4fa46f 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -9,7 +9,7 @@ SYSTEMDS-10 Compiler Rework / Misc
  * 12 Remove unnecessary HOP/LOP indirections OK
  * 13 Refactoring test cases into component/integration   OK
  * 14 Complete removal of external functions from all scripts
- * 15 Travis integration w/ subset of tests   OK
+ * 15 Travis integration w/ subset of tests   OK 
(removed for Github Actions)
  * 16 Remove instruction patching
  * 17 Refactoring of program block hierarchy  OK
  * 18 Improve API for new dml-bodied builtin functionsOK



[systemml] branch master updated: [SYSTEMDS-291] Extended eval lazy function compilation (nested builtins)

2020-04-13 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 5f1cdf3  [SYSTEMDS-291] Extended eval lazy function compilation 
(nested builtins)
5f1cdf3 is described below

commit 5f1cdf367b0616359461f1fd198898d59f0598a4
Author: Matthias Boehm 
AuthorDate: Mon Apr 13 18:39:47 2020 +0200

[SYSTEMDS-291] Extended eval lazy function compilation (nested builtins)

This patch extends the lazy function compilation of dml-bodied builtin
functions called through eval. We now support nested dml-bodied function
calls (e.g., eval -> lm -> lmDS/lmCG) which is crucial for generic
primitives of hyper-parameter optimization and the enumeration of
cleaning pipelines.
---
 .../sysds/hops/rewrite/RewriteConstantFolding.java |  2 +-
 .../java/org/apache/sysds/parser/DMLProgram.java   |  4 ++
 .../org/apache/sysds/parser/DMLTranslator.java |  2 +-
 .../sysds/parser/FunctionCallIdentifier.java   |  8 +--
 .../sysds/parser/FunctionStatementBlock.java   | 14 ++---
 .../org/apache/sysds/parser/IfStatementBlock.java  |  4 +-
 .../org/apache/sysds/parser/StatementBlock.java|  2 +-
 .../sysds/parser/dml/DmlSyntacticValidator.java|  8 ++-
 .../sysds/runtime/controlprogram/Program.java  | 18 +-
 .../controlprogram/paramserv/ParamservUtils.java   |  2 +-
 .../instructions/cp/EvalNaryCPInstruction.java | 70 ++
 .../sysds/runtime/lineage/LineageRewriteReuse.java |  2 +-
 .../test/functions/mlcontext/MLContextTest.java| 10 
 .../mlcontext/eval4-nested_builtin-test.dml| 30 ++
 14 files changed, 129 insertions(+), 47 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java 
b/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java
index ec098e6..6e04082 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/RewriteConstantFolding.java
@@ -184,7 +184,7 @@ public class RewriteConstantFolding extends HopRewriteRule

private BasicProgramBlock getProgramBlock() {
if( _tmpPB == null )
-   _tmpPB = new BasicProgramBlock( new Program() );
+   _tmpPB = new BasicProgramBlock(new Program());
return _tmpPB;
}

diff --git a/src/main/java/org/apache/sysds/parser/DMLProgram.java 
b/src/main/java/org/apache/sysds/parser/DMLProgram.java
index e86464c..4e5e229 100644
--- a/src/main/java/org/apache/sysds/parser/DMLProgram.java
+++ b/src/main/java/org/apache/sysds/parser/DMLProgram.java
@@ -131,6 +131,10 @@ public class DMLProgram
return ret;
}
 
+   public boolean containsFunctionStatementBlock(String name) {
+   return _functionBlocks.containsKey(name);
+   }
+   
public void addFunctionStatementBlock(String fname, 
FunctionStatementBlock fsb) {
_functionBlocks.put(fname, fsb);
}
diff --git a/src/main/java/org/apache/sysds/parser/DMLTranslator.java 
b/src/main/java/org/apache/sysds/parser/DMLTranslator.java
index 9e41f9b..e61c928 100644
--- a/src/main/java/org/apache/sysds/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysds/parser/DMLTranslator.java
@@ -412,7 +412,7 @@ public class DMLTranslator
throws LanguageException, DMLRuntimeException, LopsException, 
HopsException 
{   
// constructor resets the set of registered functions
-   Program rtprog = new Program();
+   Program rtprog = new Program(prog);

// for all namespaces, translate function statement blocks into 
function program blocks
for (String namespace : prog.getNamespaces().keySet()){
diff --git a/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java 
b/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java
index fc5e1d8..497d591 100644
--- a/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java
+++ b/src/main/java/org/apache/sysds/parser/FunctionCallIdentifier.java
@@ -115,8 +115,8 @@ public class FunctionCallIdentifier extends DataIdentifier
}
if (hasNamed && hasUnnamed){
raiseValidateError(" In DML, functions can only have 
named parameters " +
-   "(e.g., name1=value1, name2=value2) or 
unnamed parameters (e.g, value1, value2). " + 
-   _name + " has both parameter types.", 
conditional);
+   "(e.g., name1=value1, name2=value2) or unnamed 
parameters (e.g, value1, value2). " + 
+   

[systemml] branch master updated: [SYSTEMDS-263] Initial design ONNX graph importer

2020-04-12 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 0dae427  [SYSTEMDS-263] Initial design ONNX graph importer
0dae427 is described below

commit 0dae42705f91b00abc03be09d810b3a9286338c5
Author: Lukas Timpl 
AuthorDate: Sun Apr 12 20:43:47 2020 +0200

[SYSTEMDS-263] Initial design ONNX graph importer

Since ONNX does support conditional operators (loop, if), I've tailored
the design towards a command-line tool that generates a DML script as
discussed.

AMLS project SS2020.
Closes #885.
---
 docs/Tasks.txt   |  1 +
 docs/onnx-systemds-design.md | 46 
 2 files changed, 47 insertions(+)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index cfcab1a..316971d 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -205,6 +205,7 @@ SYSTEMDS-250 Extended Slice Finding
 SYSTEMDS-260 Misc Tools
  * 261 Stable marriage algorithm  OK
  * 262 Data augmentation tool for data cleaning   OK
+ * 263 ONNX graph importer/exporter
 
 SYSTEMDS-270 Compressed Matrix Blocks
  * 271 Reintroduce compressed matrix blocks from SystemML OK
diff --git a/docs/onnx-systemds-design.md b/docs/onnx-systemds-design.md
new file mode 100644
index 000..9650f9c
--- /dev/null
+++ b/docs/onnx-systemds-design.md
@@ -0,0 +1,46 @@
+# onnx-systemds
+
+A tool for importing/exporting 
[ONNX](https://github.com/onnx/onnx/blob/master/docs/IR.md) graphs into/from 
SystemDS DML scripts.
+
+
+## Goals
+
+* Support for importing [operators of the ONNX base 
definition](https://github.com/onnx/onnx/blob/master/docs/Operators.md)
+
+* Support for importing [operators defined by 
ONNX-ML](https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md)
+
+* Support for exporting DML script to ONNX graphs
+
+## Limitations
+
+* Not able to support all data types / operators as they are not currently 
supported by SystemDS
+
+
+
+## Suggested Implementation
+
+Since the ONNX specification includes the conditional operators 
[loop](https://github.com/onnx/onnx/blob/master/docs/Operators.md#Loop) and 
[if](https://github.com/onnx/onnx/blob/master/docs/Operators.md#If), a direct 
conversion from ONNX to the internal HOP might not be ideal. 
+
+Hence my suggested implementation is a dedicated tool invoked from command 
line which generates DML scripts. This also enables optimizations performed by 
the compiler at both graph and program level.
+
+### Example Call
+
+```bash
+onnx-systemds model.onx --out model_script.dml
+```
+
+
+### Tooling
+
+* Due to the availability of a [Python 
API](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md) for 
ONNX, I would suggest implementing the tool in Python
+* Another advantage of Python is good support for template engines e.g. 
[Jinja](https://jinja.palletsprojects.com/en/2.11.x/)
+* An implementation could use templates for various operators which are then 
combined into a script
+
+### Implementation Details
+
+ONNX is a [serialized 
graph](https://github.com/onnx/onnx/blob/master/docs/IR.md#graphs) structured 
as a sorted list of nodes that form a DAG (directed acyclic graph).
+
+1. Loading in the serialized structure
+2. 
[Checking](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md#checking-an-onnx-model)
 model and 
[converting](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md#converting-version-of-an-onnx-model-within-default-domain-aionnx)
 models to a common version
+3. Building a simple internal graph structure (for arbitrary operators)
+4. Generating the DML script while traversing this graph (provided information 
in doc_strings and other description variables are added as comments to improve 
human-readability of the generated script)



[systemml] branch master updated: [SYSTEMDS-52] Fix libsvm reader/writer integration and correctness

2020-04-11 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 84ef713  [SYSTEMDS-52] Fix libsvm reader/writer integration and 
correctness
84ef713 is described below

commit 84ef71326c6781bad4ed9b39a210ee2cd4a6d4bd
Author: Matthias Boehm 
AuthorDate: Sat Apr 11 23:12:37 2020 +0200

[SYSTEMDS-52] Fix libsvm reader/writer integration and correctness

This patch fixes a correctness issue of the libsvm local writers, which
incorrectly shifted the output indexes twice for sparse inputs.
Furthermore, the libsvm local readers were not fully integrated in all
code paths yet.

The distributed libsvm readers/writers still remain to be integrated.
---
 docs/Tasks.txt |  4 +-
 .../sysds/runtime/io/MatrixReaderFactory.java  | 69 ++
 .../sysds/runtime/io/ReaderTextLIBSVMParallel.java |  2 +-
 .../apache/sysds/runtime/io/WriterTextLIBSVM.java  | 12 ++--
 .../test/functions/data/misc/NoRenameTest.java | 48 +++
 src/test/scripts/functions/data/NoRenameTest1.dml  |  2 +-
 6 files changed, 65 insertions(+), 72 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 7bb0c10..cfcab1a 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -47,8 +47,8 @@ SYSTEMDS-40 Preprocessing builtins
 
 SYSTEMDS-50 I/O Formats
  * 51 Support for homogeneous JSON (local/distributed)
- * 52 Support for libsvm files (local/distributed) 
- * 53 New sql data source (local, distributed)
+ * 52 Support for libsvm files (local/distributed)
+ * 53 New sql data source (local, distributed)
  * 54 Support for is.na, is.nan, is.infinite  OK
 
 SYSTEMDS-60 Update SystemML improvements
diff --git a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java 
b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java
index 3d2af34..168a336 100644
--- a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java
@@ -28,36 +28,34 @@ import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 public class MatrixReaderFactory 
 {
-
-   public static MatrixReader createMatrixReader( InputInfo iinfo ) 
+   public static MatrixReader createMatrixReader(InputInfo iinfo)
{
MatrixReader reader = null;
+   boolean par = 
ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS);
+   boolean mcsr = MatrixBlock.DEFAULT_SPARSEBLOCK == 
SparseBlock.Type.MCSR;

-   if( iinfo == InputInfo.TextCellInputInfo || iinfo == 
InputInfo.MatrixMarketInputInfo )
-   {
-   if( 
ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS)
 && MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR )
-   reader = new ReaderTextCellParallel( iinfo );
-   else
-   reader = new ReaderTextCell( iinfo );
+   if( iinfo == InputInfo.TextCellInputInfo || iinfo == 
InputInfo.MatrixMarketInputInfo ) {
+   reader = (par & mcsr) ? 
+   new ReaderTextCellParallel(iinfo) : new 
ReaderTextCell(iinfo);
}
-   else if( iinfo == InputInfo.CSVInputInfo )
-   {
-   if( 
ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS)
 && MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR )
-   reader = new ReaderTextCSVParallel(new 
FileFormatPropertiesCSV());
-   else
-   reader = new ReaderTextCSV(new 
FileFormatPropertiesCSV());
+   else if( iinfo == InputInfo.CSVInputInfo ) {
+   reader = (par & mcsr) ? 
+   new ReaderTextCSVParallel(new 
FileFormatPropertiesCSV()) :
+   new ReaderTextCSV(new 
FileFormatPropertiesCSV());
+   }
+   else if( iinfo == InputInfo.LIBSVMInputInfo) {
+   reader = (par & mcsr) ? 
+   new ReaderTextLIBSVMParallel() : new 
ReaderTextLIBSVM();
}
else if( iinfo == InputInfo.BinaryCellInputInfo ) 
reader = new ReaderBinaryCell();
else if( iinfo == InputInfo.BinaryBlockInputInfo ) {
-   if( 
ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_BINARYFORMATS)
 && MatrixBlock.DEFAULT_SPARSEBLOCK == SparseBlock.Type.MCSR )
-   reader = 

[systemml] branch master updated: [SYSTEMML-2538] Fix csv/text output rename in forced singlenode

2020-04-11 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 47924e6  [SYSTEMML-2538] Fix csv/text output rename in forced 
singlenode
47924e6 is described below

commit 47924e6aced3dac0768756c7dfec932d696b6a3f
Author: Matthias Boehm 
AuthorDate: Sat Apr 11 22:51:50 2020 +0200

[SYSTEMML-2538] Fix csv/text output rename in forced singlenode

This patch fixes an issue where an input csv/text file is directly fed
into a persistent write, which eventually just renames the input file
because it already exists on HDFS in the right format. We now explicitly
guard against persistently read inputs, which can only occur w/ forced
singlenode execution mode because otherwise (in spark and hybrid) there is a
reblock (potentially in memory) that creates a new metadata object.

Furthermore, this also includes a minor internal refactoring for
consistently obtaining input/output infos for external format strings,
as well as a slight modification of the MatrixMatrixCellwiseTest to run
over smaller inputs (because R is taking quite a while for them).
---
 .../java/org/apache/sysds/api/jmlc/Connection.java |   4 +-
 .../org/apache/sysds/parser/DataExpression.java|   4 +-
 .../controlprogram/caching/CacheableData.java  |   2 +-
 .../federated/FederatedWorkerHandler.java  |   2 +-
 .../instructions/cp/VariableCPInstruction.java |  38 ++-
 .../sysds/runtime/io/MatrixReaderFactory.java  |   2 +-
 .../sysds/runtime/matrix/data/InputInfo.java   |   8 +-
 .../sysds/runtime/matrix/data/OutputInfo.java  |   2 +-
 .../org/apache/sysds/test/AutomatedTestBase.java   |  14 +-
 .../FullMatrixMatrixCellwiseOperationTest.java |   4 +-
 .../test/functions/data/misc/NoRenameTest.java | 254 +
 .../functions/frame/FrameMatrixReblockTest.java|   5 +-
 .../test/functions/frame/FrameMatrixWriteTest.java |   2 +-
 .../transform/TransformEncodeDecodeTest.java   |   2 +-
 src/test/scripts/functions/data/NoRenameTest1.dml  |  24 ++
 src/test/scripts/functions/data/NoRenameTest2.dml  |  24 ++
 16 files changed, 345 insertions(+), 46 deletions(-)

diff --git a/src/main/java/org/apache/sysds/api/jmlc/Connection.java 
b/src/main/java/org/apache/sysds/api/jmlc/Connection.java
index e1557b8..a008939 100644
--- a/src/main/java/org/apache/sysds/api/jmlc/Connection.java
+++ b/src/main/java/org/apache/sysds/api/jmlc/Connection.java
@@ -372,7 +372,7 @@ public class Connection implements Closeable
long nnz = 
jmtd.containsKey(DataExpression.READNNZPARAM)?

jmtd.getLong(DataExpression.READNNZPARAM) : -1;
String format = 
jmtd.getString(DataExpression.FORMAT_TYPE);
-   InputInfo iinfo = 
InputInfo.stringExternalToInputInfo(format);
+   InputInfo iinfo = InputInfo.fromExternalString(format);

//read matrix file
return readDoubleMatrix(fname, iinfo, rows, cols, blen, 
nnz);
@@ -614,7 +614,7 @@ public class Connection implements Closeable
long rows = jmtd.getLong(DataExpression.READROWPARAM);
long cols = jmtd.getLong(DataExpression.READCOLPARAM);
String format = 
jmtd.getString(DataExpression.FORMAT_TYPE);
-   InputInfo iinfo = 
InputInfo.stringExternalToInputInfo(format);
+   InputInfo iinfo = InputInfo.fromExternalString(format);

//read frame file
return readStringFrame(fname, iinfo, rows, cols);
diff --git a/src/main/java/org/apache/sysds/parser/DataExpression.java 
b/src/main/java/org/apache/sysds/parser/DataExpression.java
index 6ab1eb2..1b7ddc4 100644
--- a/src/main/java/org/apache/sysds/parser/DataExpression.java
+++ b/src/main/java/org/apache/sysds/parser/DataExpression.java
@@ -220,7 +220,7 @@ public class DataExpression extends DataIdentifier
return null;
}   
dataExpr.addVarParam(currName, currExpr);
-   }   
+   }
}
else if (functionName.equalsIgnoreCase("rand")){

@@ -1178,7 +1178,7 @@ public class DataExpression extends DataIdentifier
getOutput().setNnz(-1L);
}

-   else{   
+   else{
raiseValidateError("Unknown Data Type " + 
dataTypeString + ". Valid  values: "

[systemml] branch master updated: [SYSTEMDS-351] New builtin dropInvalid for cleaning by expected schema

2020-04-10 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 158ccff  [SYSTEMDS-351] New builtin dropInvalid for cleaning by 
expected schema
158ccff is described below

commit 158ccffbadef845058d9f3a2c5084fbb8fa00429
Author: Shafaq Siddiqi 
AuthorDate: Fri Apr 10 20:44:33 2020 +0200

[SYSTEMDS-351] New builtin dropInvalid for cleaning by expected schema

Closes #883.
---
 docs/Tasks.txt |   3 +
 .../java/org/apache/sysds/common/Builtins.java |   1 +
 src/main/java/org/apache/sysds/hops/BinaryOp.java  |   3 +-
 src/main/java/org/apache/sysds/hops/Hop.java   |   5 +-
 src/main/java/org/apache/sysds/lops/Binary.java|  11 +-
 src/main/java/org/apache/sysds/lops/BinaryM.java   |   4 +-
 .../sysds/parser/BuiltinFunctionExpression.java|  10 +
 .../org/apache/sysds/parser/DMLTranslator.java |  21 ++-
 .../sysds/runtime/functionobjects/Builtin.java |   3 +-
 .../runtime/instructions/CPInstructionParser.java  |   5 +-
 .../runtime/instructions/InstructionUtils.java |   3 +
 .../runtime/instructions/SPInstructionParser.java  |  10 +-
 .../instructions/cp/BinaryCPInstruction.java   |   2 +
 .../cp/BinaryFrameFrameCPInstruction.java  |  47 +
 .../runtime/instructions/cp/CPInstruction.java |   2 +-
 .../spark/BinaryFrameFrameSPInstruction.java   |  84 +
 .../instructions/spark/BinarySPInstruction.java|  42 +++--
 .../sysds/runtime/matrix/data/FrameBlock.java  |  36 
 .../apache/sysds/runtime/util/UtilFunctions.java   |   4 +-
 .../functions/frame/FrameIsCorrectTypeTest.java| 206 +
 src/test/scripts/functions/frame/DropInvalid.dml   |  25 +++
 21 files changed, 480 insertions(+), 47 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index d1168e6..5fdd96d 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -250,5 +250,8 @@ SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
  * 344 Unmark functions/SBs containing non-determinism for caching
  * 345 Compiler assisted cache configuration
 
+SYSTEMDS-350 Data Cleaning Framework
+ * 351 New builtin function for error correction by schemaOK
+
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 7220198..4f20d87 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -83,6 +83,7 @@ public enum Builtins {
DETECTSCHEMA("detectSchema", false),
DIAG("diag", false),
DISCOVER_FD("discoverFD", true),
+   DROP_INVALID("dropInvalid", false),
EIGEN("eigen", false, ReturnType.MULTI_RETURN),
EXISTS("exists", false),
EXP("exp", false),
diff --git a/src/main/java/org/apache/sysds/hops/BinaryOp.java 
b/src/main/java/org/apache/sysds/hops/BinaryOp.java
index 769329b..8212b53 100644
--- a/src/main/java/org/apache/sysds/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/BinaryOp.java
@@ -894,7 +894,8 @@ public class BinaryOp extends MultiThreadedHop
 
private static MMBinaryMethod optFindMMBinaryMethodSpark(Hop left, Hop 
right) {
// TODO size information for tensor
-   if (left._dataType == DataType.TENSOR && right._dataType == 
DataType.TENSOR)
+   if ((left._dataType == DataType.TENSOR && right._dataType == 
DataType.TENSOR)
+   || (left._dataType == DataType.FRAME && right._dataType 
== DataType.FRAME))
return MMBinaryMethod.MR_BINARY_R;
long m1_dim1 = left.getDim1();
long m1_dim2 = left.getDim2();
diff --git a/src/main/java/org/apache/sysds/hops/Hop.java 
b/src/main/java/org/apache/sysds/hops/Hop.java
index 01c61c9..f64b5f0 100644
--- a/src/main/java/org/apache/sysds/hops/Hop.java
+++ b/src/main/java/org/apache/sysds/hops/Hop.java
@@ -1057,6 +1057,7 @@ public abstract class Hop implements ParseInfo
LOG_NZ, //sparse-safe log; ppred(X,0,"!=")*log(X,0.5)
MINUS1_MULT, //1-X*Y
BITWAND, BITWOR, BITWXOR, BITWSHIFTL, BITWSHIFTR, //bitwise 
operations
+   DROP_INVALID, // frame operation for removing cells invalid wrt 
given data type
}
 
public static final HashMap 
HopsOpOp2LopsB;
@@ -1088,6 +1089,7 @@ public abstract class Hop implements ParseInfo
HopsOpOp2LopsB.put(OpOp2.BITWXOR, Binary.OperationTypes.BW_XOR);
HopsOpOp2LopsB.put(OpOp2.BITWSHIFTL, 
Binary.OperationTypes.BW_SHIFTL);
HopsOpOp2LopsB.put(Op

[systemml] branch master updated: [MINOR] Fix opcodes for lineage-based reuse (corrupted by rework)

2020-04-10 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 3fd8769  [MINOR] Fix opcodes for lineage-based reuse (corrupted by 
rework)
3fd8769 is described below

commit 3fd87695591bdba30964db995066472d148b252e
Author: Matthias Boehm 
AuthorDate: Fri Apr 10 18:39:39 2020 +0200

[MINOR] Fix opcodes for lineage-based reuse (corrupted by rework)
---
 src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java 
b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
index 75a305a..e130cfa 100644
--- a/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
+++ b/src/main/java/org/apache/sysds/runtime/lineage/LineageCacheConfig.java
@@ -30,7 +30,7 @@ import java.util.ArrayList;
 public class LineageCacheConfig {

private static final String[] REUSE_OPCODES = new String[] {
-   "tmm", "ba+*", "*", "/", "+", "nrow", "ncol",
+   "tsmm", "ba+*", "*", "/", "+", "nrow", "ncol",
"rightIndex", "leftIndex", "groupedagg", "r'", "solve", "spoof"
};




[systemml] branch master updated: [SYSTEMDS-314] New Python SystemDS context manager

2020-04-10 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 0793a18  [SYSTEMDS-314] New Python SystemDS context manager
0793a18 is described below

commit 0793a183cf874faaf1f7d143d6b4e64b48e35db9
Author: Kevin Innerebner 
AuthorDate: Fri Apr 10 17:53:34 2020 +0200

[SYSTEMDS-314] New Python SystemDS context manager

Closes #874.
---
 docs/Tasks.txt |   1 +
 src/main/python/docs/source/matrix.rst |  42 +-
 src/main/python/docs/source/simple_examples.rst|  67 +
 src/main/python/systemds/__init__.py   |   2 +-
 src/main/python/systemds/{ => context}/__init__.py |   4 +-
 .../python/systemds/context/systemds_context.py| 149 +
 src/main/python/systemds/matrix/matrix.py  |  44 +++---
 src/main/python/systemds/matrix/operation_node.py  | 124 +
 src/main/python/systemds/script_building/dag.py|  36 +++--
 src/main/python/systemds/script_building/script.py |  37 ++---
 .../systemds/{__init__.py => utils/consts.py}  |   9 +-
 src/main/python/systemds/utils/converters.py   |   6 +-
 src/main/python/systemds/utils/helpers.py  |  46 +--
 src/main/python/tests/test_l2svm.py|   9 +-
 src/main/python/tests/test_l2svm_lineage.py|  20 +--
 src/main/python/tests/test_lineagetrace.py |  23 ++--
 src/main/python/tests/test_matrix_aggregations.py  |  30 ++---
 src/main/python/tests/test_matrix_binary_op.py |  32 ++---
 18 files changed, 446 insertions(+), 235 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index d19672f..d1168e6 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -231,6 +231,7 @@ SYSTEMDS-310 Python Bindings
  * 311 Initial Python Binding for federated execution OK
  * 312 Python 3.6 compatibility   OK
  * 313 Python Documentation upload via Github Actions OK
+ * 314 Python SystemDS context manager OK
 
 SYSTEMDS-320 Merge SystemDS into Apache SystemML  OK
  * 321 Merge histories of SystemDS and SystemML   OK
diff --git a/src/main/python/docs/source/matrix.rst 
b/src/main/python/docs/source/matrix.rst
index f2f2fdc..dd88c7c 100644
--- a/src/main/python/docs/source/matrix.rst
+++ b/src/main/python/docs/source/matrix.rst
@@ -23,6 +23,39 @@
 Matrix API
 ==
 
+SystemDSContext
+---
+
+All operations using SystemDS need a java instance running.
+The connection is ensured by a ``SystemDSContext`` object.
+A ``SystemDSContext`` object can be created using:
+
+.. code-block:: python
+  sysds = SystemDSContext()
+
+When the calculations are finished the context has to be closed again:
+
+.. code-block:: python
+  sysds.close()
+
+Since it is annoying that it is always necessary to close the context, 
``SystemDSContext``
+implements the python context management protocol, which supports the 
following syntax:
+
+.. code-block:: python
+  with SystemDSContext() as sds:
+# do something with sds which is a SystemDSContext
+pass
+
+This will automatically close the ``SystemDSContext`` once the with-block is 
left.
+
+.. note::
+
+  Creating a context is an expensive procedure, because a sub-process running 
a JVM might have to be started; therefore
+  try to do this only once for your program, or always leave at least one 
context open.
+
+.. autoclass:: systemds.context.SystemDSContext
+  :members:
+
 OperationNode
 -
 
@@ -49,13 +82,12 @@ Matrix
 --
 
 A ``Matrix`` is represented either by an ``OperationNode``, or the derived 
class ``Matrix``.
-An Matrix can recognized it by checking the ``output_type`` of the object.
+A Matrix can be recognized by checking the ``output_type`` of the object.
 
-Matrices are the most fundamental objects we operate on.
-If one generate the matrix in SystemDS directly via a function call,
-it can be used in an function which will generate an ``OperationNode`` e.g. 
``federated``, ``full``, ``seq``.
+Matrices are the most fundamental objects SystemDS operates on.
 
-If we want to work on an numpy array we need to use the class ``Matrix``.
+Although it is possible to generate matrices with the function calls or object 
construction specified below,
+the recommended way is to use the methods defined on ``SystemDSContext``.
 
 .. autoclass:: systemds.matrix.Matrix
 :members:
diff --git a/src/main/python/docs/source/simple_examples.rst 
b/src/main/python/docs/source/simple_examples.rst
index b9c35c3..2175fd4 100644
--- a/src/main/python/docs/source/simple_examples.rst
+++ b/src/main/python/docs/source/simple_examples.rst
@@ -27,18 +27,24 @@ Let's take a look at some code examples.
 Matrix Oper

[systemml] branch master updated: [MINOR] Remove Travis Testing

2020-04-10 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 4e0edec  [MINOR] Remove Travis Testing
4e0edec is described below

commit 4e0edec2d19fb28b59b830ac5dee479c8596041f
Author: Sebastian 
AuthorDate: Fri Apr 10 17:16:53 2020 +0200

[MINOR] Remove Travis Testing

The travis testing is removed since our testing is now executed
using Github Actions

The travis testing was only covering the component tests.

Closes #884.
---
 .travis.yml | 53 -
 1 file changed, 53 deletions(-)

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 82101be..000
--- a/.travis.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-dist: xenial
-
-language: java
-
-jdk:
-  - openjdk8
-
-addons:
-#  apt:
-#sources:
-#- r-packages-trusty
-#packages:
-#- r-base-dev
-
-cache:
-  apt: true
-  directories:
-# caching .m2 causes an error loading hadoop-yarn-common-2.6.0.jar. Not sure 
why.
-#- ${HOME}/.m2
-#- ${HOME}/R
-#- /usr/local/lib/R/site-library
-
-install:
-#  - sudo Rscript ./src/test/scripts/installDependencies.R
-
-before_script:
-#  this is not needed anymore since adding authentication object in code for 
running hadoop/spark local
-#  - chmod -R 755 *
-
-script:
-  #  - mvn clean verify jacoco:report coveralls:report
-  - mvn test-compile 
-  - mvn surefire:test -Dtest=org.apache.sysds.test.component.**
-
-after_success:
-#  -  mvn test jacoco:report coveralls:report
\ No newline at end of file



[systemml] branch master updated: [SYSTEMDS-331, 332] Fix robustness lineage cache (deadlocks, correctness)

2020-04-10 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new f22e999  [SYSTEMDS-331,332] Fix robustness lineage cache (deadlocks, 
correctness)
f22e999 is described below

commit f22e9991e2370dc30a1fed01c3142c27071da42c
Author: Matthias Boehm 
AuthorDate: Fri Apr 10 16:28:39 2020 +0200

[SYSTEMDS-331,332] Fix robustness lineage cache (deadlocks, correctness)

This patch fixes the robustness of lineage-based caching, especially in
multi-threaded parfor programs. This includes:

1) Deadlock prevention: With multi-level caching, the placeholders that
prevent concurrent computation of redundant intermediates led to
deadlocks because the following threads blocked inside the critical
region and thus any caching of the thread that was producing the
intermediate (via a complex DAG of operations) was blocked.

2) Deadlock wrong Data Types: With the introduction of scalar caching
each thread had to decide to either pull a scalar or matrix on the
placeholders. Since this decision was made based on the data item (which
might not be available yet in parfor) threads were blocking on the wrong
type and thus again producing deadlocks.

3) Correctness: The loop iteration variable of parfor was not integrated
yet with lineage tracing leading to incorrect reuse for different parfor
iterations that depended on the iteration variable.

Furthermore, this patch also cleans up an unnecessarily wide public API
of the lineage cache in order to facilitate a correct internal
implementation. However, there are still a number of remaining issues,
e.g., with the computation of compensation plans and probing logic.
---
 docs/Tasks.txt |   2 +-
 .../org/apache/sysds/parser/StatementBlock.java|  43 ++-
 .../runtime/controlprogram/BasicProgramBlock.java  |  16 +-
 .../runtime/controlprogram/parfor/ParWorker.java   |  42 ++-
 .../instructions/cp/FunctionCallCPInstruction.java |   8 +-
 .../apache/sysds/runtime/lineage/LineageCache.java | 388 +++--
 .../sysds/runtime/lineage/LineageCacheConfig.java  |  30 ++
 .../sysds/runtime/lineage/LineageRewriteReuse.java |  54 +--
 .../functions/lineage/FunctionFullReuseTest.java   |  42 ++-
 .../functions/lineage/FunctionFullReuse6.dml   |  37 ++
 .../functions/lineage/FunctionFullReuse7.dml   |  37 ++
 11 files changed, 412 insertions(+), 287 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 42741da..d19672f 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -239,7 +239,7 @@ SYSTEMDS-320 Merge SystemDS into Apache SystemML
  OK
  
 SYSTEMDS-330 Lineage Tracing, Reuse and Integration
  * 331 Cache and reuse scalar outputs (instruction and multi-level)   OK
- * 332 Parfor integration with multi-level reuse 
+ * 332 Parfor integration with multi-level reuse  OK
  * 333 Use exact execution time for cost based eviction
  
 SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
diff --git a/src/main/java/org/apache/sysds/parser/StatementBlock.java 
b/src/main/java/org/apache/sysds/parser/StatementBlock.java
index 2e87909..5991315 100644
--- a/src/main/java/org/apache/sysds/parser/StatementBlock.java
+++ b/src/main/java/org/apache/sysds/parser/StatementBlock.java
@@ -43,12 +43,12 @@ import org.apache.sysds.utils.MLContextProxy;
 
 public class StatementBlock extends LiveVariableAnalysis implements ParseInfo
 {
-
protected static final Log LOG = 
LogFactory.getLog(StatementBlock.class.getName());
protected static IDSequence _seq = new IDSequence();
private static IDSequence _seqSBID = new IDSequence();
protected final long _ID;
-
+   protected final String _name;
+   
protected DMLProgram _dmlProg;
protected ArrayList _statements;
ArrayList _hops = null;
@@ -62,6 +62,7 @@ public class StatementBlock extends LiveVariableAnalysis 
implements ParseInfo
 
public StatementBlock() {
_ID = getNextSBID();
+   _name = "SB"+_ID;
_dmlProg = null;
_statements = new ArrayList<>();
_read = new VariableSet();
@@ -96,6 +97,10 @@ public class StatementBlock extends LiveVariableAnalysis 
implements ParseInfo
public long getSBID() {
return _ID;
}
+   
+   public String getName() {
+   return _name;
+   }
 
public void addStatement(Statement s) {
_statements.add(s);
@@ -399,8 +404,9 @@ public class StatementBlock extends LiveVariableAnalysis 
implements ParseInfo
return inputs;
}
 
-   public ArrayList getOutputsofSB() {
-   ArrayList ou

[systemml] branch master updated: [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script

2020-04-09 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new a3c0cce  [SYSTEMML-2533] Fix named arguments in MNIST LeNet example 
script
a3c0cce is described below

commit a3c0cce761c855b034302e1f0871d68d8eccd089
Author: Nathan Kan 
AuthorDate: Thu Apr 9 19:55:39 2020 +0200

[SYSTEMML-2533] Fix named arguments in MNIST LeNet example script

This fix backports the fix from #866 into the merged SystemDS code line.

Closes #867.
---
 scripts/nn/examples/mnist_lenet.dml | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/nn/examples/mnist_lenet.dml 
b/scripts/nn/examples/mnist_lenet.dml
index 57b8ba6..a882501 100644
--- a/scripts/nn/examples/mnist_lenet.dml
+++ b/scripts/nn/examples/mnist_lenet.dml
@@ -118,13 +118,13 @@ train = function(matrix[double] X, matrix[double] Y,
 stride, stride, pad, pad)
   outr1 = relu::forward(outc1)
   [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 
Hf=2, Wf=2,
-strideh=2, stridew=2, 
pad=0, pad=0)
+strideh=2, stridew=2, 
padh=0, padw=0)
   ## layer 2: conv2 -> relu2 -> pool2
   [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, 
Woutp1, Hf, Wf,
 stride, stride, pad, pad)
   outr2 = relu::forward(outc2)
   [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 
Hf=2, Wf=2,
-strideh=2, stridew=2, 
pad=0, pad=0)
+strideh=2, stridew=2, 
padh=0, padw=0)
   ## layer 3:  affine3 -> relu3 -> dropout
   outa3 = affine::forward(outp2, W3, b3)
   outr3 = relu::forward(outa3)
@@ -166,13 +166,13 @@ train = function(matrix[double] X, matrix[double] Y,
   [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
   ## layer 2: conv2 -> relu2 -> pool2
   doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, 
Woutc2, Hf=2, Wf=2,
-strideh=2, stridew=2, pad=0, pad=0)
+strideh=2, stridew=2, padh=0, padw=0)
   doutc2 = relu::backward(doutr2, outc2)
   [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, 
b2, F1,
 Houtp1, Woutp1, Hf, Wf, stride, 
stride, pad, pad)
   ## layer 1: conv1 -> relu1 -> pool1
   doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, 
Woutc1, Hf=2, Wf=2,
-strideh=2, stridew=2, pad=0, pad=0)
+strideh=2, stridew=2, padh=0, padw=0)
   doutc1 = relu::backward(doutr1, outc1)
   [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, 
W1, b1, C, Hin, Win,
   Hf, Wf, stride, stride, pad, pad)
@@ -264,13 +264,13 @@ predict = function(matrix[double] X, int C, int Hin, int 
Win,
   pad, pad)
 outr1 = relu::forward(outc1)
 [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 
Hf=2, Wf=2,
-  strideh=2, stridew=2, pad=0, 
pad=0)
+  strideh=2, stridew=2, 
padh=0, padw=0)
 ## layer 2: conv2 -> relu2 -> pool2
 [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, 
Woutp1, Hf, Wf,
   stride, stride, pad, pad)
 outr2 = relu::forward(outc2)
 [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 
Hf=2, Wf=2,
-  strideh=2, stridew=2, pad=0, 
pad=0)
+  strideh=2, stridew=2, 
padh=0, padw=0)
 ## layer 3:  affine3 -> relu3
 outa3 = affine::forward(outp2, W3, b3)
 outr3 = relu::forward(outa3)



[systemml] branch master updated: [MINOR] Extended JMLC API (handling of pinned variables)

2020-04-09 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 39c5654  [MINOR] Extended JMLC API (handling of pinned variables)
39c5654 is described below

commit 39c56541ca83ea36093384220d19a31b5578537e
Author: Anthony Thomas 
AuthorDate: Thu Apr 9 19:29:55 2020 +0200

[MINOR] Extended JMLC API (handling of pinned variables)

Closes #835.
---
 .../java/org/apache/sysds/api/jmlc/PreparedScript.java  | 17 +
 .../sysds/runtime/controlprogram/LocalVariableMap.java  |  4 
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java 
b/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java
index 04101f2..3175fcb 100644
--- a/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java
+++ b/src/main/java/org/apache/sysds/api/jmlc/PreparedScript.java
@@ -72,7 +72,7 @@ public class PreparedScript implements ConfigurableAPI
//input/output specification
private final HashSet _inVarnames;
private final HashSet _outVarnames;
-   private final HashMap _inVarReuse;
+   private final LocalVariableMap _inVarReuse;

//internal state (reused)
private final Program _prog;
@@ -91,7 +91,7 @@ public class PreparedScript implements ConfigurableAPI
_vars.setRegisteredOutputs(that._outVarnames);
_inVarnames = that._inVarnames;
_outVarnames = that._outVarnames;
-   _inVarReuse = new HashMap<>(that._inVarReuse);
+   _inVarReuse = new LocalVariableMap(that._inVarReuse);
_dmlconf = that._dmlconf;
_cconf = that._cconf;
}
@@ -115,7 +115,7 @@ public class PreparedScript implements ConfigurableAPI
Collections.addAll(_inVarnames, inputs);
_outVarnames = new HashSet<>();
Collections.addAll(_outVarnames, outputs);
-   _inVarReuse = new HashMap<>();
+   _inVarReuse = new LocalVariableMap();

//attach registered outputs (for dynamic recompile)
_vars.setRegisteredOutputs(_outVarnames);
@@ -415,7 +415,16 @@ public class PreparedScript implements ConfigurableAPI
public void clearParameters() {
_vars.removeAll();
}
-   
+
+   /**
+* Remove all references to pinned variables from this script.
+* Note: this *does not* remove the underlying data. It merely
+* removes a reference to it from this prepared script. This is
+* useful if you want to maintain an independent cache of weights
+* and allow the JVM to garbage collect under memory pressure.
+*/
+   public void clearPinnedData() { _inVarReuse.removeAll(); }
+
/**
 * Executes the prepared script over the bound inputs, creating the
 * result variables according to bound and registered outputs.
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java
index 92eabbd..1ac47b7 100644
--- 
a/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java
+++ 
b/src/main/java/org/apache/sysds/runtime/controlprogram/LocalVariableMap.java
@@ -94,6 +94,10 @@ public class LocalVariableMap implements Cloneable
localMap.putAll(vals);
}
 
+   public void putAll(LocalVariableMap vars) {
+   putAll(vars.localMap);
+   }
+
public Data remove( String name ) {
return localMap.remove( name );
}



[systemml] branch master updated: [MINOR] Fix unnecessarily detailed test output in tests/functions/misc

2020-04-09 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 395b5d0  [MINOR] Fix unnecessarily detailed test output in 
tests/functions/misc
395b5d0 is described below

commit 395b5d08b75ac0cd71421ac83f7792ff02e2086a
Author: Matthias Boehm 
AuthorDate: Thu Apr 9 19:25:37 2020 +0200

[MINOR] Fix unnecessarily detailed test output in tests/functions/misc
---
 .../functions/misc/ConditionalValidateTest.java| 25 +++---
 .../test/functions/misc/ExistsVariableTest.java|  2 +-
 .../functions/misc/FunctionInExpressionTest.java   |  2 +-
 .../test/functions/misc/FunctionInliningTest.java  |  2 +-
 .../test/functions/misc/FunctionNotFoundTest.java  |  2 +-
 ...nstantFoldingScalarVariablePropagationTest.java |  2 +-
 .../test/functions/misc/IPANnzPropagationTest.java |  2 +-
 .../test/functions/misc/ListAndStructTest.java |  2 +-
 .../sysds/test/functions/misc/PrintMatrixTest.java |  8 ++-
 .../misc/RemoveUnnecessaryCTableTest.java  |  2 +-
 .../test/functions/misc/RewriteListTsmmCVTest.java |  2 +-
 .../misc/RewriteSlicedMatrixMultTest.java  |  4 ++--
 .../test/functions/misc/SizePropagationTest.java   |  2 +-
 .../functions/misc/ZeroRowsColsMatrixTest.java |  2 +-
 14 files changed, 27 insertions(+), 32 deletions(-)

diff --git 
a/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java
 
b/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java
index 7a54652..71282a7 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/misc/ConditionalValidateTest.java
@@ -105,14 +105,14 @@ public class ConditionalValidateTest extends 
AutomatedTestBase
String TEST_NAME = testName;
 
try
-   {   
+   {
TestConfiguration config = 
getTestConfiguration(TEST_NAME);
loadTestConfiguration(config);
 
-   String HOME = SCRIPT_DIR + TEST_DIR;
-   String input = input("Y");
+   String HOME = SCRIPT_DIR + TEST_DIR;
+   String input = input("Y");

-   fullDMLScriptName = HOME + TEST_NAME + ".dml";
+   fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[]{"-args", input };

//write input
@@ -124,16 +124,15 @@ public class ConditionalValidateTest extends 
AutomatedTestBase

HDFSTool.writeMetaDataFile(input+(fileExists?"":"b")+".mtd", ValueType.FP64, 
mc, OutputInfo.TextCellOutputInfo);

//run tests
-   runTest(true, exceptionExpected, DMLException.class, -1);
-   
-   //cleanup
-   HDFSTool.deleteFileIfExistOnHDFS(input);
-   HDFSTool.deleteFileIfExistOnHDFS(input+"b");
-   HDFSTool.deleteFileIfExistOnHDFS(input+".mtd");
-   HDFSTool.deleteFileIfExistOnHDFS(input+"b.mtd");
+   runTest(true, exceptionExpected, DMLException.class, 
-1);
+
+   //cleanup
+   HDFSTool.deleteFileIfExistOnHDFS(input);
+   HDFSTool.deleteFileIfExistOnHDFS(input+"b");
+   HDFSTool.deleteFileIfExistOnHDFS(input+".mtd");
+   HDFSTool.deleteFileIfExistOnHDFS(input+"b.mtd");
}
-   catch(Exception ex)
-   {
+   catch(Exception ex) {
throw new RuntimeException(ex);
}
}
diff --git 
a/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java 
b/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java
index 8036dd9..ff32dbe 100644
--- a/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/misc/ExistsVariableTest.java
@@ -68,7 +68,7 @@ public class ExistsVariableTest extends AutomatedTestBase
String HOME = SCRIPT_DIR + TEST_DIR;
String param = pos ? "1" : "0";
fullDMLScriptName = HOME + testName + ".dml";
-   programArgs = new String[]{"-explain", "-stats", "-args", 
param, output("R") };
+   programArgs = new String[]{"-stats", "-args", param, 
output("R") };
   

[systemml] branch master updated: [SYSTEMDS-331] Extended lineage-based reuse (caching of scalars)

2020-04-09 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 6c94556  [SYSTEMDS-331] Extended lineage-based reuse (caching of 
scalars)
6c94556 is described below

commit 6c9455678b2c38a41db741270a86812b05ee77ca
Author: arnabp 
AuthorDate: Thu Apr 9 17:45:12 2020 +0200

[SYSTEMDS-331] Extended lineage-based reuse (caching of scalars)

- This patch contains lineage caching support for scalar objects. This
enables instruction level and multi-level reuse of
operations/functions/statementblocks producing at least one scalar
output. This patch improves multi-level cache hits.
- Furthermore, this adds a new option `-reuse_multilevel` to enable
multi-level reuse.
- This patch also fixes few bugs and enhances reusable instructions
list.
- Additional fix for lineage cache reset to avoid endless loops on
eviction in sequences of tests

Closes #876.
---
 docs/Tasks.txt |  12 +
 src/main/java/org/apache/sysds/api/DMLOptions.java |   2 +
 .../runtime/controlprogram/BasicProgramBlock.java  |   5 +-
 .../runtime/instructions/cp/BooleanObject.java |   5 +
 .../runtime/instructions/cp/DoubleObject.java  |   5 +
 .../sysds/runtime/instructions/cp/IntObject.java   |   5 +
 .../runtime/instructions/cp/ScalarObject.java  |   3 +
 .../runtime/instructions/cp/StringObject.java  |   5 +
 .../org/apache/sysds/runtime/lineage/Lineage.java  |   1 +
 .../apache/sysds/runtime/lineage/LineageCache.java | 255 ++---
 .../sysds/runtime/lineage/LineageCacheConfig.java  |   8 +-
 .../runtime/lineage/LineageCacheStatistics.java|   4 +-
 .../sysds/runtime/lineage/LineageCodegenItem.java  |   4 +
 .../apache/sysds/runtime/lineage/LineageMap.java   |   8 +-
 .../sysds/runtime/lineage/LineageParser.java   |   6 +-
 .../sysds/runtime/lineage/LineageRewriteReuse.java |  77 ++-
 .../sysds/runtime/lineage/LineageTokenizer.java|   2 +-
 .../test/functions/lineage/FullReuseTest.java  |   7 +
 .../functions/lineage/FunctionFullReuseTest.java   |   2 +-
 .../test/functions/lineage/SBFullReuseTest.java|   2 +-
 src/test/scripts/functions/lineage/FullReuse4.dml  |  34 +++
 21 files changed, 296 insertions(+), 156 deletions(-)

diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 5fd6749..42741da 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -236,6 +236,18 @@ SYSTEMDS-320 Merge SystemDS into Apache SystemML   
   OK
  * 321 Merge histories of SystemDS and SystemML   OK
  * 322 Change global package namesOK
  * 323 Fix licenses and notice file   OK 
+ 
+SYSTEMDS-330 Lineage Tracing, Reuse and Integration
+ * 331 Cache and reuse scalar outputs (instruction and multi-level)   OK
+ * 332 Parfor integration with multi-level reuse 
+ * 333 Use exact execution time for cost based eviction
+ 
+SYSTEMDS-340 Compiler Assisted Lineage Caching and Reuse
+ * 341 Finalize unmarking of loop dependent operations
+ * 342 Mark functions as last-use to enable early eviction
+ * 343 Identify equal last level HOPs to ensure SB-level reuse
+ * 344 Unmark functions/SBs containing non-determinism for caching
+ * 345 Compiler assisted cache configuration
 
 Others:
  * Break append instruction to cbind and rbind 
diff --git a/src/main/java/org/apache/sysds/api/DMLOptions.java 
b/src/main/java/org/apache/sysds/api/DMLOptions.java
index 7eca8ab..b9972a3 100644
--- a/src/main/java/org/apache/sysds/api/DMLOptions.java
+++ b/src/main/java/org/apache/sysds/api/DMLOptions.java
@@ -120,6 +120,8 @@ public class DMLOptions {
dmlOptions.linReuseType 
= ReuseCacheType.REUSE_FULL;
else if 
(lineageType.equalsIgnoreCase("reuse_partial"))
dmlOptions.linReuseType 
= ReuseCacheType.REUSE_PARTIAL;
+   else if 
(lineageType.equalsIgnoreCase("reuse_multilevel"))
+   dmlOptions.linReuseType 
= ReuseCacheType.REUSE_MULTILEVEL;
else if 
(lineageType.equalsIgnoreCase("reuse_hybrid"))
dmlOptions.linReuseType 
= ReuseCacheType.REUSE_HYBRID;
else if 
(lineageType.equalsIgnoreCase("none"))
diff --git 
a/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java 
b/src/main/java/org/apache/sysds/runtime/controlprogram/BasicProgramBlock.java
index 2a9e281..1f52a75 100644
--- 
a/src/main/java

[systemml] branch master updated: [MINOR] Fix typo internal builtin function names (sigmoid)

2020-04-01 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 809c025  [MINOR] Fix typo internal builtin function names (sigmoid)
809c025 is described below

commit 809c02580570e136e2d150400abf184cbff01a74
Author: Matthias Boehm 
AuthorDate: Wed Apr 1 19:18:42 2020 +0200

[MINOR] Fix typo internal builtin function names (sigmoid)
---
 src/main/java/org/apache/sysds/common/Builtins.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index d260d35..7220198 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -154,7 +154,7 @@ public enum Builtins {
SAMPLE("sample", false),
SD("sd", false),
SEQ("seq", false),
-   SIGMOD("sigmoid", true),   // 1 / (1 + exp(-X))
+   SIGMOID("sigmoid", true),   // 1 / (1 + exp(-X))
SIGN("sign", false),
SIN("sin", false),
SINH("sinh", false),



[systemml] branch master updated: [MINOR] Updated readme w/ correct travis badge

2020-03-28 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new ef8b551  [MINOR] Updated readme w/ correct travis badge
ef8b551 is described below

commit ef8b551103085b73d0b471db9df6c5c0748f7d94
Author: Sebastian 
AuthorDate: Sat Mar 28 20:27:26 2020 +0100

[MINOR] Updated readme w/ correct travis badge

Closes #870.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ee23ed6..1ccb7b4 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ limitations under the License.
 
 ## Status
 
-[![Build 
Status](https://travis-ci.com/apache/systemml.svg?branch=master)](https://travis-ci.com/apache/systemml)
+[![Build 
Status](https://travis-ci.org/apache/systemml.svg?branch=master)](https://travis-ci.org/apache/systemml)
 
[![License](https://img.shields.io/badge/License-Apache%202.0-gre.svg)](https://opensource.org/licenses/Apache-2.0)
 
 ![Build](https://github.com/apache/systemml/workflows/Build/badge.svg)



[systemml] branch master updated: [SYSTEMDS-301] Improved github workflows (cache dependencies)

2020-03-28 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new dab0991  [SYSTEMDS-301] Improved github workflows (cache dependencies)
dab0991 is described below

commit dab09916436c9518afa3cf8da572db2bde32207a
Author: Sebastian 
AuthorDate: Sat Mar 28 20:22:12 2020 +0100

[SYSTEMDS-301] Improved github workflows (cache dependencies)

Closes #869.
---
 .github/workflows/applicationTests.yml |  3 ++-
 .github/workflows/build.yml| 16 
 .github/workflows/componentTests.yml   | 16 
 .github/workflows/documentation.yml| 28 +++-
 .github/workflows/functionsTests.yml   |  3 ++-
 .github/workflows/python.yml   | 20 ++--
 6 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/applicationTests.yml 
b/.github/workflows/applicationTests.yml
index e4efc2c..652b31a 100644
--- a/.github/workflows/applicationTests.yml
+++ b/.github/workflows/applicationTests.yml
@@ -39,7 +39,8 @@ jobs:
 os: [ubuntu-latest]
 name:  Ap Test ${{ matrix.tests }} 
 steps:
-- uses: actions/checkout@v2
+- name: Checkout Repository
+  uses: actions/checkout@v2
 
 - name: Run all tests starting with "${{ matrix.tests }}"
   uses: ./.github/action/
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c449040..0ae7f82 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -30,14 +30,22 @@ jobs:
   fail-fast: false
   matrix:
 os: [ubuntu-latest, macOS-latest, windows-latest]
-
 steps:
-- uses: actions/checkout@v2
+- name: Checkout Repository
+  uses: actions/checkout@v2
 
-- name: Set up JDK 1.8
+- name: Setup Java 1.8
   uses: actions/setup-java@v1
   with:
 java-version: 1.8
 
-- name: Build with Maven
+- name: Cache Maven Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.m2/repository
+key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+restore-keys: |
+  ${{ runner.os }}-maven-
+
+- name: Build
   run: mvn package
diff --git a/.github/workflows/componentTests.yml 
b/.github/workflows/componentTests.yml
index 838b662..0cc934c 100644
--- a/.github/workflows/componentTests.yml
+++ b/.github/workflows/componentTests.yml
@@ -30,15 +30,23 @@ jobs:
   fail-fast: false
   matrix:
 os: [ubuntu-latest]
-java: [ 1.8 ]
 name: Component Tests ${{ matrix.os }}
 steps:
-- uses: actions/checkout@v2
+- name: Checkout Repository
+  uses: actions/checkout@v2
 
-- name: Setup Java
+- name: Setup Java 1.8
   uses: actions/setup-java@v1
   with:
-java-version: ${{ matrix.java }}
+java-version: 1.8
+
+- name: Cache Maven Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.m2/repository
+key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+restore-keys: |
+  ${{ runner.os }}-maven-
 
 - name: Maven clean compile & test-compile
   run: mvn clean compile test-compile
diff --git a/.github/workflows/documentation.yml 
b/.github/workflows/documentation.yml
index 96e4881..201210f 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -31,13 +31,22 @@ jobs:
 runs-on: ubuntu-latest
 name: Documentation Java
 steps:
-- uses: actions/checkout@v2
+- name: Checkout Repository
+  uses: actions/checkout@v2
 
-- name: Setup Java
+- name: Setup Java 1.8
   uses: actions/setup-java@v1
   with:
-java-version:  1.8
+java-version: 1.8
 
+- name: Cache Maven Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.m2/repository
+key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+restore-keys: |
+  ${{ runner.os }}-maven-
+  
 - name: Make Documentation SystemDS Java
   run: mvn -P distribution package
 
@@ -51,14 +60,23 @@ jobs:
 runs-on: ubuntu-latest
 name: Documentation Python
 steps:
-- uses: actions/checkout@v2
+- name: Checkout Repository
+  uses: actions/checkout@v2
 
 - name: Setup Python
   uses: actions/setup-python@v1
   with:
 python-version: 3.7
 architecture: 'x64'
-
+
+- name: Cache Pip Dependencies
+  uses: actions/cache@v1
+  with:
+path: ~/.cache/pip
+key: ${{ runner.os }}-pip-docs-${{ 
hashFiles('src/main/python/docs/requires-docs.txt') }}
+restore-keys: |
+  ${{ runner.os }}-pip-docs-
+
 - name: Install Dependencies
   run: |
 cd src/main/python/docs
diff --git a/.github/workflows/functionsTests.yml 
b/.gi

[systemml] branch master updated: [MINOR] Fix mlcontext function tests (wrong url)

2020-03-27 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 2cc922c  [MINOR] Fix mlcontext function tests (wrong url)
2cc922c is described below

commit 2cc922c856967d75afac09f80fd4df73a620584a
Author: Matthias Boehm 
AuthorDate: Sat Mar 28 00:28:46 2020 +0100

[MINOR] Fix mlcontext function tests (wrong url)
---
 .../java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java 
b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java
index fbc90c1..c2add4d 100644
--- a/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/mlcontext/MLContextTest.java
@@ -158,7 +158,7 @@ public class MLContextTest extends MLContextTestBase {
@Test
public void testCreateDMLScriptBasedOnURL() throws 
MalformedURLException {
System.out.println("MLContextTest - create DML script based on 
URL");
-   String urlString = 
"https://raw.githubusercontent.com/apache/systemml/systemds/master/src/test/scripts/applications/hits/HITS.dml";
+   String urlString = 
"https://raw.githubusercontent.com/apache/systemml/master/src/test/scripts/applications/hits/HITS.dml";
URL url = new URL(urlString);
Script script = dmlFromUrl(url);
String expectedContent = "Licensed to the Apache Software 
Foundation";



[systemml] branch master updated: [SYSTEMML-2533] Fix named arguments in MNIST LeNet example script

2020-03-01 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new e2b9858  [SYSTEMML-2533] Fix named arguments in MNIST LeNet example 
script
e2b9858 is described below

commit e2b985807c485b3c3f1b63e2926a2f5478441641
Author: Nathan Kan 
AuthorDate: Sun Mar 1 22:26:31 2020 +0100

[SYSTEMML-2533] Fix named arguments in MNIST LeNet example script

Closes #866.
---
 scripts/nn/examples/mnist_lenet.dml | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/scripts/nn/examples/mnist_lenet.dml 
b/scripts/nn/examples/mnist_lenet.dml
index 57b8ba6..484219d 100644
--- a/scripts/nn/examples/mnist_lenet.dml
+++ b/scripts/nn/examples/mnist_lenet.dml
@@ -118,13 +118,13 @@ train = function(matrix[double] X, matrix[double] Y,
 stride, stride, pad, pad)
   outr1 = relu::forward(outc1)
   [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 
Hf=2, Wf=2,
-strideh=2, stridew=2, 
pad=0, pad=0)
+strideh=2, stridew=2, 
padh=0, padw=0)
   ## layer 2: conv2 -> relu2 -> pool2
   [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, 
Woutp1, Hf, Wf,
 stride, stride, pad, pad)
   outr2 = relu::forward(outc2)
   [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 
Hf=2, Wf=2,
-strideh=2, stridew=2, 
pad=0, pad=0)
+strideh=2, stridew=2, 
padh=0, padw=0)
   ## layer 3:  affine3 -> relu3 -> dropout
   outa3 = affine::forward(outp2, W3, b3)
   outr3 = relu::forward(outa3)
@@ -166,13 +166,13 @@ train = function(matrix[double] X, matrix[double] Y,
   [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
   ## layer 2: conv2 -> relu2 -> pool2
   doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, 
Woutc2, Hf=2, Wf=2,
-strideh=2, stridew=2, pad=0, pad=0)
+strideh=2, stridew=2, padh=0, padw=0)
   doutc2 = relu::backward(doutr2, outc2)
   [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, 
b2, F1,
 Houtp1, Woutp1, Hf, Wf, stride, 
stride, pad, pad)
   ## layer 1: conv1 -> relu1 -> pool1
   doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, 
Woutc1, Hf=2, Wf=2,
-strideh=2, stridew=2, pad=0, pad=0)
+strideh=2, stridew=2, padh=0, padw=0)
   doutc1 = relu::backward(doutr1, outc1)
   [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, 
W1, b1, C, Hin, Win,
   Hf, Wf, stride, stride, pad, pad)
@@ -264,13 +264,13 @@ predict = function(matrix[double] X, int C, int Hin, int 
Win,
   pad, pad)
 outr1 = relu::forward(outc1)
 [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, 
Hf=2, Wf=2,
-  strideh=2, stridew=2, pad=0, 
pad=0)
+  strideh=2, stridew=2, 
padh=0, padw=0)
 ## layer 2: conv2 -> relu2 -> pool2
 [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, 
Woutp1, Hf, Wf,
   stride, stride, pad, pad)
 outr2 = relu::forward(outc2)
 [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, 
Hf=2, Wf=2,
-  strideh=2, stridew=2, pad=0, 
pad=0)
+  strideh=2, stridew=2, 
padh=0, padw=0)
 ## layer 3:  affine3 -> relu3
 outa3 = affine::forward(outp2, W3, b3)
 outr3 = relu::forward(outa3)
@@ -328,4 +328,3 @@ generate_dummy_data = function()
   classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
   Y = table(seq(1, N), classes)  # one-hot encoding
 }
-



[systemml] branch master updated: [SYSTEMML-2530] Fix wrong integer casting for negative numbers

2019-08-09 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 4422a05  [SYSTEMML-2530] Fix wrong integer casting for negative numbers
4422a05 is described below

commit 4422a05325b03e0b656302774504ca9763e72c2a
Author: Matthias Boehm 
AuthorDate: Fri Aug 9 16:20:18 2019 +0200

[SYSTEMML-2530] Fix wrong integer casting for negative numbers

This patch backports SYSTEMDS-106 as it resolves an issue of incorrect
results that are so subtle that they might go unnoticed.
---
 src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java 
b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index 42b519b..f6c1182 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -323,11 +323,13 @@ public class UtilFunctions
}

public static int toInt( double val ) {
-   return (int) Math.floor( val + DOUBLE_EPS );
+   return (int) (Math.signum(val)
+   * Math.floor(Math.abs(val) + DOUBLE_EPS));
}

public static long toLong( double val ) {
-   return (long) Math.floor( val + DOUBLE_EPS );
+   return (long) (Math.signum(val)
+   * Math.floor(Math.abs(val) + DOUBLE_EPS));
}

public static int toInt(Object obj) {



[systemml] 02/02: [SYSTEMML-2521] New rewrite for sparsity-aware matrix product chains

2019-03-17 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit f42dfb358ac24b6633d01dd181b51d458cd1bbe7
Author: Matthias Boehm 
AuthorDate: Sun Mar 17 20:23:33 2019 +0100

[SYSTEMML-2521] New rewrite for sparsity-aware matrix product chains

This patch introduces a new dynamic rewrite for sparsity-aware matrix
multiplication chain optimization. For estimating the sparsity of
intermediates, we use the existing MNC sparsity estimator.

While this rewrite does find the optimal plan in case of perfect
estimates, it currently requires access to all input matrices of the mm
chain and these inputs need to fit into CP memory. Accordingly, this
rewrite is still disabled by default.
---
 .../sysml/hops/estim/EstimatorMatrixHistogram.java |   2 +-
 .../sysml/hops/rewrite/ProgramRewriteStatus.java   |  16 ++-
 .../RewriteMatrixMultChainOptimization.java|  88 ++--
 .../RewriteMatrixMultChainOptimizationSparse.java  | 157 +
 4 files changed, 215 insertions(+), 48 deletions(-)

diff --git 
a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
index 5f1abff..b079a7e 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
@@ -59,7 +59,7 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
return estim(root, true);
}

-   private MatrixCharacteristics estim(MMNode root, boolean topLevel) {
+   public MatrixCharacteristics estim(MMNode root, boolean topLevel) {
//NOTE: not estimateInputs due to handling of topLevel
MatrixHistogram h1 = getCachedSynopsis(root.getLeft());
MatrixHistogram h2 = getCachedSynopsis(root.getRight());
diff --git 
a/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java 
b/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java
index 552a598..a622948 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/ProgramRewriteStatus.java
@@ -19,9 +19,10 @@
 
 package org.apache.sysml.hops.rewrite;
 
+import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
+
 public class ProgramRewriteStatus 
 {
-   
//status of applied rewrites
private boolean _rmBranches = false; //removed branches
private int _blkSize = -1;
@@ -29,14 +30,19 @@ public class ProgramRewriteStatus

//current context
private boolean _inParforCtx = false;
+   private LocalVariableMap _vars = null;

-   public ProgramRewriteStatus()
-   {
+   public ProgramRewriteStatus() {
_rmBranches = false;
_inParforCtx = false;
_injectCheckpoints = false;
}

+   public ProgramRewriteStatus(LocalVariableMap vars) {
+   this();
+   _vars = vars;
+   }
+   
public void setRemovedBranches(){
_rmBranches = true;
}
@@ -68,4 +74,8 @@ public class ProgramRewriteStatus
public boolean getInjectedCheckpoints(){
return _injectCheckpoints;
}
+   
+   public LocalVariableMap getVariables() {
+   return _vars;
+   }
 }
diff --git 
a/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java
 
b/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java
index 91033c4..cdb1e12 100644
--- 
a/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java
+++ 
b/src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimization.java
@@ -35,14 +35,16 @@ import org.apache.sysml.utils.Explain;
 
 /**
  * Rule: Determine the optimal order of execution for a chain of
- * matrix multiplications Solution: Classic Dynamic Programming
- * Approach Currently, the approach based only on matrix dimensions
+ * matrix multiplications 
+ * 
+ * Solution: Classic Dynamic Programming
+ * Approach: Currently, the approach based only on matrix dimensions
  * Goal: To reduce the number of computations in the run-time
  * (map-reduce) layer
  */
 public class RewriteMatrixMultChainOptimization extends HopRewriteRule
 {
-   private static final Log LOG = 
LogFactory.getLog(RewriteMatrixMultChainOptimization.class.getName());
+   protected static final Log LOG = 
LogFactory.getLog(RewriteMatrixMultChainOptimization.class.getName());
private static final boolean LDEBUG = false;

static {
@@ -61,7 +63,7 @@ public class RewriteMatrixMultChainOptimization extends 
HopRewriteRule
 
// Find the optimal order

[systemml] branch master updated (881f606 -> f42dfb3)

2019-03-17 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git.


from 881f606  [MINOR] Provide a more informative error message when the 
dimensions don't match during the validate phase
 new 4a38a47  [MINOR] Fix unnecessary warnings (unnecessary imports)
 new f42dfb3  [SYSTEMML-2521] New rewrite for sparsity-aware matrix product 
chains

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/sysml/api/ScriptExecutorUtils.java  |   1 -
 .../apache/sysml/api/mlcontext/ScriptExecutor.java |   1 -
 .../java/org/apache/sysml/hops/FunctionOp.java |   1 -
 .../sysml/hops/estim/EstimatorMatrixHistogram.java |   2 +-
 .../sysml/hops/rewrite/ProgramRewriteStatus.java   |  16 ++-
 .../RewriteMatrixMultChainOptimization.java|  88 ++--
 .../RewriteMatrixMultChainOptimizationSparse.java  | 157 +
 .../java/org/apache/sysml/utils/Statistics.java|   1 -
 .../org/apache/sysml/test/gpu/LstmCPUTest.java |   2 -
 .../functions/unary/matrix/AbsTest.java|   2 -
 .../functions/unary/matrix/NegationTest.java   |   2 -
 .../functions/unary/matrix/SinTest.java|   2 -
 .../functions/unary/matrix/TanTest.java|   2 -
 13 files changed, 215 insertions(+), 62 deletions(-)
 create mode 100644 
src/main/java/org/apache/sysml/hops/rewrite/RewriteMatrixMultChainOptimizationSparse.java



[systemml] 01/02: [MINOR] Fix unnecessary warnings (unnecessary imports)

2019-03-17 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit 4a38a4789302741965f49b4dd559a7078d94eb69
Author: Matthias Boehm 
AuthorDate: Sun Mar 17 12:09:33 2019 +0100

[MINOR] Fix unnecessary warnings (unnecessary imports)
---
 src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java | 1 -
 src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java| 1 -
 src/main/java/org/apache/sysml/hops/FunctionOp.java | 1 -
 src/main/java/org/apache/sysml/utils/Statistics.java| 1 -
 src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java| 2 --
 .../apache/sysml/test/integration/functions/unary/matrix/AbsTest.java   | 2 --
 .../sysml/test/integration/functions/unary/matrix/NegationTest.java | 2 --
 .../apache/sysml/test/integration/functions/unary/matrix/SinTest.java   | 2 --
 .../apache/sysml/test/integration/functions/unary/matrix/TanTest.java   | 2 --
 9 files changed, 14 deletions(-)

diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 0d072e5..c9d1a5d 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -19,7 +19,6 @@
 
 package org.apache.sysml.api;
 
-import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java 
b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
index 7bda306..8ecd962 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
@@ -38,7 +38,6 @@ import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.conf.DMLOptions;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.parser.DMLProgram;
-import org.apache.sysml.parser.DMLTranslator;
 import org.apache.sysml.parser.ParseException;
 import org.apache.sysml.parser.ParserFactory;
 import org.apache.sysml.parser.ParserWrapper;
diff --git a/src/main/java/org/apache/sysml/hops/FunctionOp.java 
b/src/main/java/org/apache/sysml/hops/FunctionOp.java
index dedbad6..534c0a0 100644
--- a/src/main/java/org/apache/sysml/hops/FunctionOp.java
+++ b/src/main/java/org/apache/sysml/hops/FunctionOp.java
@@ -22,7 +22,6 @@ package org.apache.sysml.hops;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.lops.FunctionCallCP;
 import org.apache.sysml.lops.FunctionCallCPSingle;
 import org.apache.sysml.lops.Lop;
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index 656de32..a2afae0 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -32,7 +32,6 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.DoubleAdder;
 import java.util.concurrent.atomic.LongAdder;
 
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.OptimizerUtils;
diff --git a/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java 
b/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java
index 5c93bca..4c4ab74 100644
--- a/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/LstmCPUTest.java
@@ -23,8 +23,6 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 
-import org.apache.sysml.runtime.instructions.gpu.DnnGPUInstruction;
-import 
org.apache.sysml.runtime.instructions.gpu.DnnGPUInstruction.LstmOperator;
 import org.apache.sysml.test.utils.TestUtils;
 import org.junit.Test;
 
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java
index a3027d6..6b61066 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/AbsTest.java
@@ -20,8 +20,6 @@
 package org.apache.sysml.test.integration.functions.unary.matrix;
 
 import org.junit.Test;
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
 import org.apache.sysml.test.integration.AutomatedTestBase;
 import org.apache.sysml.test.integration.TestConfiguration;
 
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/NegationTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/unary/matrix/NegationTest.java
index c2613c2..6b2000a 100644

[systemml] branch master updated: [SYSTEMML-2511] Fix bitset sparsity estimation on large input data

2019-02-19 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new fe83cad  [SYSTEMML-2511] Fix bitset sparsity estimation on large input 
data
fe83cad is described below

commit fe83cad3e13d049eacea19662b1a4e3b1704cb6d
Author: Matthias Boehm 
AuthorDate: Tue Feb 19 15:08:08 2019 +0100

[SYSTEMML-2511] Fix bitset sparsity estimation on large input data

This patch fixes a corruption introduced by previous refactoring that
led to always allocating a BitsetMatrix1 (w/ linearized long array)
independent of the input size, leading to incorrect sketches and class
cast exceptions on subsequent estimation.
---
 src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index e26dd49..cf9f627 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -88,7 +88,7 @@ public class EstimatorBitsetMM extends SparsityEstimator
return null;
//ensure synopsis is properly cached and reused
if( node.isLeaf() && node.getSynopsis() == null )
-   node.setSynopsis(new BitsetMatrix1(node.getData()));
+   node.setSynopsis(createBitset(node.getData()));
else if( !node.isLeaf() )
estim(node); //recursively obtain synopsis
return (BitsetMatrix) node.getSynopsis();



[systemml] branch master updated: [SYSTEMML-2468] Extended MNC exact propagation of sketch counts

2019-02-19 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 01da6a6  [SYSTEMML-2468] Extended MNC exact propagation of sketch 
counts
01da6a6 is described below

commit 01da6a6fef8bbfe68734da64604943547f45ae79
Author: Matthias Boehm 
AuthorDate: Tue Feb 19 14:13:04 2019 +0100

[SYSTEMML-2468] Extended MNC exact propagation of sketch counts

This patch extends the MNC sketch propagation by a special case, where
we can exactly infer (in an inexpensive manner) the output column count
histogram, if the lhs has one non-zero per row and the rhs is leaf node
and sparse. However, initial experiments were not fully conclusive and
hence, this is still disabled.
---
 .../sysml/hops/estim/EstimatorMatrixHistogram.java | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git 
a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
index a82feed..5f1abff 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
@@ -42,6 +42,7 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
 {
//internal configurations
private static final boolean DEFAULT_USE_EXTENDED = true;
+   private static final boolean ADVANCED_SKETCH_PROP = false;

private final boolean _useExtended;

@@ -71,6 +72,7 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
}

//sketch propagation for intermediates other than final result
+   h2.setData(root.getRight().isLeaf() ? root.getRight().getData() 
: null);
MatrixHistogram outMap = MatrixHistogram
.deriveOutputHistogram(h1, h2, ret, root.getOp(), 
root.getMisc());
root.setSynopsis(outMap);
@@ -227,6 +229,7 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
private final int rNonEmpty, cNonEmpty; //number of non-empty 
rows/cols (w/ empty is nnz=0)
private final int rNdiv2, cNdiv2;   //number of rows/cols 
with nnz > #cols/2 and #rows/2
private boolean fullDiag;   //true if there exists 
a full diagonal of nonzeros
+   private MatrixBlock _data = null; //optional leaf data

public MatrixHistogram(MatrixBlock in, boolean useExcepts) {
// 1) allocate basic synopsis
@@ -348,6 +351,10 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
IntStream.range(0, getCols()).mapToLong(i-> 
cNnz[i]).sum();
}

+   public void setData(MatrixBlock mb) {
+   _data = mb;
+   }
+   
public static MatrixHistogram 
deriveOutputHistogram(MatrixHistogram h1, MatrixHistogram h2, double spOut, 
OpCode op, long[] misc) {
switch(op) {
case MM:  return deriveMMHistogram(h1, h2, 
spOut);
@@ -396,6 +403,7 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
}
}

+   @SuppressWarnings("unused")
private static MatrixHistogram 
deriveMMHistogram(MatrixHistogram h1, MatrixHistogram h2, double spOut) {
//exact propagation if lhs or rhs full diag
if( h1.fullDiag ) return h2;
@@ -416,9 +424,25 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
rMaxNnz = Math.max(rMaxNnz, rNnz[i]);
}
int[] cNnz = new int[h2.getCols()];
-   for( int i=0; i

[systemml] branch master updated: [SYSTEMML-2509] Fix transform binning metadata frame allocation

2019-02-13 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 0c23e1f  [SYSTEMML-2509] Fix transform binning metadata frame 
allocation
0c23e1f is described below

commit 0c23e1fa194d37d67e0490c2894519b0ea6720e4
Author: Matthias Boehm 
AuthorDate: Wed Feb 13 11:52:19 2019 +0100

[SYSTEMML-2509] Fix transform binning metadata frame allocation

This patch fixes special cases where binning is the only transformation
or where it requires the most metadata rows (e.g., more than recoding),
for which cases so far the output metadata frame was not properly
allocated.
---
 .../org/apache/sysml/runtime/matrix/data/FrameBlock.java | 12 ++--
 .../apache/sysml/runtime/transform/encode/EncoderBin.java|  6 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index 87c6aca..ef16feb 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -248,8 +248,16 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
 */
public void ensureAllocatedColumns(int numRows) {
//early abort if already allocated
-   if( _coldata != null && _schema.length == _coldata.length ) 
-   return; 
+   if( _coldata != null && _schema.length == _coldata.length ) {
+   //handle special case that too few rows allocated
+   if( _numRows < numRows ) {
+   String[] tmp = new String[getNumColumns()];
+   int len = numRows - _numRows;
+   for(int i=0; i

[systemml] branch master updated: [SYSTEMML-2289] Additional sampling-based sparsity estimator baseline

2019-02-09 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new e443eff  [SYSTEMML-2289] Additional sampling-based sparsity estimator 
baseline
e443eff is described below

commit e443eff949b48f45d1453a6cbf483b87a612c307
Author: Matthias Boehm 
AuthorDate: Sat Feb 9 15:38:21 2019 +0100

[SYSTEMML-2289] Additional sampling-based sparsity estimator baseline

This patch adds an additional baseline sparsity estimator based on
sampling and hashing, which implements the approach described in

Rasmus Resen Amossen, Andrea Campagna, Rasmus Pagh: Better Size
Estimation for Sparse Matrix Products. Algorithmica 69(3): 741-757
(2014)

Credit: We're grateful to the authors who shared their code. This 
implementation improves upon it by fitting the SparsityEstimator API, supporting 
binary matrix products, avoiding unnecessary file access, using Well1024a for 
seeding local RNGs, and generally improving performance.
---
 .../apache/sysml/hops/estim/EstimatorSample.java   |   2 +-
 .../apache/sysml/hops/estim/EstimatorSampleRa.java | 268 +
 .../functions/estim/OuterProductTest.java  |  26 +-
 .../functions/estim/SelfProductTest.java   |  21 ++
 .../functions/estim/SquaredProductTest.java|  85 ++-
 5 files changed, 398 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
index ec624f0..821aa73 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
@@ -56,7 +56,7 @@ public class EstimatorSample extends SparsityEstimator
}

public EstimatorSample(double sampleFrac, boolean extended) {
-   if( sampleFrac < 0 || sampleFrac > 1.0 )
+   if( sampleFrac <= 0 || sampleFrac > 1.0 )
throw new DMLRuntimeException("Invalid sample fraction: 
"+sampleFrac);
_frac = sampleFrac;
_extended = extended;
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSampleRa.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorSampleRa.java
new file mode 100644
index 000..2e39d02
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSampleRa.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.hops.estim;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.math3.random.Well1024a;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDatagen;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Random;
+
+/**
+ * This estimator implements an approach based on row/column sampling
+ * 
+ * Rasmus Resen Amossen, Andrea Campagna, Rasmus Pagh:
+ * Better Size Estimation for Sparse Matrix Products. Algorithmica 69(3): 
741-757 (2014)
+ * 
+ * Credit: This code is based on the original implementation provided by the 
authors,
+ * modified to fit the SparsityEstimator API, support binary matrix products, 
avoid 
+ * unnecessary file access, use Well1024a for seeding local RNGs, and 
generally 
+ * improve performance.
+ */
+public class EstimatorSampleRa extends SparsityEstimator 
+{
+   private static final int RUNS = -1;
+   private static final double SAMPLE_FRACTION = 0.1; //10%
+   private static final double EPSILON = 0.05; // Multiplicative error
+   private static final double DELTA = 0.1; // Probability of error
+   private static final int K = -1;
+   
+   private fin

[systemml] branch master updated: [SYSTEMML-2509] Fix transform sequences of binning/dummy coding, tests

2019-02-04 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new a1bc419  [SYSTEMML-2509] Fix transform sequences of binning/dummy 
coding, tests
a1bc419 is described below

commit a1bc419b033f635273e7b91cf9a8dea329e03567
Author: Matthias Boehm 
AuthorDate: Mon Feb 4 20:56:29 2019 +0100

[SYSTEMML-2509] Fix transform sequences of binning/dummy coding, tests

This patch is a follow-up on fixing the binning support, specifically
for columns that are both binned and dummy coded. For an example
scenario of {recode: [1,2,7], bin: [3,8] dummycode: [3,8]}, we
incorrectly constructed the following composite encoder (which assumed
that all dummy coded columns need to be recoded):

CompositeEncoder(4):
-- EncoderRecode: [1, 2, 3, 7, 8]
-- EncoderPassThrough: [4, 5, 6, 9]
-- EncoderDummycode: [3, 8]
-- EncoderBin: [3, 8]

Now, we fixed that by only adding dummy coded columns that are not
binned to the recode list and bringing the basic encoders into the right
sequence (i.e., binning before dummy coding):

CompositeEncoder(4):
-- EncoderRecode: [1, 2, 7]
-- EncoderPassThrough: [4, 5, 6, 9]
-- EncoderBin: [3, 8]
-- EncoderDummycode: [3, 8]

Finally, this patch also includes the necessary tests to ensure such
issues don't occur in the future.
---
 .../runtime/transform/encode/EncoderFactory.java   | 15 ++--
 .../transform/TransformFrameEncodeApplyTest.java   | 88 +-
 .../input/homes3/homes.tfspec_binDummy.json|  6 ++
 .../input/homes3/homes.tfspec_binDummy2.json   |  6 ++
 4 files changed, 88 insertions(+), 27 deletions(-)

diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 3d2a100..1118ca6 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -56,19 +56,20 @@ public class EncoderFactory
List lencoders = new ArrayList<>();

//prepare basic id lists (recode, dummycode, 
pass-through)
-   //note: any dummycode column requires recode as 
preparation
List rcIDs = Arrays.asList(ArrayUtils.toObject(
TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_RECODE)));
List dcIDs = Arrays.asList(ArrayUtils.toObject(
TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_DUMMYCODE))); 
-   rcIDs = new 
ArrayList(CollectionUtils.union(rcIDs, dcIDs));
List binIDs = 
TfMetaUtils.parseBinningColIDs(jSpec, colnames);
+   //note: any dummycode column requires recode as 
preparation, unless it follows binning
+   rcIDs = new ArrayList(
+   CollectionUtils.union(rcIDs, 
CollectionUtils.subtract(dcIDs, binIDs)));
List ptIDs = new 
ArrayList(CollectionUtils.subtract(
-   
CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), 
binIDs)); 
+   
CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs));
List oIDs = Arrays.asList(ArrayUtils.toObject(
-   TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_OMIT))); 
+   TfMetaUtils.parseJsonIDList(jSpec, 
colnames, TfUtils.TXMETHOD_OMIT)));
List mvIDs = Arrays.asList(ArrayUtils.toObject(
-   
TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_IMPUTE))); 
+   
TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_IMPUTE)));

//create individual encoders
if( !rcIDs.isEmpty() ) {
@@ -79,10 +80,10 @@ public class EncoderFactory
if( !ptIDs.isEmpty() )
lencoders.add(new EncoderPassThrough(

ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen));
-   if( !dcIDs.isEmpty() )
-   lencoders.add(new EncoderDummycode(jSpec, 
colnames, schema.length));
if( !binIDs.isEmpty() )
lencoders.add(new EncoderBin(jSpec, colnames, 
schema.length));
+   if( !dcIDs.i

[systemml] branch master updated: [SYSTEMML-2509] Fix binning support in transformencode over frames

2019-01-26 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 3d09c4b  [SYSTEMML-2509] Fix binning support in transformencode over 
frames
3d09c4b is described below

commit 3d09c4b1621ef8f7db3841da1e7d36d64298aef1
Author: Matthias Boehm 
AuthorDate: Sat Jan 26 22:43:41 2019 +0100

[SYSTEMML-2509] Fix binning support in transformencode over frames

This patch fixes missing binning support in transformencode over frames.
So far, only the apply was working properly but no meta data was built,
which corrupted the returned output matrix and meta data. Now, local CP
operations work as intended but distributed operations and sequences of
binning/dummy-coding require additional work.
---
 .../sysml/runtime/transform/encode/EncoderBin.java | 114 +++--
 .../runtime/transform/encode/EncoderFactory.java   |   8 +-
 .../runtime/transform/encode/EncoderRecode.java|   2 +-
 .../sysml/runtime/transform/meta/TfMetaUtils.java  |   6 +-
 .../transform/TransformEncodeDecodeTest.java   |   1 -
 .../transform/TransformFrameEncodeApplyTest.java   |  16 ++-
 .../transform/TransformFrameEncodeApply.dml|   1 -
 7 files changed, 81 insertions(+), 67 deletions(-)

diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
index 016adb4..2f94003 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
@@ -35,7 +35,7 @@ import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class EncoderBin extends Encoder 
-{  
+{
private static final long serialVersionUID = 1917445005206076078L;
 
public static final String MIN_PREFIX = "min";
@@ -43,70 +43,36 @@ public class EncoderBin extends Encoder
public static final String NBINS_PREFIX = "nbins";
 
private int[] _numBins = null;
-   private double[] _min=null, _max=null;  // min and max among 
non-missing values

//frame transform-apply attributes
+   //TODO binMins is redundant and could be removed
private double[][] _binMins = null;
private double[][] _binMaxs = null;
-   
-   public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen) 
-   throws JSONException, IOException 
-   {
-   this(parsedSpec, colnames, clen, false);
-   }
 
-   public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen, 
boolean colsOnly) 
+   public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen) 
throws JSONException, IOException 
{
-   super( null, clen );
+   super( null, clen );
if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) )
return;

-   if( colsOnly ) {
-   List collist = 
TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
-   initColList(ArrayUtils.toPrimitive(collist.toArray(new 
Integer[0])));
-   }
-   else 
-   {
-   JSONObject obj = (JSONObject) 
parsedSpec.get(TfUtils.TXMETHOD_BIN); 
-   JSONArray attrs = (JSONArray) 
obj.get(TfUtils.JSON_ATTRS);
-   JSONArray nbins = (JSONArray) 
obj.get(TfUtils.JSON_NBINS);
-   initColList(attrs);
-   
-   _numBins = new int[attrs.size()];
-   for(int i=0; i < _numBins.length; i++)
-   _numBins[i] = 
UtilFunctions.toInt(nbins.get(i)); 
-   
-   // initialize internal transformation metadata
-   _min = new double[_colList.length];
-   Arrays.fill(_min, Double.POSITIVE_INFINITY);
-   _max = new double[_colList.length];
-   Arrays.fill(_max, Double.NEGATIVE_INFINITY);
-   }
-   }
-
-   public void prepare(String[] words, TfUtils agents) {
-   if ( !isApplicable() )
-   return;
+   //parse column names or column ids
+   List collist = 
TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
+   initColList(ArrayUtils.toPrimitive(collist.toArray(new 
Integer[0])));

-   for(int i=0; i <_colList.length; i++) {
-   int colID = _colList[i];
-   
-   String w = null;
- 

[systemml] branch master updated: [SYSTEMML-2468] Improved matrix histogram estimator for left-deep trees

2019-01-14 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new 14a79af  [SYSTEMML-2468] Improved matrix histogram estimator for 
left-deep trees
14a79af is described below

commit 14a79af677979f80f10328e67767822f6d43d2ff
Author: Matthias Boehm 
AuthorDate: Mon Jan 14 21:47:27 2019 +0100

[SYSTEMML-2468] Improved matrix histogram estimator for left-deep trees

This patch improves the matrix histogram sparsity estimator for
combinations of derived and exact sketches as they appear for example in
left-deep trees of matrix product chains. Specifically, we now use a
generalized code path that exploits extension vectors if they are
available and otherwise simply uses zero instead.
---
 .../apache/sysml/hops/estim/EstimatorMatrixHistogram.java | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git 
a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
index 57fc97e..a82feed 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorMatrixHistogram.java
@@ -168,8 +168,8 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
nnz += (long)h1.cNnz[j] * h2.rNnz[j];
}
//special case, with hybrid exact and approximate output
-   else if(h1.cNnz1e!=null && h2.rNnz1e != null) {
-   //note: normally h1.getRows()*h2.getCols() would define 
mnOut
+   else if(h1.cNnz1e!=null || h2.rNnz1e != null) {
+   //NOTE: normally h1.getRows()*h2.getCols() would define 
mnOut
//but by leveraging the knowledge of rows/cols w/ <=1 
nnz, we account
//that exact and approximate fractions touch different 
areas
long mnOut = _useExtended ?
@@ -177,12 +177,15 @@ public class EstimatorMatrixHistogram extends 
SparsityEstimator
(long)(h1.getRows()-h1.rN1) * 
(h2.getCols()-h2.cN1);
double spOutRest = 0;
for( int j=0; j

[systemml] branch master updated: [SYSTEMML-2486] Fix memoization of sparsity sketches for DAG leafs

2019-01-14 Thread mboehm7
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
 new b2fa1af  [SYSTEMML-2486] Fix memoization of sparsity sketches for DAG 
leafs
b2fa1af is described below

commit b2fa1af0c9919c0d703b1eddc32c3cd493e82bf2
Author: Matthias Boehm 
AuthorDate: Mon Jan 14 18:06:52 2019 +0100

[SYSTEMML-2486] Fix memoization of sparsity sketches for DAG leafs

This patch improves the performance of sparsity estimation for DAGs
where leaf nodes are reachable multiple times. So far, we redundantly
created the leaf sketches from the base data on each access. Instead, we
now properly memoize these sketches similar to inner nodes.
---
 .../apache/sysml/hops/estim/EstimatorBitsetMM.java | 20 +--
 .../sysml/hops/estim/EstimatorDensityMap.java  | 21 
 .../sysml/hops/estim/EstimatorMatrixHistogram.java | 23 --
 .../apache/sysml/hops/estim/SparsityEstimator.java |  7 ---
 4 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index 07d4cdc..e26dd49 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -46,12 +46,9 @@ public class EstimatorBitsetMM extends SparsityEstimator
 {
@Override
public MatrixCharacteristics estim(MMNode root) {
-   estimateInputs(root);
-   BitsetMatrix m1Map = !root.getLeft().isLeaf() ? (BitsetMatrix) 
root.getLeft().getSynopsis() :
-   new BitsetMatrix1(root.getLeft().getData());
-   BitsetMatrix m2Map = root.getRight() == null ? null :
-   !root.getRight().isLeaf() ? (BitsetMatrix) 
root.getRight().getSynopsis() :
-   new BitsetMatrix1(root.getRight().getData());
+   BitsetMatrix m1Map = getCachedSynopsis(root.getLeft());
+   BitsetMatrix m2Map = getCachedSynopsis(root.getRight());
+   
BitsetMatrix outMap = estimInternal(m1Map, m2Map, root.getOp());
root.setSynopsis(outMap); // memorize boolean matrix
return root.setMatrixCharacteristics(new MatrixCharacteristics(
@@ -86,6 +83,17 @@ public class EstimatorBitsetMM extends SparsityEstimator
outMap.getNumColumns(), outMap.getNonZeros());
}

+   private BitsetMatrix getCachedSynopsis(MMNode node) {
+   if( node == null )
+   return null;
+   //ensure synopsis is properly cached and reused
+   if( node.isLeaf() && node.getSynopsis() == null )
+   node.setSynopsis(new BitsetMatrix1(node.getData()));
+   else if( !node.isLeaf() )
+   estim(node); //recursively obtain synopsis
+   return (BitsetMatrix) node.getSynopsis();
+   }
+   
private BitsetMatrix estimInternal(BitsetMatrix m1Map, BitsetMatrix 
m2Map, OpCode op) {
switch(op) {
case MM:  return m1Map.matMult(m2Map);
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java
index 260df5d..8a78a9e 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorDensityMap.java
@@ -55,14 +55,8 @@ public class EstimatorDensityMap extends SparsityEstimator

@Override
public MatrixCharacteristics estim(MMNode root) {
-   estimateInputs(root);
-   DensityMap m1Map = !root.getLeft().isLeaf() ?
-   (DensityMap)root.getLeft().getSynopsis() : 
-   new DensityMap(root.getLeft().getData(), _b);
-   DensityMap m2Map = root.getRight()==null ? null:
-   !root.getRight().isLeaf() ? 
-   (DensityMap)root.getRight().getSynopsis() :
-   new DensityMap(root.getRight().getData(), _b);
+   DensityMap m1Map = getCachedSynopsis(root.getLeft());
+   DensityMap m2Map = getCachedSynopsis(root.getRight());

//estimate output density map and sparsity
DensityMap outMap = estimIntern(m1Map, m2Map, root.getOp());
@@ -94,6 +88,17 @@ public class EstimatorDensityMap extends SparsityEstimator
return estim(m, null, op);
}

+   private DensityMap getCachedSynopsis(MMNode node) {
+   if( node == null )
+   return null;
+   //ensure 

systemml git commit: [SYSTEMML-2508] Improved spark cumagg compilation (single row block)

2018-12-16 Thread mboehm7
Repository: systemml
Updated Branches:
  refs/heads/master 341a1dc78 -> 8895ebc45


[SYSTEMML-2508] Improved spark cumagg compilation (single row block)

This patch improves the compilation of spark cumulative aggregates where
the input matrix has a single row block by avoiding the unnecessary
offset computation.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/8895ebc4
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/8895ebc4
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/8895ebc4

Branch: refs/heads/master
Commit: 8895ebc454ce85e823d6332e40d7effd874e59df
Parents: 341a1dc
Author: Matthias Boehm 
Authored: Sun Dec 16 16:04:01 2018 +0100
Committer: Matthias Boehm 
Committed: Sun Dec 16 17:07:01 2018 +0100

--
 .../java/org/apache/sysml/hops/UnaryOp.java | 39 +---
 .../misc/RewriteCumulativeAggregatesTest.java   | 25 +++--
 .../misc/RewriteCumulativeAggregates.R  |  6 ++-
 3 files changed, 52 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/8895ebc4/src/main/java/org/apache/sysml/hops/UnaryOp.java
--
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java 
b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index 2952e85..77655de 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -22,6 +22,7 @@ package org.apache.sysml.hops;
 import java.util.ArrayList;
 
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.Aggregate;
 import org.apache.sysml.lops.Checkpoint;
 import org.apache.sysml.lops.Aggregate.OperationTypes;
@@ -455,8 +456,15 @@ public class UnaryOp extends MultiThreadedHop
long bclen = input.getColsInBlock();
boolean force = !dimsKnown() || _etypeForced == ExecType.SPARK;
OperationTypes aggtype = getCumulativeAggType();
-   
Lop X = input.constructLops();
+   
+   //special case single row block (no offsets needed)
+   if( rlen > 0 && clen > 0 && rlen <= brlen ) {
+   Lop offset = HopRewriteUtils.createDataGenOpByVal(new 
LiteralOp(1),
+   new LiteralOp(clen), 
getCumulativeInitValue()).constructLops();
+   return constructCumOffBinary(X, offset, aggtype, rlen, 
clen, brlen, bclen);
+   }
+   
Lop TEMP = X;
ArrayList DATA = new ArrayList<>();
int level = 0;
@@ -497,22 +505,27 @@ public class UnaryOp extends MultiThreadedHop

//split, group and mr cumsum
while( level-- > 0  ) {
-   //(for spark, the CumulativeOffsetBinary subsumes both 
the split aggregate and 
-   //the subsequent offset binary apply of split 
aggregates against the original data)
-   double initValue = getCumulativeInitValue();
-   boolean broadcast = ALLOW_CUMAGG_BROADCAST
-   && 
OptimizerUtils.checkSparkBroadcastMemoryBudget(OptimizerUtils.estimateSize(
-   TEMP.getOutputParameters().getNumRows(), 
TEMP.getOutputParameters().getNumCols()));
-   
-   CumulativeOffsetBinary binary = new 
CumulativeOffsetBinary(DATA.get(level), TEMP, 
-   DataType.MATRIX, ValueType.DOUBLE, 
initValue, broadcast, aggtype, ExecType.SPARK);
-   binary.getOutputParameters().setDimensions(rlen, clen, 
brlen, bclen, -1);
-   setLineNumbers(binary);
-   TEMP = binary;
+   TEMP = constructCumOffBinary(DATA.get(level),
+   TEMP, aggtype, rlen, clen, brlen, bclen);
}

return TEMP;
}
+   
+   private Lop constructCumOffBinary(Lop data, Lop offset, OperationTypes 
aggtype, long rlen, long clen, long brlen, long bclen) {
+   //(for spark, the CumulativeOffsetBinary subsumes both the 
split aggregate and 
+   //the subsequent offset binary apply of split aggregates 
against the original data)
+   double initValue = getCumulativeInitValue();
+   boolean broadcast = ALLOW_CUMAGG_BROADCAST
+   && 
OptimizerUtils.checkSparkBroadcastMemoryBudget(OptimizerUtils.estimateSize(
+   offset.getOutputParameters().getNumRows(), 
offset.getOutputParameters().getNumCols()));
+   
+ 

systemml git commit: [MINOR] Fine tuning spark checkpoint data size thresholds

2018-12-12 Thread mboehm7
Repository: systemml
Updated Branches:
  refs/heads/master 9a1f64b42 -> 3b87c2ba9


[MINOR] Fine tuning spark checkpoint data size thresholds

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/3b87c2ba
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/3b87c2ba
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/3b87c2ba

Branch: refs/heads/master
Commit: 3b87c2ba9d77ffa3d901eae38de9c1157994d74e
Parents: 9a1f64b
Author: Matthias Boehm 
Authored: Wed Dec 12 13:53:23 2018 +0100
Committer: Matthias Boehm 
Committed: Wed Dec 12 13:53:23 2018 +0100

--
 src/main/java/org/apache/sysml/hops/OptimizerUtils.java | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/3b87c2ba/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
--
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java 
b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index a43abb3..e6a25d2 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -914,7 +914,11 @@ public class OptimizerUtils
 * @return true if the given matrix characteristics exceed threshold
 */
public static boolean exceedsCachingThreshold(long dim2, double outMem) 
{
-   return !(dim2 > 1 && outMem < getLocalMemBudget()
+   //NOTE: We heuristically cache matrices that are close to or 
larger
+   //than the local memory budget. The different relative 
fractions 
+   //according to number of columns is reflecting common operations
+   //(e.g., two inputs/one output for binary vector operations)
+   return !(dim2 > 1 && outMem < getLocalMemBudget()/2
|| dim2 == 1 && outMem < getLocalMemBudget()/3);
}




[2/2] systemml git commit: [SYSTEMML-2507] New rewrites for cumulative aggregate patterns

2018-12-11 Thread mboehm7
[SYSTEMML-2507] New rewrites for cumulative aggregate patterns

This patch adds the following simplification rewrites as well as related
tests:
(a) X * cumsum(diag(matrix(1,nrow(X),1))) -> lower.tri, if X squared
(b) colSums(cumsum(X)) -> cumSums(X*seq(nrow(X),1))
(c) rev(cumsum(rev(X))) -> X + colSums(X) - cumsum(X)


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/9a1f64b4
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/9a1f64b4
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/9a1f64b4

Branch: refs/heads/master
Commit: 9a1f64b42c177a82a98716ad9ef34d4d266178d2
Parents: b96807b
Author: Matthias Boehm 
Authored: Tue Dec 11 20:10:23 2018 +0100
Committer: Matthias Boehm 
Committed: Tue Dec 11 20:10:46 2018 +0100

--
 .../RewriteAlgebraicSimplificationDynamic.java  |  33 -
 .../RewriteAlgebraicSimplificationStatic.java   |  45 +++
 .../hops/rewrite/RewriteGPUSpecificOps.java |  26 ++--
 .../misc/RewriteCumulativeAggregatesTest.java   | 126 +++
 .../misc/RewriteCumulativeAggregates.R  |  43 +++
 .../misc/RewriteCumulativeAggregates.dml|  49 
 6 files changed, 306 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/9a1f64b4/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
--
diff --git 
a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
 
b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
index 36864aa..9556181 100644
--- 
a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
+++ 
b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
@@ -175,6 +175,7 @@ public class RewriteAlgebraicSimplificationDynamic extends 
HopRewriteRule
hi = simplifyMatrixMultDiag(hop, hi, i);  
//e.g., diag(X)%*%Y -> X*Y, if ncol(Y)==1 / -> Y*X if ncol(Y)>1 
hi = simplifyDiagMatrixMult(hop, hi, i);  
//e.g., diag(X%*%Y)->rowSums(X*t(Y)); if col vector
hi = simplifySumDiagToTrace(hi);  
//e.g., sum(diag(X)) -> trace(X); if col vector
+   hi = simplifyLowerTriExtraction(hop, hi, i);  
//e.g., X * cumsum(diag(matrix(1,nrow(X),1))) -> lower.tri
hi = pushdownBinaryOperationOnDiag(hop, hi, i);   
//e.g., diag(X)*7 -> diag(X*7); if col vector
hi = pushdownSumOnAdditiveBinary(hop, hi, i); 
//e.g., sum(A+B) -> sum(A)+sum(B); if dims(A)==dims(B)
if(OptimizerUtils.ALLOW_OPERATOR_FUSION) {
@@ -1046,7 +1047,7 @@ public class RewriteAlgebraicSimplificationDynamic 
extends HopRewriteRule
if( hi instanceof AggUnaryOp ) 
{
AggUnaryOp au = (AggUnaryOp) hi;
-   if( au.getOp()==AggOp.SUM && 
au.getDirection()==Direction.RowCol )  //sum   
+   if( au.getOp()==AggOp.SUM && 
au.getDirection()==Direction.RowCol )  //sum
{
Hop hi2 = au.getInput().get(0);
if( hi2 instanceof ReorgOp && 
((ReorgOp)hi2).getOp()==ReOrgOp.DIAG && hi2.getDim2()==1 ) //diagM2V
@@ -1054,7 +1055,7 @@ public class RewriteAlgebraicSimplificationDynamic 
extends HopRewriteRule
Hop hi3 = hi2.getInput().get(0);

//remove diag operator
-   
HopRewriteUtils.replaceChildReference(au, hi2, hi3, 0); 
+   
HopRewriteUtils.replaceChildReference(au, hi2, hi3, 0);

HopRewriteUtils.cleanupUnreferenced(hi2);

//change sum to trace
@@ -1063,12 +1064,38 @@ public class RewriteAlgebraicSimplificationDynamic 
extends HopRewriteRule
LOG.debug("Applied 
simplifySumDiagToTrace");
}
}
-   
}

return hi;
}

+   private static Hop simplifyLowerTriExtraction(Hop parent, Hop hi, int 
pos) {
+   //pattern: X * cumsum(diag(matrix(1,nrow(X),1))) -> lower.tri 
(only right)
+   if( HopRewriteUtils.isBinary(hi, OpOp2.MULT) 
+   && hi.getDim1() == hi.getDim2() && hi.getDim1() > 1 ) {
+   Hop left = 

[1/2] systemml git commit: [SYSTEMML-2506] Improved cumagg compilation (intermediate memory)

2018-12-11 Thread mboehm7
Repository: systemml
Updated Branches:
  refs/heads/master 7019f3bc8 -> 9a1f64b42


[SYSTEMML-2506] Improved cumagg compilation (intermediate memory)

This patch improves the compilation of cumulative aggregate operations,
to correctly account for potential dense-sparse conversions when
computing memory estimates.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b96807b9
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b96807b9
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b96807b9

Branch: refs/heads/master
Commit: b96807b907203ce8ef1bbd017d06f3c6c9ef8fec
Parents: 7019f3b
Author: Matthias Boehm 
Authored: Tue Dec 11 16:58:27 2018 +0100
Committer: Matthias Boehm 
Committed: Tue Dec 11 16:58:27 2018 +0100

--
 src/main/java/org/apache/sysml/hops/UnaryOp.java | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/b96807b9/src/main/java/org/apache/sysml/hops/UnaryOp.java
--
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java 
b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index 4071d6f..2952e85 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -42,6 +42,8 @@ import org.apache.sysml.lops.UnaryCP;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 
 /* Unary (cell operations): e.g, b_ij = round(a_ij)
@@ -562,15 +564,20 @@ public class UnaryOp extends MultiThreadedHop
}

@Override
-   protected double computeIntermediateMemEstimate( long dim1, long dim2, 
long nnz )
+   protected double computeIntermediateMemEstimate(long dim1, long dim2, 
long nnz)
{
double ret = 0;

-   if ( _op == OpOp1.IQM || _op == OpOp1.MEDIAN) {
+   if( _op == OpOp1.IQM || _op == OpOp1.MEDIAN ) {
// buffer (=2*input_size) and output (=input_size) for 
SORT operation
// getMemEstimate works for both cases of known dims 
and worst-case stats
ret = getInput().get(0).getMemEstimate() * 3; 
}
+   else if( isCumulativeUnaryOperation() ) {
+   //account for potential final dense-sparse 
transformation (worst-case sparse representation)
+   ret += MatrixBlock.estimateSizeSparseInMemory(dim1, 
dim2,
+   MatrixBlock.SPARSITY_TURN_POINT - 
UtilFunctions.DOUBLE_EPS);
+   }
 
if (isGPUEnabled()) {
// Intermediate memory required to convert sparse to 
dense



systemml git commit: [SYSTEMML-2503/04] Fix correctness in-place and broadcast cumagg ops

2018-12-08 Thread mboehm7
Repository: systemml
Updated Branches:
  refs/heads/master bda61b600 -> 1a58946a0


[SYSTEMML-2503/04] Fix correctness in-place and broadcast cumagg ops

This patch fixes correctness issues of in-place cumulative aggregate
operations as well as the handling of lineage tracing on spark cumagg
offset. In addition, the patch also includes a minor performance
improvement that avoids unnecessary copying of offset vectors on cumagg
offset operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/1a58946a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/1a58946a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/1a58946a

Branch: refs/heads/master
Commit: 1a58946a0a335ccae61d0cf3873a937467ae5544
Parents: bda61b6
Author: Matthias Boehm 
Authored: Sat Dec 8 13:40:33 2018 +0100
Committer: Matthias Boehm 
Committed: Sat Dec 8 13:40:33 2018 +0100

--
 .../instructions/spark/CumulativeOffsetSPInstruction.java |  9 ++---
 .../apache/sysml/runtime/matrix/data/LibMatrixAgg.java| 10 ++
 .../org/apache/sysml/runtime/matrix/data/MatrixBlock.java |  4 ++--
 .../java/org/apache/sysml/runtime/util/DataConverter.java |  9 -
 4 files changed, 22 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/1a58946a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
--
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
index 1b26060..3dba53e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
@@ -32,6 +32,7 @@ import scala.Tuple2;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.functionobjects.Builtin;
+import org.apache.sysml.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast;
@@ -94,8 +95,9 @@ public class CumulativeOffsetSPInstruction extends 
BinarySPInstruction {
//get and join inputs
JavaPairRDD inData = 
sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD> 
joined = null;
+   boolean broadcast = _broadcast && 
!SparkUtils.isHashPartitioned(inData);

-   if( _broadcast && !SparkUtils.isHashPartitioned(inData) ) {
+   if( broadcast ) {
//broadcast offsets and broadcast join with data
PartitionedBroadcast inAgg = 
sec.getBroadcastForVariable(input2.getName());
joined = inData.mapToPair(new 
RDDCumSplitLookupFunction(inAgg,_initValue, rlen, brlen));
@@ -119,7 +121,7 @@ public class CumulativeOffsetSPInstruction extends 
BinarySPInstruction {
updateUnaryOutputMatrixCharacteristics(sec);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
-   sec.addLineage(output.getName(), input2.getName(), _broadcast);
+   sec.addLineage(output.getName(), input2.getName(), broadcast);
}
 
private static class RDDCumSplitFunction implements 
PairFlatMapFunction, MatrixIndexes, 
MatrixBlock> 
@@ -229,7 +231,8 @@ public class CumulativeOffsetSPInstruction extends 
BinarySPInstruction {

//blockwise cumagg computation, incl offset aggregation
return LibMatrixAgg.cumaggregateUnaryMatrix(dblkIn, 
blkOut, _uop,
-   DataConverter.convertToDoubleVector(oblkIn));
+   DataConverter.convertToDoubleVector(oblkIn, 
false,
+   ((Builtin)_uop.fn).bFunc == 
BuiltinCode.CUMSUM));
}
}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/1a58946a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixAgg.java
--
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixAgg.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixAgg.java
index 5e785d9..ed7d8f1 100644
--- 

[2/2] systemml git commit: [SYSTEMML-2504] In-place CP cumulative aggregates, incl compiler

2018-12-05 Thread mboehm7
[SYSTEMML-2504] In-place CP cumulative aggregates, incl compiler

This patch adds an option for in-place CP cumulative aggregates because
result allocation is the major bottleneck. As an initial compiler
integration, we now compile in-place CP operations for the aggregation
of partial aggregates in Spark cumsum because it guarantees validity.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/25a10f41
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/25a10f41
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/25a10f41

Branch: refs/heads/master
Commit: 25a10f412614235d8974f371a2bb07bc08c88cee
Parents: 21b1a53
Author: Matthias Boehm 
Authored: Wed Dec 5 20:38:37 2018 +0100
Committer: Matthias Boehm 
Committed: Wed Dec 5 20:38:37 2018 +0100

--
 .../java/org/apache/sysml/hops/UnaryOp.java | 10 +-
 src/main/java/org/apache/sysml/lops/Unary.java  |  7 +--
 .../instructions/cp/UnaryCPInstruction.java |  5 +++--
 .../sysml/runtime/matrix/data/LibMatrixAgg.java | 20 +++-
 .../runtime/matrix/operators/UnaryOperator.java | 10 --
 5 files changed, 36 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/25a10f41/src/main/java/org/apache/sysml/hops/UnaryOp.java
--
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java 
b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index d1110c3..4071d6f 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -170,7 +170,7 @@ public class UnaryOp extends MultiThreadedHop
int k = isCumulativeUnaryOperation() || 
isExpensiveUnaryOperation() ?

OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
Unary unary1 = new 
Unary(input.constructLops(),
-   HopsOpOp1LopsU.get(_op), 
getDataType(), getValueType(), et, k);
+   HopsOpOp1LopsU.get(_op), 
getDataType(), getValueType(), et, k, false);
setOutputDimensions(unary1);
setLineNumbers(unary1);
setLops(unary1);
@@ -404,15 +404,15 @@ public class UnaryOp extends MultiThreadedHop
agg.getOutputParameters().setDimensions(rlenAgg, clen, 
brlen, bclen, -1);

agg.setupCorrectionLocation(CorrectionLocationType.NONE); // aggregation uses 
kahanSum but the inputs do not have correction values
setLineNumbers(agg);
-   TEMP = agg; 
+   TEMP = agg;
level++;
force = false; //in case of unknowns, generate one level
}

//in-memory cum sum (of partial aggregates)
if( TEMP.getOutputParameters().getNumRows()!=1 ) {
-   int k = OptimizerUtils.getConstrainedNumThreads( 
_maxNumThreads );  
-   Unary unary1 = new Unary( TEMP, 
HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k);
+   int k = OptimizerUtils.getConstrainedNumThreads( 
_maxNumThreads );
+   Unary unary1 = new Unary( TEMP, 
HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k, 
true);

unary1.getOutputParameters().setDimensions(TEMP.getOutputParameters().getNumRows(),
 clen, brlen, bclen, -1);
setLineNumbers(unary1);
TEMP = unary1;
@@ -487,7 +487,7 @@ public class UnaryOp extends MultiThreadedHop
//in-memory cum sum (of partial aggregates)
if( TEMP.getOutputParameters().getNumRows()!=1 ){
int k = OptimizerUtils.getConstrainedNumThreads( 
_maxNumThreads );
-   Unary unary1 = new Unary( TEMP, 
HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k);
+   Unary unary1 = new Unary( TEMP, 
HopsOpOp1LopsU.get(_op), DataType.MATRIX, ValueType.DOUBLE, ExecType.CP, k, 
true);

unary1.getOutputParameters().setDimensions(TEMP.getOutputParameters().getNumRows(),
 clen, brlen, bclen, -1);
setLineNumbers(unary1);
TEMP = unary1;

http://git-wip-us.apache.org/repos/asf/systemml/blob/25a10f41/src/main/java/org/apache/sysml/lops/Unary.java

[1/2] systemml git commit: [SYSTEMML-2503] Exploit existing hash partitioning in spark cumoff ops

2018-12-05 Thread mboehm7
Repository: systemml
Updated Branches:
  refs/heads/master 7a3447a50 -> 25a10f412


[SYSTEMML-2503] Exploit existing hash partitioning in spark cumoff ops

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/21b1a531
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/21b1a531
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/21b1a531

Branch: refs/heads/master
Commit: 21b1a53141c74b4aa3af6e0263af3f6b0d7c1336
Parents: 7a3447a
Author: Matthias Boehm 
Authored: Wed Dec 5 19:39:53 2018 +0100
Committer: Matthias Boehm 
Committed: Wed Dec 5 19:39:53 2018 +0100

--
 .../runtime/instructions/spark/CumulativeOffsetSPInstruction.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/21b1a531/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
--
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
index 53e6e91..8befc5a 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
@@ -35,6 +35,7 @@ import org.apache.sysml.runtime.functionobjects.Builtin;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast;
+import org.apache.sysml.runtime.instructions.spark.utils.SparkUtils;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.LibMatrixAgg;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -95,7 +96,7 @@ public class CumulativeOffsetSPInstruction extends 
BinarySPInstruction {
JavaPairRDD inData = 
sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD> 
joined = null;

-   if( _broadcast ) {
+   if( _broadcast && !SparkUtils.isHashPartitioned(inData) ) {
//broadcast offsets and broadcast join with data
PartitionedBroadcast inAgg = 
sec.getBroadcastForVariable(input2.getName());
joined = inData.mapToPair(new 
RDDCumSplitLookupFunction(inAgg,_initValue, rlen, brlen));



[1/3] systemml git commit: [SYSTEMML-2500] Async matrix allocation on Spark RDD collect

2018-12-01 Thread mboehm7
Repository: systemml
Updated Branches:
  refs/heads/master 95cbbd656 -> 7a3447a50


[SYSTEMML-2500] Async matrix allocation on Spark RDD collect

This patch introduces a general performance improvement of RDD collect
operations into the driver memory, by interleaving the matrix allocation
with the collect (and pending RDD evaluation). This is generally useful
because it reduces the serial fraction of parallel programs.

For example, for 100 distributed sum(cumsum(X)) operations, it reduced
the total runtime from 1,102s to 1,006s.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/77a7ef15
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/77a7ef15
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/77a7ef15

Branch: refs/heads/master
Commit: 77a7ef155d5f3546d053c7f3d11b1ff3b8021834
Parents: 95cbbd6
Author: Matthias Boehm 
Authored: Sat Dec 1 17:08:45 2018 +0100
Committer: Matthias Boehm 
Committed: Sat Dec 1 17:08:45 2018 +0100

--
 .../controlprogram/caching/LazyWriteBuffer.java   |  4 
 .../controlprogram/context/SparkExecutionContext.java | 14 ++
 .../org/apache/sysml/runtime/io/IOUtilFunctions.java  | 10 ++
 .../apache/sysml/runtime/matrix/data/MatrixBlock.java | 10 ++
 4 files changed, 34 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/77a7ef15/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java
--
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java
index 391f21a..d1dc801 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/LazyWriteBuffer.java
@@ -272,6 +272,10 @@ public class LazyWriteBuffer
}
}

+   public static ExecutorService getUtilThreadPool() {
+   return _fClean != null ? _fClean._pool : null;
+   }
+   
/**
 * Extended LinkedHashMap with convenience methods for adding and 
removing
 * last/first entries.

http://git-wip-us.apache.org/repos/asf/systemml/blob/77a7ef15/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java
--
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java
index 8981c87..b04aad0 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/SparkExecutionContext.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.concurrent.Future;
 import java.util.stream.Collectors;
 import java.util.stream.LongStream;
 
@@ -46,7 +47,6 @@ import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
 import org.apache.sysml.api.mlcontext.MLContext;
 import org.apache.sysml.api.mlcontext.MLContextUtil;
-import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.lops.Checkpoint;
 import org.apache.sysml.parser.Expression.ValueType;
@@ -72,6 +72,7 @@ import 
org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFu
 import 
org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction;
 import org.apache.sysml.runtime.instructions.spark.utils.RDDAggregateUtils;
 import org.apache.sysml.runtime.instructions.spark.utils.SparkUtils;
+import org.apache.sysml.runtime.io.IOUtilFunctions;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.FrameBlock;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
@@ -824,7 +825,7 @@ public class SparkExecutionContext extends ExecutionContext
long t0 = ConfigurationManager.isStatistics() ? 
System.nanoTime() : 0;
 
MatrixBlock out = null;
-
+   
if( rlen <= brlen && clen <= bclen ) //SINGLE BLOCK
{
//special case without copy and nnz maintenance
@@ -846,9 +847,14 @@ public class SparkExecutionContext extends ExecutionContext
 
//create output matrix block (w/ lazy allocation)
out = new MatrixBlock(rlen, clen, sparse, lnnz);
-
+

[3/3] systemml git commit: [SYSTEMML-2502] Performance spark cumagg offset aggregation (zero-copy)

2018-12-01 Thread mboehm7
[SYSTEMML-2502] Performance spark cumagg offset aggregation (zero-copy)

This patch avoids unnecessary copy operations of input data blocks, which
were used to avoid data corruption on offset aggregation into the first
row. Instead we now directly pass the offset vector into the dedicated
cumulative aggregate operations. On our running example of 100
distributed sum(cumsum(X)) operations, this patch reduced the total
runtime from 887s to 732s.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/7a3447a5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/7a3447a5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/7a3447a5

Branch: refs/heads/master
Commit: 7a3447a50b6d2abdbaf6dce9d021a3ce7c2717d7
Parents: fee20fb
Author: Matthias Boehm 
Authored: Sat Dec 1 21:06:04 2018 +0100
Committer: Matthias Boehm 
Committed: Sat Dec 1 21:06:04 2018 +0100

--
 .../spark/CumulativeOffsetSPInstruction.java| 62 +++-
 .../sysml/runtime/matrix/data/LibMatrixAgg.java | 10 +++-
 2 files changed, 27 insertions(+), 45 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/systemml/blob/7a3447a5/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
--
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
index 952a6d0..53e6e91 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/CumulativeOffsetSPInstruction.java
@@ -32,50 +32,40 @@ import scala.Tuple2;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.functionobjects.Builtin;
-import org.apache.sysml.runtime.functionobjects.Multiply;
-import org.apache.sysml.runtime.functionobjects.Plus;
-import org.apache.sysml.runtime.functionobjects.PlusMultiply;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.LibMatrixAgg;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
-import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.Operator;
 import org.apache.sysml.runtime.matrix.operators.UnaryOperator;
+import org.apache.sysml.runtime.util.DataConverter;
 import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.sysml.utils.IntUtils;
 
 public class CumulativeOffsetSPInstruction extends BinarySPInstruction {
-   private BinaryOperator _bop = null;
private UnaryOperator _uop = null;
+   private boolean _cumsumprod = false;
private final double _initValue ;
private final boolean _broadcast;
 
private CumulativeOffsetSPInstruction(Operator op, CPOperand in1, 
CPOperand in2, CPOperand out, double init, boolean broadcast, String opcode, 
String istr) {
super(SPType.CumsumOffset, op, in1, in2, out, opcode, istr);
 
-   if ("bcumoffk+".equals(opcode)) {
-   _bop = new BinaryOperator(Plus.getPlusFnObject());
+   if ("bcumoffk+".equals(opcode))
_uop = new 
UnaryOperator(Builtin.getBuiltinFnObject("ucumk+"));
-   }
-   else if ("bcumoff*".equals(opcode)) {
-   _bop = new 
BinaryOperator(Multiply.getMultiplyFnObject());
+   else if ("bcumoff*".equals(opcode))
_uop = new 
UnaryOperator(Builtin.getBuiltinFnObject("ucum*"));
-   }
else if ("bcumoff+*".equals(opcode)) {
-   _bop = new BinaryOperator(PlusMultiply.getFnObject());
_uop = new 
UnaryOperator(Builtin.getBuiltinFnObject("ucumk+*"));
+   _cumsumprod = true;
}
-   else if ("bcumoffmin".equals(opcode)) {
-   _bop = new 
BinaryOperator(Builtin.getBuiltinFnObject("min"));
+   else if ("bcumoffmin".equals(opcode))
_uop = new 
UnaryOperator(Builtin.getBuiltinFnObject("ucummin"));
-   }
-   else if ("bcumoffmax".equals(opcode)) {
-   _bop = new 
BinaryOperator(Builtin.getBuiltinFnObject("max"));
+  

  1   2   3   4   5   6   7   8   9   10   >