This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 5ec8d0c06a [SYSTEMDS-3696] Basic incremental slice-line builtin, and 
tests
5ec8d0c06a is described below

commit 5ec8d0c06a99cdf1250d2d85c6dbc8e43e84ea19
Author: Frederic Zoepffel <[email protected]>
AuthorDate: Mon May 20 16:36:09 2024 +0200

    [SYSTEMDS-3696] Basic incremental slice-line builtin, and tests
    
    Closes #2024.
---
 scripts/builtin/incSliceLine.dml                   | 383 ++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../builtin/part2/BuiltinIncSliceLineTest.java     | 403 +++++++++++++++++++++
 .../scripts/functions/builtin/incSliceLine.dml     |  29 ++
 4 files changed, 816 insertions(+)

diff --git a/scripts/builtin/incSliceLine.dml b/scripts/builtin/incSliceLine.dml
new file mode 100644
index 0000000000..f6c02fac9b
--- /dev/null
+++ b/scripts/builtin/incSliceLine.dml
@@ -0,0 +1,383 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This builtin function implements SliceLine, a linear-algebra-based
+# ML model debugging technique for finding the top-k data slices where
+# a trained models performs significantly worse than on the overall 
+# dataset. For a detailed description and experimental results, see:
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based 
Slice Finding for ML Model Debugging.(SIGMOD 2021)
+#
+# INPUT:
+# 
---------------------------------------------------------------------------------------
+# newX     Feature matrix in recoded/binned representation
+# oldX     All-comprising feature matrix of previous runs in recoded/binned 
representation
+# e        Error vector of trained model
+# k        Number of subsets required
+# maxL     maximum level L (conjunctions of L predicates), 0 unlimited
+# minSup   minimum support (min number of rows per slice)
+# alpha    weight [0,1]: 0 only size, 1 only error
+# tpEval   flag for task-parallel slice evaluation,
+#          otherwise data-parallel
+# tpBlksz  block size for task-parallel execution (num slices)
+# selFeat  flag for removing one-hot-encoded features that don`t satisfy
+#          the initial minimum-support constraint and/or have zero error
+# verbose  flag for verbose debug output
+# prevL    previous lattice (for incremental updates)
+# prevRL   previous statistics whole lattice (for incremental updates)
+# 
---------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# 
-----------------------------------------------------------------------------------------
+# TK    top-k slices (k x ncol(newX) if successful)
+# TKC   score, size, error of slices (k x 3)
+# D     debug matrix, populated with enumeration stats if verbose
+# L     lattice matrix
+# RL    statistics matrix for all slices in L
+# Xout  feature matrix consisting of oldX and newX for next run
+# 
-----------------------------------------------------------------------------------------
+
+m_incSliceLine = function(Matrix[Double] newX, Matrix[Double] oldX = matrix(0, 
0, 0), Matrix[Double] e, Int k = 4,
+    Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
+    Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE,
+    Matrix[Double] prevLattice = matrix(0, 0, 0) , Matrix[Double] prevRL = 
matrix(0, 0, 0))
+  return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D, 
Matrix[Double] L, Matrix[Double] RL, Matrix[Double] Xout)
+{
+  # TODO convert input/output of previous enumerated slices to lists
+  # for simple collection and processing
+  t1 = time();
+
+  # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
+  D = matrix(0, 0, 5);
+
+  m = nrow(newX);
+  n = ncol(newX);
+
+  # prepare offset vectors and one-hot encoded newX
+  fdom = colMaxs(newX);
+  foffb = t(cumsum(t(fdom))) - fdom;
+  foffe = t(cumsum(t(fdom)))
+  rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
+  cix = matrix(newX + foffb, m*n, 1);
+  X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
+
+  # initialize statistics and basic slices
+  n2 = ncol(X2);     # one-hot encoded features
+  eAvg = sum(e) / m; # average error
+  [S, R, selCols] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, 
verbose); 
+
+  # initialize Lattice and Statistics
+  L = S 
+  RL = R
+
+  # initialize top-k
+  [TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
+
+  if( verbose ) {
+    [maxsc, minsc] = analyzeTopK(TKC);
+    print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", 
min="+minsc+" (time="+(time()-t1)+")")
+    D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc))));
+  }
+
+  # reduced dataset to relevant attributes (minSup, err>0), S reduced 
on-the-fly
+  if( selFeat )
+    X2 = removeEmpty(target=X2, margin="cols", select=t(selCols));
+
+  # lattice enumeration w/ size/error pruning, one iteration per level
+  # termination condition (max #feature levels)
+  maxL = ifelse(maxL<=0, n, maxL)
+  level = 1;
+  while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
+    level = level + 1;
+
+    # enumerate candidate join pairs, incl size/error pruning 
+    nrS = nrow(S);
+    S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, 
foffb, foffe); 
+    S2 = S;
+
+    # update lattice and statistics
+    L = rbind(L, S);
+
+    if(selFeat)
+      S2 = removeEmpty(target=S, margin="cols", select=t(selCols));
+
+    if(verbose) {
+      print("\nSliceFinder: level "+level+":")
+      print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
+    }
+
+    if( nrow(S) > 0 ) {
+      # extract and evaluate candidate slices
+      if( tpEval ) { # task-parallel
+        # hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 
matrix-vector 
+        R = matrix(0, nrow(S), 4)
+        parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
+          beg = (i-1)*tpBlksz + 1; 
+          end = min(i*tpBlksz, nrow(R));
+          R[beg:end,] = evalSlice(X2, e, eAvg, t(S2[beg:end,]), level, alpha);
+        }
+        RL = rbind(RL, R);
+      }
+      else { # data-parallel
+        R = evalSlice(X2, e, eAvg, t(S2), level, alpha);
+        RL = rbind(RL, R);
+      }
+
+      # maintain top-k after evaluation
+      [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
+
+      if(verbose) {
+        [maxsc, minsc] = analyzeTopK(TKC);
+        valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
+        print(" -- valid slices after eval: "+valid+"/"+nrow(S));
+        print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
+        print(" -- (time="+(time()-t1)+")")
+        D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc))));
+      }
+    }
+  }
+
+  TK = decodeOneHot(TK, foffb, foffe);
+
+  # prepare output feature matrix for next run
+  if (nrow(oldX) > 0){
+    Xout = rbind(oldX, newX);
+  } else {
+    Xout = newX;
+  }
+
+  L = decodeOneHot(L, foffb, foffe)
+  
+
+  if( verbose ) {
+    print("SliceFinder: terminated at level "+level+":\n"
+      + toString(TK) + "\n" + toString(TKC));
+  }
+
+  print("Lattice: \n "+ toString(L) +":\n"
+    + "Statistics: \n "+ toString(RL));
+}
+
+createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e, 
+    Double eAvg, Double minSup, Double alpha, Boolean verbose)
+  return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols)
+{
+  n2 = ncol(X2);
+  cCnts = t(colSums(X2));    # column counts
+  err = t(t(e) %*% X2);      # total error vector
+  merr = t(colMaxs(X2 * e)); # maximum error vector
+
+  if( verbose ) {
+    drop = as.integer(sum(cCnts < minSup | err == 0));
+    print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = 
"+minSup+".");
+  }
+
+  # working set of active slices (#attr x #slices) and top k
+  selCols = (cCnts >= minSup & err > 0);
+  attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols);
+  ss = removeEmpty(target=cCnts, margin="rows", select=selCols);
+  se = removeEmpty(target=err, margin="rows", select=selCols);
+  sm = removeEmpty(target=merr, margin="rows", select=selCols);
+  S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
+
+  # score 1-slices and create initial top-k 
+  sc = score(ss, se, eAvg, alpha, nrow(X2));
+  R = cbind(sc, se, sm, ss);
+}
+
+score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double 
alpha, Integer n)
+  return(Matrix[Double] sc)
+{
+  sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
+  sc = replace(target=sc, pattern=NaN, replacement=-Inf);
+}
+
+scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm, 
+    Double eAvg, Integer minSup, Double alpha, Integer n)
+  return(Matrix[Double] sc)
+{
+  # Initial upper bound equation (with minSup and ss in pos/neg terms)
+  # sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
+
+  # Since sc is either monotonically increasing or decreasing, we
+  # probe interesting points of sc in the interval [minSup, ss],
+  # and compute the maximum to serve as the upper bound 
+  s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss) 
+  sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 
1));
+  sc = replace(target=sc, pattern=NaN, replacement=-Inf);
+}
+
+
+maintainTopK = function(Matrix[Double] S, Matrix[Double] R, 
+    Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) 
+  return(Matrix[Double] TK, Matrix[Double] TKC)
+{
+  # prune invalid minSup and scores
+  I = (R[,1] > 0) & (R[,4] >= minSup);
+
+  if( sum(I)!=0 ) {
+    S = removeEmpty(target=S, margin="rows", select=I);
+    R = removeEmpty(target=R, margin="rows", select=I);
+
+    # evaluated candidated and previous top-k
+    slices = rbind(TK, S);
+    scores = rbind(TKC, R);
+
+    # extract top-k
+    IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
+    IX = IX[1:min(k,nrow(IX)),];
+    P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
+    TK = P %*% slices;
+    TKC = P %*% scores;
+  }
+}
+
+analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
+  maxsc = -Inf;
+  minsc = -Inf;
+  if( nrow(TKC)>0 ) {
+    maxsc = as.scalar(TKC[1,1]);
+    minsc = as.scalar(TKC[nrow(TKC),1]);
+  }
+}
+
+getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, 
+    Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level, 
+    Double eAvg, Integer minSup, Double alpha, Integer n2, 
+    Matrix[Double] foffb, Matrix[Double] foffe)
+  return(Matrix[Double] P)
+{
+  # prune invalid slices (possible without affecting overall
+  # pruning effectiveness due to handling of missing parents)
+  pI = (R[,4] >= minSup & R[,2] > 0);
+  S = removeEmpty(target=S, margin="rows", select=pI)
+  R = removeEmpty(target=R, margin="rows", select=pI)
+
+  # join compatible slices (without self)
+  join = S %*% t(S) == (level-2)
+  I = upper.tri(target=join, diag=FALSE, values=TRUE);
+
+  # pair construction
+  nr = nrow(I); nc = ncol(I);
+  rix = matrix(I * seq(1,nr), nr*nc, 1);
+  cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
+  rix = removeEmpty(target=rix, margin="rows");
+  cix = removeEmpty(target=cix, margin="rows");
+
+  P = matrix(0,0,ncol(S))
+  if( sum(rix)!=0 ) {
+    P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
+    P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
+    P12 = P1 + P2; # combined slice
+    P = (P1 %*% S + P2 %*% S) != 0;
+
+    se = min(P1 %*% R[,2], P2 %*% R[,2])
+    sm = min(P1 %*% R[,3], P2 %*% R[,3])
+    ss = min(P1 %*% R[,4], P2 %*% R[,4])
+
+    # prune invalid self joins (>1 bit per feature)
+    I = matrix(1, nrow(P), 1);
+    for( j in 1:ncol(foffb) ) {
+      beg = as.scalar(foffb[1,j])+1;
+      end = as.scalar(foffe[1,j]);
+      I = I & (rowSums(P[,beg:end]) <= 1);
+    }
+    P12 = removeEmpty(target=P12, margin="rows", select=I)
+    P = removeEmpty(target=P, margin="rows", select=I);
+    ss = removeEmpty(target=ss, margin="rows", select=I);
+    se = removeEmpty(target=se, margin="rows", select=I);
+    sm = removeEmpty(target=sm, margin="rows", select=I);
+
+    # prepare IDs for deduplication and pruning
+    ID = matrix(0, nrow(P), 1);
+    dom = foffe-foffb+1;
+    for( j in 1:ncol(dom) ) {
+      beg = as.scalar(foffb[1,j])+1;
+      end = as.scalar(foffe[1,j]);
+      I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]);
+      prod = 1;
+      if(j<ncol(dom))
+        prod = prod(dom[1,(j+1):ncol(dom)])
+      ID = ID + I * prod;
+    }
+
+    # ID transformation to avoid exceeding INT_MAX and
+    # and to void creating huge sparse intermediates
+    [ID, M] = transformencode(target=as.frame(ID), 
spec="{ids:true,recode:[1]}")
+
+    # size pruning, with rowMin-rowMax transform 
+    # to avoid densification (ignored zeros)
+    map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
+    ubSizes = 1/rowMaxs(map * (1/t(ss)));
+    ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
+    fSizes = (ubSizes >= minSup)
+
+    # error pruning
+    ubError = 1/rowMaxs(map * (1/t(se)));
+    ubError = replace(target=ubError, pattern=Inf, replacement=0);
+    ubMError = 1/rowMaxs(map * (1/t(sm)));
+    ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
+    ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
+    [maxsc, minsc] = analyzeTopK(TKC);
+    fScores = (ubScores > minsc & ubScores > 0) 
+
+    # missing parents pruning
+    numParents = rowSums((map %*% P12) != 0) 
+    fParents = (numParents == level);
+
+    # apply all pruning 
+    fall = (fSizes & fScores & fParents);
+
+    # deduplication of join outputs
+    Dedup = removeEmpty(target=map, margin="rows", select=fall) != 0
+    #P = (Dedup %*% P) != 0, replaced by below (easier sparsity propagation)
+    DeI = table(rowIndexMax(Dedup), 1, nrow(P), 1);
+    P = removeEmpty(target=P, margin="rows", select=DeI);
+  }
+}
+
+evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, 
+    Matrix[Double] tS, Integer l, Double alpha) 
+  return(Matrix[Double] R)
+{
+  I = (X %*% tS) == l;    # slice indicator
+  ss = t(colSums(I));     # absolute slice size (nnz)
+  se = t(t(e) %*% I);     # absolute slice error
+  sm = t(colMaxs(I * e)); # maximum tuple error in slice
+
+  # score of relative error and relative size
+  sc = score(ss, se, eAvg, alpha, nrow(X));
+  R = cbind(sc, se, sm, ss);
+}
+
+decodeOneHot = function(Matrix[Double] M, Matrix[Double] foffb, Matrix[Double] 
foffe)
+  return(Matrix[Double] M) 
+{
+  R = matrix(1, nrow(M), ncol(foffb));
+  if( nrow(M) > 0 ) {
+    parfor( j in 1:ncol(foffb) ) {
+      beg = as.scalar(foffb[1,j])+1;
+      end = as.scalar(foffe[1,j]);
+      I = rowSums(M[,beg:end]) * rowIndexMax(M[,beg:end]);
+      R[, j] = I;
+    }
+  }
+  M = R;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 5c334f72ae..de3f2b5c5a 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -192,6 +192,7 @@ public enum Builtins {
        IMPUTE_BY_MODE_APPLY("imputeByModeApply", true),
        IMPUTE_FD("imputeByFD", true),
        IMPUTE_FD_APPLY("imputeByFDApply", true),
+       INCSLICELINE("incSliceLine", true),
        INTERQUANTILE("interQuantile", false),
        INTERSECT("intersect", true),
        INVERSE("inv", "inverse", false),
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIncSliceLineTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIncSliceLineTest.java
new file mode 100644
index 0000000000..82b1a65265
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIncSliceLineTest.java
@@ -0,0 +1,403 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+
+public class BuiltinIncSliceLineTest extends AutomatedTestBase {
+    private static final String PREP_NAME = "slicefinderPrep";
+    private static final String TEST_NAME = "incSliceLine";
+    private static final String TEST_DIR = "functions/builtin/";
+    private static final String TEST_CLASS_DIR = TEST_DIR + 
BuiltinIncSliceLineTest.class.getSimpleName() + "/";
+    private static final boolean VERBOSE = true;
+
+    private static final double[][] EXPECTED_TOPK = new double[][] {
+            { 1.042, 69210699988.477, 11078019685.642, 18.000 },
+            { 0.478, 92957580467.849, 11078019685.642, 39.000 },
+            { 0.316, 40425449547.480, 11078019685.642, 10.000 },
+            { 0.262, 67630559163.266, 7261504482.540, 29.000 },
+            { 0.224, 202448990843.317, 11119010986.000, 125.000 },
+            { 0.218, 68860581248.568, 7261504482.540, 31.000 },
+            { 0.164, 206527445340.279, 11119010986.000, 135.000 },
+            { 0.122, 68961886413.866, 7261504482.540, 34.000 },
+            { 0.098, 360278523220.479, 11119010986.000, 266.000 },
+            { 0.092, 73954209826.485, 11078019685.642, 39.000 }
+    };
+
+    @Override
+    public void setUp() {
+        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, 
TEST_NAME, new String[] { "R" }));
+    }
+
+    @Test
+    public void testTop4HybridDP() {
+        runIncSliceLineTest(4, "e", true, false, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop4SinglenodeDP() {
+        runIncSliceLineTest(4, "e", true, false, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop4HybridTP() {
+        runIncSliceLineTest(4, "e", false, false, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop4SinglenodeTP() {
+        runIncSliceLineTest(4, "e", false, false, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop10HybridDP() {
+        runIncSliceLineTest(10, "e", true, false, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop10SinglenodeDP() {
+        runIncSliceLineTest(10, "e", true, false, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop10HybridTP() {
+        runIncSliceLineTest(10, "e", false, false, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop10SinglenodeTP() {
+        runIncSliceLineTest(10, "e", false, false, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop4HybridDPSel() {
+        runIncSliceLineTest(4, "e", true, true, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop4SinglenodeDPSel() {
+        runIncSliceLineTest(4, "e", true, true, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop4HybridTPSel() {
+        runIncSliceLineTest(4, "e", false, true, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop4SinglenodeTPSel() {
+        runIncSliceLineTest(4, "e", false, true, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop10HybridDPSel() {
+        runIncSliceLineTest(10, "e", true, true, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop10SinglenodeDPSel() {
+        runIncSliceLineTest(10, "e", true, true, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop10HybridTPSel() {
+        runIncSliceLineTest(10, "e", false, true, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop10SinglenodeTPSel() {
+        runIncSliceLineTest(10, "e", false, true, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testTop10HybridTPSelE2() {
+        runIncSliceLineTest(10, "oe", false, true, ExecMode.HYBRID);
+    }
+
+    @Test
+    public void testTop10SinglenodeTPSelE2() {
+        runIncSliceLineTest(10, "oe", false, true, ExecMode.SINGLE_NODE);
+    }
+
+    @Test
+    public void testIncSliceLineCustomInputs1() {
+        double[][] newX = {
+                { 2, 1, 1, 2, 3, 2, 3, 3, 1, 2 },
+                { 2, 2, 2, 3, 4, 1, 2, 1, 3, 2 },
+                { 2, 1, 3, 3, 2, 2, 3, 1, 1, 4 },
+                { 1, 2, 2, 1, 3, 2, 3, 2, 2, 3 },
+                { 3, 2, 3, 4, 3, 3, 4, 1, 1, 3 },
+                { 4, 3, 2, 3, 4, 4, 3, 4, 1, 1 },
+                { 2, 2, 2, 4, 3, 3, 2, 2, 1, 2 },
+                { 1, 1, 2, 2, 3, 3, 2, 1, 1, 2 },
+                { 4, 3, 2, 1, 3, 2, 4, 2, 4, 3 },
+                { 1, 3, 1, 4, 1, 3, 3, 2, 3, 2 },
+                { 2, 4, 3, 1, 2, 4, 1, 3, 2, 4 },
+                { 3, 2, 4, 3, 1, 4, 2, 3, 4, 1 },
+                { 4, 1, 2, 4, 3, 1, 4, 2, 1, 3 },
+                { 1, 3, 4, 2, 4, 3, 1, 4, 2, 3 },
+                { 2, 4, 1, 3, 2, 4, 3, 1, 4, 2 },
+                { 3, 2, 4, 1, 3, 4, 2, 3, 1, 4 },
+                { 4, 1, 3, 2, 4, 1, 4, 2, 3, 1 },
+                { 1, 3, 2, 4, 1, 3, 4, 2, 4, 3 },
+                { 2, 4, 1, 3, 2, 4, 3, 1, 2, 4 },
+                { 2, 3, 3, 2, 1, 4, 2, 3, 2, 3 }
+        };
+        double[][] e = {
+                { 0.159 }, { 0.588 }, { 0.414 }, { 0.305 }, { 0.193 }, { 0.195 
}, { 0.878 }, { 0.149 }, { 0.835 },
+                { 0.344 },
+                { 0.123 }, { 0.456 }, { 0.789 }, { 0.987 }, { 0.654 }, { 0.321 
}, { 0.246 }, { 0.135 }, { 0.579 },
+                { 0.802 }
+        };
+        int K = 10;
+        double[][] correctRes = {
+                { 0.307, 2.807, 0.878, 4.000 },
+                { 0.307, 2.807, 0.878, 4.000 },
+                { 0.282, 2.759, 0.987, 4.000 },
+                { 0.157, 4.046, 0.987, 7.000 },
+                { 0.127, 2.956, 0.878, 5.000 },
+                { 0.122, 2.942, 0.878, 5.000 },
+                { 0.074, 3.298, 0.987, 6.000 },
+                { 0.064, 4.197, 0.878, 8.000 },
+                { 0.061, 2.796, 0.987, 5.000 },
+                { 0.038, 3.194, 0.878, 6.000 }
+        };
+        testIncSliceLineCustomInputs(newX, e, K, correctRes);
+    }
+
+    @Test
+    public void testIncSliceLineCustomInputs2() {
+        double[][] newX = {
+                { 2, 1, 1, 1, 3, 4, 2, 2, 1, 2 },
+                { 3, 3, 3, 2, 1, 2, 3, 1, 4, 2 },
+                { 3, 2, 3, 1, 1, 1, 4, 3, 4, 2 },
+                { 1, 3, 2, 3, 2, 3, 2, 1, 2, 1 },
+                { 4, 3, 1, 1, 1, 1, 1, 1, 3, 2 },
+                { 2, 2, 3, 3, 2, 2, 2, 3, 4, 1 },
+                { 3, 2, 2, 2, 4, 4, 2, 4, 1, 1 },
+                { 1, 3, 3, 2, 1, 3, 1, 2, 4, 4 },
+                { 2, 1, 2, 2, 3, 1, 2, 3, 2, 1 },
+                { 4, 1, 3, 4, 1, 4, 2, 3, 4, 4 },
+                { 4, 2, 4, 4, 2, 1, 2, 1, 1, 4 },
+                { 4, 1, 1, 4, 1, 4, 3, 2, 4, 2 },
+                { 2, 1, 2, 2, 3, 1, 4, 3, 3, 4 },
+                { 4, 1, 3, 1, 3, 1, 2, 1, 3, 3 },
+                { 2, 1, 3, 1, 1, 3, 1, 2, 1, 2 },
+                { 1, 3, 4, 3, 1, 2, 2, 2, 1, 1 },
+                { 2, 4, 4, 3, 4, 1, 2, 1, 2, 4 },
+                { 3, 3, 3, 3, 3, 1, 2, 3, 4, 4 },
+                { 3, 2, 2, 2, 4, 1, 4, 2, 3, 1 },
+                { 1, 2, 3, 2, 4, 3, 2, 3, 2, 3 }
+        };
+
+        double[][] e = {
+                { 0.591 }, { 0.858 }, { 0.144 }, { 0.350 }, { 0.931 }, { 0.951 
}, { 0.788 }, { 0.491 }, { 0.358 },
+                { 0.443 },
+                { 0.231 }, { 0.564 }, { 0.897 }, { 0.879 }, { 0.546 }, { 0.132 
}, { 0.462 }, { 0.153 }, { 0.759 },
+                { 0.028 }
+        };
+        int K = 10;
+        double[][] correctRes = {
+                { 0.410, 3.466, 0.931, 4.000 },
+                { 0.410, 3.466, 0.931, 4.000 },
+                { 0.111, 2.802, 0.897, 4.000 },
+                { 0.075, 3.805, 0.951, 6.000 },
+                { 0.057, 4.278, 0.897, 7.000 },
+                { 0.047, 3.711, 0.931, 6.000 },
+                { 0.035, 3.152, 0.897, 5.000 },
+                { 0.032, 4.179, 0.897, 7.000 },
+                { 0.023, 3.634, 0.931, 6.000 },
+                { 0.013, 3.091, 0.931, 5.000 }
+        };
+
+        testIncSliceLineCustomInputs(newX, e, K, correctRes);
+    }
+
+    @Test
+    public void testIncSliceLineCustomInputs3() {
+        double[][] newX = {
+                { 2, 1, 1, 2, 3, 2, 3, 3, 1, 2 },
+                { 2, 2, 2, 3, 4, 1, 2, 1, 3, 2 },
+                { 2, 1, 3, 3, 2, 2, 3, 1, 1, 4 },
+                { 1, 2, 2, 1, 3, 2, 3, 2, 2, 3 },
+                { 3, 2, 3, 4, 3, 3, 4, 1, 1, 3 },
+                { 4, 3, 2, 3, 4, 4, 3, 4, 1, 1 },
+                { 2, 2, 2, 4, 3, 3, 2, 2, 1, 2 },
+                { 1, 1, 2, 2, 3, 3, 2, 1, 1, 2 },
+                { 4, 3, 2, 1, 3, 2, 4, 2, 4, 3 },
+                { 1, 3, 1, 4, 1, 3, 3, 2, 3, 2 },
+                { 2, 4, 3, 1, 2, 4, 1, 3, 2, 4 },
+                { 3, 2, 4, 3, 1, 4, 2, 3, 4, 1 },
+                { 4, 1, 2, 4, 3, 1, 4, 2, 1, 3 },
+                { 1, 3, 4, 2, 4, 3, 1, 4, 2, 3 },
+                { 2, 4, 1, 3, 2, 4, 3, 1, 4, 2 },
+                { 3, 2, 4, 1, 3, 4, 2, 3, 1, 4 },
+                { 4, 1, 3, 2, 4, 1, 4, 2, 3, 1 },
+                { 1, 3, 2, 4, 1, 3, 4, 2, 4, 3 },
+                { 2, 4, 1, 3, 2, 4, 3, 1, 2, 4 },
+                { 2, 3, 3, 2, 1, 4, 2, 3, 2, 3 },
+                { 2, 1, 1, 1, 3, 4, 2, 2, 1, 2 },
+                { 3, 3, 3, 2, 1, 2, 3, 1, 4, 2 },
+                { 3, 2, 3, 1, 1, 1, 4, 3, 4, 2 },
+                { 1, 3, 2, 3, 2, 3, 2, 1, 2, 1 },
+                { 4, 3, 1, 1, 1, 1, 1, 1, 3, 2 },
+                { 2, 2, 3, 3, 2, 2, 2, 3, 4, 1 },
+                { 3, 2, 2, 2, 4, 4, 2, 4, 1, 1 },
+                { 1, 3, 3, 2, 1, 3, 1, 2, 4, 4 },
+                { 2, 1, 2, 2, 3, 1, 2, 3, 2, 1 },
+                { 4, 1, 3, 4, 1, 4, 2, 3, 4, 4 },
+                { 4, 2, 4, 4, 2, 1, 2, 1, 1, 4 },
+                { 4, 1, 1, 4, 1, 4, 3, 2, 4, 2 },
+                { 2, 1, 2, 2, 3, 1, 4, 3, 3, 4 },
+                { 4, 1, 3, 1, 3, 1, 2, 1, 3, 3 },
+                { 2, 1, 3, 1, 1, 3, 1, 2, 1, 2 },
+                { 1, 3, 4, 3, 1, 2, 2, 2, 1, 1 },
+                { 2, 4, 4, 3, 4, 1, 2, 1, 2, 4 },
+                { 3, 3, 3, 3, 3, 1, 2, 3, 4, 4 },
+                { 3, 2, 2, 2, 4, 1, 4, 2, 3, 1 },
+                { 1, 2, 3, 2, 4, 3, 2, 3, 2, 3 }
+        };
+        double[][] e = {
+                { 0.159 }, { 0.588 }, { 0.414 }, { 0.305 }, { 0.193 }, { 0.195 
}, { 0.878 }, { 0.149 }, { 0.835 },
+                { 0.344 },
+                { 0.123 }, { 0.456 }, { 0.789 }, { 0.987 }, { 0.654 }, { 0.321 
}, { 0.246 }, { 0.135 }, { 0.579 },
+                { 0.802 },
+                { 0.591 }, { 0.858 }, { 0.144 }, { 0.350 }, { 0.931 }, { 0.951 
}, { 0.788 }, { 0.491 }, { 0.358 },
+                { 0.443 },
+                { 0.231 }, { 0.564 }, { 0.897 }, { 0.879 }, { 0.546 }, { 0.132 
}, { 0.462 }, { 0.153 }, { 0.759 },
+                { 0.028 }
+        };
+        int K = 10;
+        double[][] correctRes = {
+                { 0.149, 4.300, 0.931, 6.000 },
+                { 0.113, 3.138, 0.987, 4.000 },
+                { 0.093, 4.644, 0.931, 7.000 },
+                { 0.090, 4.630, 0.951, 7.000 },
+                { 0.059, 8.002, 0.951, 14.000 },
+                { 0.024, 2.954, 0.951, 4.000 },
+                { 0.017, 3.415, 0.897, 5.000 },
+                { 0.010, 3.398, 0.878, 5.000 },
+                { 0.009, 2.923, 0.897, 4.000 },
+                { 0.008, 3.391, 0.897, 5.000 }
+        };
+        testIncSliceLineCustomInputs(newX, e, K, correctRes);
+    }
+
+    // @Test
+    // public void testTop10SparkTP() {
+    // runIncSliceLineTest(10, false, ExecMode.SPARK);
+    // }
+
+    private void runIncSliceLineTest(int K, String err, boolean dp, boolean 
selCols, ExecMode mode) {
+        ExecMode platformOld = setExecMode(mode);
+        loadTestConfiguration(getTestConfiguration(TEST_NAME));
+        String HOME = SCRIPT_DIR + TEST_DIR;
+        String data = DATASET_DIR + "Salaries.csv";
+
+        try {
+            loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+            // run data preparation
+            fullDMLScriptName = HOME + PREP_NAME + ".dml";
+            programArgs = new String[] { "-args", data, err, output("newX"), 
output("e") };
+            runTest(true, false, null, -1);
+
+            // read output and store for dml and R
+            double[][] newX = 
TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromOutputDir("newX"));
+            double[][] e = 
TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromOutputDir("e"));
+            writeInputMatrixWithMTD("newX", newX, true);
+            writeInputMatrixWithMTD("e", e, true);
+
+            // execute main test
+            fullDMLScriptName = HOME + TEST_NAME + ".dml";
+            programArgs = new String[] { "-args", input("newX"), input("e"), 
String.valueOf(K),
+                    String.valueOf(!dp).toUpperCase(), 
String.valueOf(selCols).toUpperCase(),
+                    String.valueOf(VERBOSE).toUpperCase(), output("R") };
+
+            runTest(true, false, null, -1);
+
+            HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromOutputDir("R");
+
+            // execute main test
+            fullDMLScriptName = HOME + "slicefinder" + ".dml";
+            programArgs = new String[] { "-args", input("newX"), input("e"), 
String.valueOf(K),
+                    String.valueOf(!dp).toUpperCase(), 
String.valueOf(selCols).toUpperCase(),
+                    String.valueOf(VERBOSE).toUpperCase(), output("R") };
+
+            runTest(true, false, null, -1);
+
+            HashMap<CellIndex, Double> dmlfile2 = 
readDMLMatrixFromOutputDir("R");
+
+            TestUtils.compareMatrices(dmlfile, dmlfile2, 1e-2, 
"Stat-IncSliceLine", "Stat-Slicefinder");
+
+            // compare expected results
+            if (err.equals("e")) {
+                double[][] ret = 
TestUtils.convertHashMapToDoubleArray(dmlfile);
+                if (mode != ExecMode.SPARK) // TODO why only CP correct, but R 
always matches? test framework?
+                    for (int i = 0; i < K; i++)
+                        TestUtils.compareMatrices(EXPECTED_TOPK[i], ret[i], 
1e-2);
+            }
+
+            // ensure proper inlining, despite initially multiple calls and 
large function
+            Assert.assertFalse(heavyHittersContainsSubString("evalSlice"));
+        } finally {
+            rtplatform = platformOld;
+        }
+    }
+
+    public void testIncSliceLineCustomInputs(double[][] newX, double[][] e, 
int K, double[][] correctRes) {
+        boolean dp = true, selCols = false;
+        ExecMode mode = ExecMode.SINGLE_NODE;
+        ExecMode platformOld = setExecMode(mode);
+        loadTestConfiguration(getTestConfiguration(TEST_NAME));
+        String HOME = SCRIPT_DIR + TEST_DIR;
+
+        try {
+            loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+            writeInputMatrixWithMTD("newX", newX, false);
+            writeInputMatrixWithMTD("e", e, false);
+
+            fullDMLScriptName = HOME + TEST_NAME + ".dml";
+            programArgs = new String[] { "-args", input("newX"), input("e"), 
String.valueOf(K),
+                    String.valueOf(!dp).toUpperCase(), 
String.valueOf(selCols).toUpperCase(),
+                    String.valueOf(VERBOSE).toUpperCase(), output("R") };
+
+            runTest(true, false, null, -1);
+
+            HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromOutputDir("R");
+            double[][] ret = TestUtils.convertHashMapToDoubleArray(dmlfile);
+            TestUtils.compareMatrices(correctRes, ret, 1e-2);
+
+            Assert.assertFalse(heavyHittersContainsSubString("evalSlice"));
+        } finally {
+            rtplatform = platformOld;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/incSliceLine.dml 
b/src/test/scripts/functions/builtin/incSliceLine.dml
new file mode 100644
index 0000000000..72843cab32
--- /dev/null
+++ b/src/test/scripts/functions/builtin/incSliceLine.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+newX = read($1);
+e = read($2);
+
+# call slice finding
+[TS,TR] = incSliceLine(newX=newX, e=e, k=$3,
+  alpha=0.95, minSup=4, tpEval=$4, selFeat=$5, verbose=$6);
+
+write(TR, $7)


Reply via email to