This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 5ec8d0c06a [SYSTEMDS-3696] Basic incremental slice-line builtin, and
tests
5ec8d0c06a is described below
commit 5ec8d0c06a99cdf1250d2d85c6dbc8e43e84ea19
Author: Frederic Zoepffel <[email protected]>
AuthorDate: Mon May 20 16:36:09 2024 +0200
[SYSTEMDS-3696] Basic incremental slice-line builtin, and tests
Closes #2024.
---
scripts/builtin/incSliceLine.dml | 383 ++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../builtin/part2/BuiltinIncSliceLineTest.java | 403 +++++++++++++++++++++
.../scripts/functions/builtin/incSliceLine.dml | 29 ++
4 files changed, 816 insertions(+)
diff --git a/scripts/builtin/incSliceLine.dml b/scripts/builtin/incSliceLine.dml
new file mode 100644
index 0000000000..f6c02fac9b
--- /dev/null
+++ b/scripts/builtin/incSliceLine.dml
@@ -0,0 +1,383 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This builtin function implements SliceLine, a linear-algebra-based
+# ML model debugging technique for finding the top-k data slices where
+# a trained models performs significantly worse than on the overall
+# dataset. For a detailed description and experimental results, see:
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
Slice Finding for ML Model Debugging.(SIGMOD 2021)
+#
+# INPUT:
+#
---------------------------------------------------------------------------------------
+# newX Feature matrix in recoded/binned representation
+# oldX All-comprising feature matrix of previous runs in recoded/binned
representation
+# e Error vector of trained model
+# k Number of subsets required
+# maxL maximum level L (conjunctions of L predicates), 0 unlimited
+# minSup minimum support (min number of rows per slice)
+# alpha weight [0,1]: 0 only size, 1 only error
+# tpEval flag for task-parallel slice evaluation,
+# otherwise data-parallel
+# tpBlksz block size for task-parallel execution (num slices)
+# selFeat flag for removing one-hot-encoded features that don`t satisfy
+# the initial minimum-support constraint and/or have zero error
+# verbose flag for verbose debug output
+# prevL previous lattice (for incremental updates)
+# prevRL previous statistics whole lattice (for incremental updates)
+#
---------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#
-----------------------------------------------------------------------------------------
+# TK top-k slices (k x ncol(newX) if successful)
+# TKC score, size, error of slices (k x 3)
+# D debug matrix, populated with enumeration stats if verbose
+# L lattice matrix
+# RL statistics matrix for all slices in L
+# Xout feature matrix consisting of oldX and newX for next run
+#
-----------------------------------------------------------------------------------------
+
+m_incSliceLine = function(Matrix[Double] newX, Matrix[Double] oldX = matrix(0,
0, 0), Matrix[Double] e, Int k = 4,
+ Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
+ Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE,
+ Matrix[Double] prevLattice = matrix(0, 0, 0) , Matrix[Double] prevRL =
matrix(0, 0, 0))
+ return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D,
Matrix[Double] L, Matrix[Double] RL, Matrix[Double] Xout)
+{
+ # TODO convert input/output of previous enumerated slices to lists
+ # for simple collection and processing
+ t1 = time();
+
+ # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
+ D = matrix(0, 0, 5);
+
+ m = nrow(newX);
+ n = ncol(newX);
+
+ # prepare offset vectors and one-hot encoded newX
+ fdom = colMaxs(newX);
+ foffb = t(cumsum(t(fdom))) - fdom;
+ foffe = t(cumsum(t(fdom)))
+ rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
+ cix = matrix(newX + foffb, m*n, 1);
+ X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
+
+ # initialize statistics and basic slices
+ n2 = ncol(X2); # one-hot encoded features
+ eAvg = sum(e) / m; # average error
+ [S, R, selCols] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha,
verbose);
+
+ # initialize Lattice and Statistics
+ L = S
+ RL = R
+
+ # initialize top-k
+ [TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
+
+ if( verbose ) {
+ [maxsc, minsc] = analyzeTopK(TKC);
+ print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+",
min="+minsc+" (time="+(time()-t1)+")")
+ D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc))));
+ }
+
+ # reduced dataset to relevant attributes (minSup, err>0), S reduced
on-the-fly
+ if( selFeat )
+ X2 = removeEmpty(target=X2, margin="cols", select=t(selCols));
+
+ # lattice enumeration w/ size/error pruning, one iteration per level
+ # termination condition (max #feature levels)
+ maxL = ifelse(maxL<=0, n, maxL)
+ level = 1;
+ while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
+ level = level + 1;
+
+ # enumerate candidate join pairs, incl size/error pruning
+ nrS = nrow(S);
+ S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2,
foffb, foffe);
+ S2 = S;
+
+ # update lattice and statistics
+ L = rbind(L, S);
+
+ if(selFeat)
+ S2 = removeEmpty(target=S, margin="cols", select=t(selCols));
+
+ if(verbose) {
+ print("\nSliceFinder: level "+level+":")
+ print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
+ }
+
+ if( nrow(S) > 0 ) {
+ # extract and evaluate candidate slices
+ if( tpEval ) { # task-parallel
+ # hybrid task-parallel w/ 1 matrix-matrix for blocks of 16
matrix-vector
+ R = matrix(0, nrow(S), 4)
+ parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
+ beg = (i-1)*tpBlksz + 1;
+ end = min(i*tpBlksz, nrow(R));
+ R[beg:end,] = evalSlice(X2, e, eAvg, t(S2[beg:end,]), level, alpha);
+ }
+ RL = rbind(RL, R);
+ }
+ else { # data-parallel
+ R = evalSlice(X2, e, eAvg, t(S2), level, alpha);
+ RL = rbind(RL, R);
+ }
+
+ # maintain top-k after evaluation
+ [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
+
+ if(verbose) {
+ [maxsc, minsc] = analyzeTopK(TKC);
+ valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
+ print(" -- valid slices after eval: "+valid+"/"+nrow(S));
+ print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
+ print(" -- (time="+(time()-t1)+")")
+ D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc))));
+ }
+ }
+ }
+
+ TK = decodeOneHot(TK, foffb, foffe);
+
+ # prepare output feature matrix for next run
+ if (nrow(oldX) > 0){
+ Xout = rbind(oldX, newX);
+ } else {
+ Xout = newX;
+ }
+
+ L = decodeOneHot(L, foffb, foffe)
+
+
+ if( verbose ) {
+ print("SliceFinder: terminated at level "+level+":\n"
+ + toString(TK) + "\n" + toString(TKC));
+ }
+
+ print("Lattice: \n "+ toString(L) +":\n"
+ + "Statistics: \n "+ toString(RL));
+}
+
+createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
+ Double eAvg, Double minSup, Double alpha, Boolean verbose)
+ return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols)
+{
+ n2 = ncol(X2);
+ cCnts = t(colSums(X2)); # column counts
+ err = t(t(e) %*% X2); # total error vector
+ merr = t(colMaxs(X2 * e)); # maximum error vector
+
+ if( verbose ) {
+ drop = as.integer(sum(cCnts < minSup | err == 0));
+ print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup =
"+minSup+".");
+ }
+
+ # working set of active slices (#attr x #slices) and top k
+ selCols = (cCnts >= minSup & err > 0);
+ attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols);
+ ss = removeEmpty(target=cCnts, margin="rows", select=selCols);
+ se = removeEmpty(target=err, margin="rows", select=selCols);
+ sm = removeEmpty(target=merr, margin="rows", select=selCols);
+ S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
+
+ # score 1-slices and create initial top-k
+ sc = score(ss, se, eAvg, alpha, nrow(X2));
+ R = cbind(sc, se, sm, ss);
+}
+
+score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double
alpha, Integer n)
+ return(Matrix[Double] sc)
+{
+ sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
+ sc = replace(target=sc, pattern=NaN, replacement=-Inf);
+}
+
+scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm,
+ Double eAvg, Integer minSup, Double alpha, Integer n)
+ return(Matrix[Double] sc)
+{
+ # Initial upper bound equation (with minSup and ss in pos/neg terms)
+ # sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
+
+ # Since sc is either monotonically increasing or decreasing, we
+ # probe interesting points of sc in the interval [minSup, ss],
+ # and compute the maximum to serve as the upper bound
+ s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss)
+ sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n -
1));
+ sc = replace(target=sc, pattern=NaN, replacement=-Inf);
+}
+
+
+maintainTopK = function(Matrix[Double] S, Matrix[Double] R,
+ Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup)
+ return(Matrix[Double] TK, Matrix[Double] TKC)
+{
+ # prune invalid minSup and scores
+ I = (R[,1] > 0) & (R[,4] >= minSup);
+
+ if( sum(I)!=0 ) {
+ S = removeEmpty(target=S, margin="rows", select=I);
+ R = removeEmpty(target=R, margin="rows", select=I);
+
+ # evaluated candidated and previous top-k
+ slices = rbind(TK, S);
+ scores = rbind(TKC, R);
+
+ # extract top-k
+ IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
+ IX = IX[1:min(k,nrow(IX)),];
+ P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
+ TK = P %*% slices;
+ TKC = P %*% scores;
+ }
+}
+
+analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
+ maxsc = -Inf;
+ minsc = -Inf;
+ if( nrow(TKC)>0 ) {
+ maxsc = as.scalar(TKC[1,1]);
+ minsc = as.scalar(TKC[nrow(TKC),1]);
+ }
+}
+
+getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
+ Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level,
+ Double eAvg, Integer minSup, Double alpha, Integer n2,
+ Matrix[Double] foffb, Matrix[Double] foffe)
+ return(Matrix[Double] P)
+{
+ # prune invalid slices (possible without affecting overall
+ # pruning effectiveness due to handling of missing parents)
+ pI = (R[,4] >= minSup & R[,2] > 0);
+ S = removeEmpty(target=S, margin="rows", select=pI)
+ R = removeEmpty(target=R, margin="rows", select=pI)
+
+ # join compatible slices (without self)
+ join = S %*% t(S) == (level-2)
+ I = upper.tri(target=join, diag=FALSE, values=TRUE);
+
+ # pair construction
+ nr = nrow(I); nc = ncol(I);
+ rix = matrix(I * seq(1,nr), nr*nc, 1);
+ cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
+ rix = removeEmpty(target=rix, margin="rows");
+ cix = removeEmpty(target=cix, margin="rows");
+
+ P = matrix(0,0,ncol(S))
+ if( sum(rix)!=0 ) {
+ P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
+ P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
+ P12 = P1 + P2; # combined slice
+ P = (P1 %*% S + P2 %*% S) != 0;
+
+ se = min(P1 %*% R[,2], P2 %*% R[,2])
+ sm = min(P1 %*% R[,3], P2 %*% R[,3])
+ ss = min(P1 %*% R[,4], P2 %*% R[,4])
+
+ # prune invalid self joins (>1 bit per feature)
+ I = matrix(1, nrow(P), 1);
+ for( j in 1:ncol(foffb) ) {
+ beg = as.scalar(foffb[1,j])+1;
+ end = as.scalar(foffe[1,j]);
+ I = I & (rowSums(P[,beg:end]) <= 1);
+ }
+ P12 = removeEmpty(target=P12, margin="rows", select=I)
+ P = removeEmpty(target=P, margin="rows", select=I);
+ ss = removeEmpty(target=ss, margin="rows", select=I);
+ se = removeEmpty(target=se, margin="rows", select=I);
+ sm = removeEmpty(target=sm, margin="rows", select=I);
+
+ # prepare IDs for deduplication and pruning
+ ID = matrix(0, nrow(P), 1);
+ dom = foffe-foffb+1;
+ for( j in 1:ncol(dom) ) {
+ beg = as.scalar(foffb[1,j])+1;
+ end = as.scalar(foffe[1,j]);
+ I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]);
+ prod = 1;
+ if(j<ncol(dom))
+ prod = prod(dom[1,(j+1):ncol(dom)])
+ ID = ID + I * prod;
+ }
+
+ # ID transformation to avoid exceeding INT_MAX and
+ # and to void creating huge sparse intermediates
+ [ID, M] = transformencode(target=as.frame(ID),
spec="{ids:true,recode:[1]}")
+
+ # size pruning, with rowMin-rowMax transform
+ # to avoid densification (ignored zeros)
+ map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
+ ubSizes = 1/rowMaxs(map * (1/t(ss)));
+ ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
+ fSizes = (ubSizes >= minSup)
+
+ # error pruning
+ ubError = 1/rowMaxs(map * (1/t(se)));
+ ubError = replace(target=ubError, pattern=Inf, replacement=0);
+ ubMError = 1/rowMaxs(map * (1/t(sm)));
+ ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
+ ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
+ [maxsc, minsc] = analyzeTopK(TKC);
+ fScores = (ubScores > minsc & ubScores > 0)
+
+ # missing parents pruning
+ numParents = rowSums((map %*% P12) != 0)
+ fParents = (numParents == level);
+
+ # apply all pruning
+ fall = (fSizes & fScores & fParents);
+
+ # deduplication of join outputs
+ Dedup = removeEmpty(target=map, margin="rows", select=fall) != 0
+ #P = (Dedup %*% P) != 0, replaced by below (easier sparsity propagation)
+ DeI = table(rowIndexMax(Dedup), 1, nrow(P), 1);
+ P = removeEmpty(target=P, margin="rows", select=DeI);
+ }
+}
+
+evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg,
+ Matrix[Double] tS, Integer l, Double alpha)
+ return(Matrix[Double] R)
+{
+ I = (X %*% tS) == l; # slice indicator
+ ss = t(colSums(I)); # absolute slice size (nnz)
+ se = t(t(e) %*% I); # absolute slice error
+ sm = t(colMaxs(I * e)); # maximum tuple error in slice
+
+ # score of relative error and relative size
+ sc = score(ss, se, eAvg, alpha, nrow(X));
+ R = cbind(sc, se, sm, ss);
+}
+
+decodeOneHot = function(Matrix[Double] M, Matrix[Double] foffb, Matrix[Double]
foffe)
+ return(Matrix[Double] M)
+{
+ R = matrix(1, nrow(M), ncol(foffb));
+ if( nrow(M) > 0 ) {
+ parfor( j in 1:ncol(foffb) ) {
+ beg = as.scalar(foffb[1,j])+1;
+ end = as.scalar(foffe[1,j]);
+ I = rowSums(M[,beg:end]) * rowIndexMax(M[,beg:end]);
+ R[, j] = I;
+ }
+ }
+ M = R;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index 5c334f72ae..de3f2b5c5a 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -192,6 +192,7 @@ public enum Builtins {
IMPUTE_BY_MODE_APPLY("imputeByModeApply", true),
IMPUTE_FD("imputeByFD", true),
IMPUTE_FD_APPLY("imputeByFDApply", true),
+ INCSLICELINE("incSliceLine", true),
INTERQUANTILE("interQuantile", false),
INTERSECT("intersect", true),
INVERSE("inv", "inverse", false),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIncSliceLineTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIncSliceLineTest.java
new file mode 100644
index 0000000000..82b1a65265
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIncSliceLineTest.java
@@ -0,0 +1,403 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+
+public class BuiltinIncSliceLineTest extends AutomatedTestBase {
+ private static final String PREP_NAME = "slicefinderPrep";
+ private static final String TEST_NAME = "incSliceLine";
+ private static final String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = TEST_DIR +
BuiltinIncSliceLineTest.class.getSimpleName() + "/";
+ private static final boolean VERBOSE = true;
+
+ private static final double[][] EXPECTED_TOPK = new double[][] {
+ { 1.042, 69210699988.477, 11078019685.642, 18.000 },
+ { 0.478, 92957580467.849, 11078019685.642, 39.000 },
+ { 0.316, 40425449547.480, 11078019685.642, 10.000 },
+ { 0.262, 67630559163.266, 7261504482.540, 29.000 },
+ { 0.224, 202448990843.317, 11119010986.000, 125.000 },
+ { 0.218, 68860581248.568, 7261504482.540, 31.000 },
+ { 0.164, 206527445340.279, 11119010986.000, 135.000 },
+ { 0.122, 68961886413.866, 7261504482.540, 34.000 },
+ { 0.098, 360278523220.479, 11119010986.000, 266.000 },
+ { 0.092, 73954209826.485, 11078019685.642, 39.000 }
+ };
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR,
TEST_NAME, new String[] { "R" }));
+ }
+
+ @Test
+ public void testTop4HybridDP() {
+ runIncSliceLineTest(4, "e", true, false, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop4SinglenodeDP() {
+ runIncSliceLineTest(4, "e", true, false, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop4HybridTP() {
+ runIncSliceLineTest(4, "e", false, false, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop4SinglenodeTP() {
+ runIncSliceLineTest(4, "e", false, false, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop10HybridDP() {
+ runIncSliceLineTest(10, "e", true, false, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop10SinglenodeDP() {
+ runIncSliceLineTest(10, "e", true, false, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop10HybridTP() {
+ runIncSliceLineTest(10, "e", false, false, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop10SinglenodeTP() {
+ runIncSliceLineTest(10, "e", false, false, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop4HybridDPSel() {
+ runIncSliceLineTest(4, "e", true, true, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop4SinglenodeDPSel() {
+ runIncSliceLineTest(4, "e", true, true, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop4HybridTPSel() {
+ runIncSliceLineTest(4, "e", false, true, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop4SinglenodeTPSel() {
+ runIncSliceLineTest(4, "e", false, true, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop10HybridDPSel() {
+ runIncSliceLineTest(10, "e", true, true, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop10SinglenodeDPSel() {
+ runIncSliceLineTest(10, "e", true, true, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop10HybridTPSel() {
+ runIncSliceLineTest(10, "e", false, true, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop10SinglenodeTPSel() {
+ runIncSliceLineTest(10, "e", false, true, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testTop10HybridTPSelE2() {
+ runIncSliceLineTest(10, "oe", false, true, ExecMode.HYBRID);
+ }
+
+ @Test
+ public void testTop10SinglenodeTPSelE2() {
+ runIncSliceLineTest(10, "oe", false, true, ExecMode.SINGLE_NODE);
+ }
+
+ @Test
+ public void testIncSliceLineCustomInputs1() {
+ double[][] newX = {
+ { 2, 1, 1, 2, 3, 2, 3, 3, 1, 2 },
+ { 2, 2, 2, 3, 4, 1, 2, 1, 3, 2 },
+ { 2, 1, 3, 3, 2, 2, 3, 1, 1, 4 },
+ { 1, 2, 2, 1, 3, 2, 3, 2, 2, 3 },
+ { 3, 2, 3, 4, 3, 3, 4, 1, 1, 3 },
+ { 4, 3, 2, 3, 4, 4, 3, 4, 1, 1 },
+ { 2, 2, 2, 4, 3, 3, 2, 2, 1, 2 },
+ { 1, 1, 2, 2, 3, 3, 2, 1, 1, 2 },
+ { 4, 3, 2, 1, 3, 2, 4, 2, 4, 3 },
+ { 1, 3, 1, 4, 1, 3, 3, 2, 3, 2 },
+ { 2, 4, 3, 1, 2, 4, 1, 3, 2, 4 },
+ { 3, 2, 4, 3, 1, 4, 2, 3, 4, 1 },
+ { 4, 1, 2, 4, 3, 1, 4, 2, 1, 3 },
+ { 1, 3, 4, 2, 4, 3, 1, 4, 2, 3 },
+ { 2, 4, 1, 3, 2, 4, 3, 1, 4, 2 },
+ { 3, 2, 4, 1, 3, 4, 2, 3, 1, 4 },
+ { 4, 1, 3, 2, 4, 1, 4, 2, 3, 1 },
+ { 1, 3, 2, 4, 1, 3, 4, 2, 4, 3 },
+ { 2, 4, 1, 3, 2, 4, 3, 1, 2, 4 },
+ { 2, 3, 3, 2, 1, 4, 2, 3, 2, 3 }
+ };
+ double[][] e = {
+ { 0.159 }, { 0.588 }, { 0.414 }, { 0.305 }, { 0.193 }, { 0.195
}, { 0.878 }, { 0.149 }, { 0.835 },
+ { 0.344 },
+ { 0.123 }, { 0.456 }, { 0.789 }, { 0.987 }, { 0.654 }, { 0.321
}, { 0.246 }, { 0.135 }, { 0.579 },
+ { 0.802 }
+ };
+ int K = 10;
+ double[][] correctRes = {
+ { 0.307, 2.807, 0.878, 4.000 },
+ { 0.307, 2.807, 0.878, 4.000 },
+ { 0.282, 2.759, 0.987, 4.000 },
+ { 0.157, 4.046, 0.987, 7.000 },
+ { 0.127, 2.956, 0.878, 5.000 },
+ { 0.122, 2.942, 0.878, 5.000 },
+ { 0.074, 3.298, 0.987, 6.000 },
+ { 0.064, 4.197, 0.878, 8.000 },
+ { 0.061, 2.796, 0.987, 5.000 },
+ { 0.038, 3.194, 0.878, 6.000 }
+ };
+ testIncSliceLineCustomInputs(newX, e, K, correctRes);
+ }
+
+ @Test
+ public void testIncSliceLineCustomInputs2() {
+ double[][] newX = {
+ { 2, 1, 1, 1, 3, 4, 2, 2, 1, 2 },
+ { 3, 3, 3, 2, 1, 2, 3, 1, 4, 2 },
+ { 3, 2, 3, 1, 1, 1, 4, 3, 4, 2 },
+ { 1, 3, 2, 3, 2, 3, 2, 1, 2, 1 },
+ { 4, 3, 1, 1, 1, 1, 1, 1, 3, 2 },
+ { 2, 2, 3, 3, 2, 2, 2, 3, 4, 1 },
+ { 3, 2, 2, 2, 4, 4, 2, 4, 1, 1 },
+ { 1, 3, 3, 2, 1, 3, 1, 2, 4, 4 },
+ { 2, 1, 2, 2, 3, 1, 2, 3, 2, 1 },
+ { 4, 1, 3, 4, 1, 4, 2, 3, 4, 4 },
+ { 4, 2, 4, 4, 2, 1, 2, 1, 1, 4 },
+ { 4, 1, 1, 4, 1, 4, 3, 2, 4, 2 },
+ { 2, 1, 2, 2, 3, 1, 4, 3, 3, 4 },
+ { 4, 1, 3, 1, 3, 1, 2, 1, 3, 3 },
+ { 2, 1, 3, 1, 1, 3, 1, 2, 1, 2 },
+ { 1, 3, 4, 3, 1, 2, 2, 2, 1, 1 },
+ { 2, 4, 4, 3, 4, 1, 2, 1, 2, 4 },
+ { 3, 3, 3, 3, 3, 1, 2, 3, 4, 4 },
+ { 3, 2, 2, 2, 4, 1, 4, 2, 3, 1 },
+ { 1, 2, 3, 2, 4, 3, 2, 3, 2, 3 }
+ };
+
+ double[][] e = {
+ { 0.591 }, { 0.858 }, { 0.144 }, { 0.350 }, { 0.931 }, { 0.951
}, { 0.788 }, { 0.491 }, { 0.358 },
+ { 0.443 },
+ { 0.231 }, { 0.564 }, { 0.897 }, { 0.879 }, { 0.546 }, { 0.132
}, { 0.462 }, { 0.153 }, { 0.759 },
+ { 0.028 }
+ };
+ int K = 10;
+ double[][] correctRes = {
+ { 0.410, 3.466, 0.931, 4.000 },
+ { 0.410, 3.466, 0.931, 4.000 },
+ { 0.111, 2.802, 0.897, 4.000 },
+ { 0.075, 3.805, 0.951, 6.000 },
+ { 0.057, 4.278, 0.897, 7.000 },
+ { 0.047, 3.711, 0.931, 6.000 },
+ { 0.035, 3.152, 0.897, 5.000 },
+ { 0.032, 4.179, 0.897, 7.000 },
+ { 0.023, 3.634, 0.931, 6.000 },
+ { 0.013, 3.091, 0.931, 5.000 }
+ };
+
+ testIncSliceLineCustomInputs(newX, e, K, correctRes);
+ }
+
+ @Test
+ public void testIncSliceLineCustomInputs3() {
+ double[][] newX = {
+ { 2, 1, 1, 2, 3, 2, 3, 3, 1, 2 },
+ { 2, 2, 2, 3, 4, 1, 2, 1, 3, 2 },
+ { 2, 1, 3, 3, 2, 2, 3, 1, 1, 4 },
+ { 1, 2, 2, 1, 3, 2, 3, 2, 2, 3 },
+ { 3, 2, 3, 4, 3, 3, 4, 1, 1, 3 },
+ { 4, 3, 2, 3, 4, 4, 3, 4, 1, 1 },
+ { 2, 2, 2, 4, 3, 3, 2, 2, 1, 2 },
+ { 1, 1, 2, 2, 3, 3, 2, 1, 1, 2 },
+ { 4, 3, 2, 1, 3, 2, 4, 2, 4, 3 },
+ { 1, 3, 1, 4, 1, 3, 3, 2, 3, 2 },
+ { 2, 4, 3, 1, 2, 4, 1, 3, 2, 4 },
+ { 3, 2, 4, 3, 1, 4, 2, 3, 4, 1 },
+ { 4, 1, 2, 4, 3, 1, 4, 2, 1, 3 },
+ { 1, 3, 4, 2, 4, 3, 1, 4, 2, 3 },
+ { 2, 4, 1, 3, 2, 4, 3, 1, 4, 2 },
+ { 3, 2, 4, 1, 3, 4, 2, 3, 1, 4 },
+ { 4, 1, 3, 2, 4, 1, 4, 2, 3, 1 },
+ { 1, 3, 2, 4, 1, 3, 4, 2, 4, 3 },
+ { 2, 4, 1, 3, 2, 4, 3, 1, 2, 4 },
+ { 2, 3, 3, 2, 1, 4, 2, 3, 2, 3 },
+ { 2, 1, 1, 1, 3, 4, 2, 2, 1, 2 },
+ { 3, 3, 3, 2, 1, 2, 3, 1, 4, 2 },
+ { 3, 2, 3, 1, 1, 1, 4, 3, 4, 2 },
+ { 1, 3, 2, 3, 2, 3, 2, 1, 2, 1 },
+ { 4, 3, 1, 1, 1, 1, 1, 1, 3, 2 },
+ { 2, 2, 3, 3, 2, 2, 2, 3, 4, 1 },
+ { 3, 2, 2, 2, 4, 4, 2, 4, 1, 1 },
+ { 1, 3, 3, 2, 1, 3, 1, 2, 4, 4 },
+ { 2, 1, 2, 2, 3, 1, 2, 3, 2, 1 },
+ { 4, 1, 3, 4, 1, 4, 2, 3, 4, 4 },
+ { 4, 2, 4, 4, 2, 1, 2, 1, 1, 4 },
+ { 4, 1, 1, 4, 1, 4, 3, 2, 4, 2 },
+ { 2, 1, 2, 2, 3, 1, 4, 3, 3, 4 },
+ { 4, 1, 3, 1, 3, 1, 2, 1, 3, 3 },
+ { 2, 1, 3, 1, 1, 3, 1, 2, 1, 2 },
+ { 1, 3, 4, 3, 1, 2, 2, 2, 1, 1 },
+ { 2, 4, 4, 3, 4, 1, 2, 1, 2, 4 },
+ { 3, 3, 3, 3, 3, 1, 2, 3, 4, 4 },
+ { 3, 2, 2, 2, 4, 1, 4, 2, 3, 1 },
+ { 1, 2, 3, 2, 4, 3, 2, 3, 2, 3 }
+ };
+ double[][] e = {
+ { 0.159 }, { 0.588 }, { 0.414 }, { 0.305 }, { 0.193 }, { 0.195
}, { 0.878 }, { 0.149 }, { 0.835 },
+ { 0.344 },
+ { 0.123 }, { 0.456 }, { 0.789 }, { 0.987 }, { 0.654 }, { 0.321
}, { 0.246 }, { 0.135 }, { 0.579 },
+ { 0.802 },
+ { 0.591 }, { 0.858 }, { 0.144 }, { 0.350 }, { 0.931 }, { 0.951
}, { 0.788 }, { 0.491 }, { 0.358 },
+ { 0.443 },
+ { 0.231 }, { 0.564 }, { 0.897 }, { 0.879 }, { 0.546 }, { 0.132
}, { 0.462 }, { 0.153 }, { 0.759 },
+ { 0.028 }
+ };
+ int K = 10;
+ double[][] correctRes = {
+ { 0.149, 4.300, 0.931, 6.000 },
+ { 0.113, 3.138, 0.987, 4.000 },
+ { 0.093, 4.644, 0.931, 7.000 },
+ { 0.090, 4.630, 0.951, 7.000 },
+ { 0.059, 8.002, 0.951, 14.000 },
+ { 0.024, 2.954, 0.951, 4.000 },
+ { 0.017, 3.415, 0.897, 5.000 },
+ { 0.010, 3.398, 0.878, 5.000 },
+ { 0.009, 2.923, 0.897, 4.000 },
+ { 0.008, 3.391, 0.897, 5.000 }
+ };
+ testIncSliceLineCustomInputs(newX, e, K, correctRes);
+ }
+
+ // @Test
+ // public void testTop10SparkTP() {
+ // runIncSliceLineTest(10, false, ExecMode.SPARK);
+ // }
+
+ private void runIncSliceLineTest(int K, String err, boolean dp, boolean
selCols, ExecMode mode) {
+ ExecMode platformOld = setExecMode(mode);
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ String data = DATASET_DIR + "Salaries.csv";
+
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ // run data preparation
+ fullDMLScriptName = HOME + PREP_NAME + ".dml";
+ programArgs = new String[] { "-args", data, err, output("newX"),
output("e") };
+ runTest(true, false, null, -1);
+
+ // read output and store for dml and R
+ double[][] newX =
TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromOutputDir("newX"));
+ double[][] e =
TestUtils.convertHashMapToDoubleArray(readDMLMatrixFromOutputDir("e"));
+ writeInputMatrixWithMTD("newX", newX, true);
+ writeInputMatrixWithMTD("e", e, true);
+
+ // execute main test
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[] { "-args", input("newX"), input("e"),
String.valueOf(K),
+ String.valueOf(!dp).toUpperCase(),
String.valueOf(selCols).toUpperCase(),
+ String.valueOf(VERBOSE).toUpperCase(), output("R") };
+
+ runTest(true, false, null, -1);
+
+ HashMap<CellIndex, Double> dmlfile =
readDMLMatrixFromOutputDir("R");
+
+ // execute main test
+ fullDMLScriptName = HOME + "slicefinder" + ".dml";
+ programArgs = new String[] { "-args", input("newX"), input("e"),
String.valueOf(K),
+ String.valueOf(!dp).toUpperCase(),
String.valueOf(selCols).toUpperCase(),
+ String.valueOf(VERBOSE).toUpperCase(), output("R") };
+
+ runTest(true, false, null, -1);
+
+ HashMap<CellIndex, Double> dmlfile2 =
readDMLMatrixFromOutputDir("R");
+
+ TestUtils.compareMatrices(dmlfile, dmlfile2, 1e-2,
"Stat-IncSliceLine", "Stat-Slicefinder");
+
+ // compare expected results
+ if (err.equals("e")) {
+ double[][] ret =
TestUtils.convertHashMapToDoubleArray(dmlfile);
+ if (mode != ExecMode.SPARK) // TODO why only CP correct, but R
always matches? test framework?
+ for (int i = 0; i < K; i++)
+ TestUtils.compareMatrices(EXPECTED_TOPK[i], ret[i],
1e-2);
+ }
+
+ // ensure proper inlining, despite initially multiple calls and
large function
+ Assert.assertFalse(heavyHittersContainsSubString("evalSlice"));
+ } finally {
+ rtplatform = platformOld;
+ }
+ }
+
+ public void testIncSliceLineCustomInputs(double[][] newX, double[][] e,
int K, double[][] correctRes) {
+ boolean dp = true, selCols = false;
+ ExecMode mode = ExecMode.SINGLE_NODE;
+ ExecMode platformOld = setExecMode(mode);
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ writeInputMatrixWithMTD("newX", newX, false);
+ writeInputMatrixWithMTD("e", e, false);
+
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[] { "-args", input("newX"), input("e"),
String.valueOf(K),
+ String.valueOf(!dp).toUpperCase(),
String.valueOf(selCols).toUpperCase(),
+ String.valueOf(VERBOSE).toUpperCase(), output("R") };
+
+ runTest(true, false, null, -1);
+
+ HashMap<CellIndex, Double> dmlfile =
readDMLMatrixFromOutputDir("R");
+ double[][] ret = TestUtils.convertHashMapToDoubleArray(dmlfile);
+ TestUtils.compareMatrices(correctRes, ret, 1e-2);
+
+ Assert.assertFalse(heavyHittersContainsSubString("evalSlice"));
+ } finally {
+ rtplatform = platformOld;
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/incSliceLine.dml
b/src/test/scripts/functions/builtin/incSliceLine.dml
new file mode 100644
index 0000000000..72843cab32
--- /dev/null
+++ b/src/test/scripts/functions/builtin/incSliceLine.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+newX = read($1);
+e = read($2);
+
+# call slice finding
+[TS,TR] = incSliceLine(newX=newX, e=e, k=$3,
+ alpha=0.95, minSup=4, tpEval=$4, selFeat=$5, verbose=$6);
+
+write(TR, $7)