This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new e9dac368e3 [MINOR] Renaming of sliceLine builtin function
(backwards-compatible)
e9dac368e3 is described below
commit e9dac368e371b9536e67a19899bcdb60ddda1873
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Sep 1 15:07:07 2024 +0200
[MINOR] Renaming of sliceLine builtin function (backwards-compatible)
---
scripts/builtin/{slicefinder.dml => sliceLine.dml} | 13 +-
scripts/builtin/slicefinder.dml | 310 +--------------------
.../java/org/apache/sysds/common/Builtins.java | 3 +-
3 files changed, 19 insertions(+), 307 deletions(-)
diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/sliceLine.dml
similarity index 98%
copy from scripts/builtin/slicefinder.dml
copy to scripts/builtin/sliceLine.dml
index cddbe7f808..b63257be04 100644
--- a/scripts/builtin/slicefinder.dml
+++ b/scripts/builtin/sliceLine.dml
@@ -23,10 +23,11 @@
# ML model debugging technique for finding the top-k data slices where
# a trained models performs significantly worse than on the overall
# dataset. For a detailed description and experimental results, see:
-# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
Slice Finding for ML Model Debugging.(SIGMOD 2021)
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
+# Slice Finding for ML Model Debugging.(SIGMOD 2021)
#
# INPUT:
-#
---------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
# X Feature matrix in recoded/binned representation
# e Error vector of trained model
# k Number of subsets required
@@ -39,16 +40,16 @@
# selFeat flag for removing one-hot-encoded features that don't satisfy
# the initial minimum-support constraint and/or have zero error
# verbose flag for verbose debug output
-#
---------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
#
# OUTPUT:
-#
-----------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
# TK top-k slices (k x ncol(X) if successful)
# TKC score, total/max error, size of slices (k x 4)
# D debug matrix, populated with enumeration stats if verbose
-#
-----------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
-m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
+m_sliceLine = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE)
return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D)
diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/slicefinder.dml
index cddbe7f808..5b84a5d7bb 100644
--- a/scripts/builtin/slicefinder.dml
+++ b/scripts/builtin/slicefinder.dml
@@ -21,12 +21,13 @@
# This builtin function implements SliceLine, a linear-algebra-based
# ML model debugging technique for finding the top-k data slices where
-# a trained models performs significantly worse than on the overall
+# a trained models performs significantly worse than on the overall
# dataset. For a detailed description and experimental results, see:
-# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
Slice Finding for ML Model Debugging.(SIGMOD 2021)
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
+# Slice Finding for ML Model Debugging.(SIGMOD 2021)
#
# INPUT:
-#
---------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
# X Feature matrix in recoded/binned representation
# e Error vector of trained model
# k Number of subsets required
@@ -39,313 +40,22 @@
# selFeat flag for removing one-hot-encoded features that don't satisfy
# the initial minimum-support constraint and/or have zero error
# verbose flag for verbose debug output
-#
---------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
#
# OUTPUT:
-#
-----------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
# TK top-k slices (k x ncol(X) if successful)
# TKC score, total/max error, size of slices (k x 4)
# D debug matrix, populated with enumeration stats if verbose
-#
-----------------------------------------------------------------------------------------
+#
------------------------------------------------------------------------------
m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE)
return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D)
{
- t1 = time();
-
- # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
- D = matrix(0, 0, 5);
-
- m = nrow(X);
- n = ncol(X);
-
- # prepare offset vectors and one-hot encoded X
- fdom = colMaxs(X);
- foffb = t(cumsum(t(fdom))) - fdom;
- foffe = t(cumsum(t(fdom)))
- rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
- cix = matrix(X + foffb, m*n, 1);
- X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
-
- # initialize statistics and basic slices
- n2 = ncol(X2); # one-hot encoded features
- eAvg = sum(e) / m; # average error
- [S, R, selCols] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha,
verbose);
-
- # initialize top-k
- [TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
-
- if( verbose ) {
- [maxsc, minsc] = analyzeTopK(TKC);
- print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+",
min="+minsc+" (time="+(time()-t1)+")")
- D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc))));
- }
-
- # reduced dataset to relevant attributes (minSup, err>0), S reduced
on-the-fly
- if( selFeat )
- X2 = removeEmpty(target=X2, margin="cols", select=t(selCols));
-
- # lattice enumeration w/ size/error pruning, one iteration per level
- # termination condition (max #feature levels)
- maxL = ifelse(maxL<=0, n, maxL)
- level = 1;
- while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
- level = level + 1;
-
- # enumerate candidate join pairs, incl size/error pruning
- nrS = nrow(S);
- S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2,
foffb, foffe);
- S2 = S;
- if(selFeat)
- S2 = removeEmpty(target=S, margin="cols", select=t(selCols));
-
- if(verbose) {
- print("\nSliceFinder: level "+level+":")
- print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
- }
-
- if( nrow(S) > 0 ) {
- # extract and evaluate candidate slices
- if( tpEval ) { # task-parallel
- # hybrid task-parallel w/ 1 matrix-matrix for blocks of 16
matrix-vector
- R = matrix(0, nrow(S), 4)
- parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
- beg = (i-1)*tpBlksz + 1;
- end = min(i*tpBlksz, nrow(R));
- R[beg:end,] = evalSlice(X2, e, eAvg, t(S2[beg:end,]), level, alpha);
- }
- }
- else { # data-parallel
- R = evalSlice(X2, e, eAvg, t(S2), level, alpha);
- }
-
- # maintain top-k after evaluation
- [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
-
- if(verbose) {
- [maxsc, minsc] = analyzeTopK(TKC);
- valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
- print(" -- valid slices after eval: "+valid+"/"+nrow(S));
- print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
- print(" -- (time="+(time()-t1)+")")
- D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc))));
- }
- }
- }
-
- TK = decodeTopK(TK, foffb, foffe);
-
- if( verbose ) {
- print("SliceFinder: terminated at level "+level+":\n"
- + toString(TK) + "\n" + toString(TKC));
- }
-}
-
-createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
- Double eAvg, Double minSup, Double alpha, Boolean verbose)
- return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols)
-{
- n2 = ncol(X2);
- cCnts = t(colSums(X2)); # column counts
- err = t(t(e) %*% X2); # total error vector
- merr = t(colMaxs(X2 * e)); # maximum error vector
-
- if( verbose ) {
- drop = as.integer(sum(cCnts < minSup | err == 0));
- print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup =
"+minSup+".");
- }
-
- # working set of active slices (#attr x #slices) and top k
- selCols = (cCnts >= minSup & err > 0);
- attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols);
- ss = removeEmpty(target=cCnts, margin="rows", select=selCols);
- se = removeEmpty(target=err, margin="rows", select=selCols);
- sm = removeEmpty(target=merr, margin="rows", select=selCols);
- S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
-
- # score 1-slices and create initial top-k
- sc = score(ss, se, eAvg, alpha, nrow(X2));
- R = cbind(sc, se, sm, ss);
-}
-
-score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double
alpha, Integer n)
- return(Matrix[Double] sc)
-{
- sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
- sc = replace(target=sc, pattern=NaN, replacement=-Inf);
-}
-
-scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm,
- Double eAvg, Integer minSup, Double alpha, Integer n)
- return(Matrix[Double] sc)
-{
- # Initial upper bound equation (with minSup and ss in pos/neg terms)
- # sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
-
- # Since sc is either monotonically increasing or decreasing, we
- # probe interesting points of sc in the interval [minSup, ss],
- # and compute the maximum to serve as the upper bound
- s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss)
- sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n -
1));
- sc = replace(target=sc, pattern=NaN, replacement=-Inf);
-}
-
-
-maintainTopK = function(Matrix[Double] S, Matrix[Double] R,
- Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup)
- return(Matrix[Double] TK, Matrix[Double] TKC)
-{
- # prune invalid minSup and scores
- I = (R[,1] > 0) & (R[,4] >= minSup);
-
- if( sum(I)!=0 ) {
- S = removeEmpty(target=S, margin="rows", select=I);
- R = removeEmpty(target=R, margin="rows", select=I);
-
- # evaluated candidated and previous top-k
- slices = rbind(TK, S);
- scores = rbind(TKC, R);
-
- # extract top-k
- IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
- IX = IX[1:min(k,nrow(IX)),];
- P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
- TK = P %*% slices;
- TKC = P %*% scores;
- }
-}
-
-analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
- maxsc = -Inf;
- minsc = -Inf;
- if( nrow(TKC)>0 ) {
- maxsc = as.scalar(TKC[1,1]);
- minsc = as.scalar(TKC[nrow(TKC),1]);
- }
+ # rediction to sliceLine for backwards compatibility
+ [TK,TKC,D] = sliceLine(X=X, e=e, k=k, maxL=maxL, minSup=minSup, alpha=alpha,
+ tpEval=tpEval, tpBlksz=tpBlksz, selFeat=selFeat,
verbose=verbose)
}
-getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
- Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level,
- Double eAvg, Integer minSup, Double alpha, Integer n2,
- Matrix[Double] foffb, Matrix[Double] foffe)
- return(Matrix[Double] P)
-{
- # prune invalid slices (possible without affecting overall
- # pruning effectiveness due to handling of missing parents)
- pI = (R[,4] >= minSup & R[,2] > 0);
- S = removeEmpty(target=S, margin="rows", select=pI)
- R = removeEmpty(target=R, margin="rows", select=pI)
-
- # join compatible slices (without self)
- join = S %*% t(S) == (level-2)
- I = upper.tri(target=join, diag=FALSE, values=TRUE);
-
- # pair construction
- nr = nrow(I); nc = ncol(I);
- rix = matrix(I * seq(1,nr), nr*nc, 1);
- cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
- rix = removeEmpty(target=rix, margin="rows");
- cix = removeEmpty(target=cix, margin="rows");
-
- P = matrix(0,0,ncol(S))
- if( sum(rix)!=0 ) {
- P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
- P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
- P12 = P1 + P2; # combined slice
- P = (P1 %*% S + P2 %*% S) != 0;
-
- se = min(P1 %*% R[,2], P2 %*% R[,2])
- sm = min(P1 %*% R[,3], P2 %*% R[,3])
- ss = min(P1 %*% R[,4], P2 %*% R[,4])
-
- # prune invalid self joins (>1 bit per feature)
- I = matrix(1, nrow(P), 1);
- for( j in 1:ncol(foffb) ) {
- beg = as.scalar(foffb[1,j])+1;
- end = as.scalar(foffe[1,j]);
- I = I & (rowSums(P[,beg:end]) <= 1);
- }
- P12 = removeEmpty(target=P12, margin="rows", select=I)
- P = removeEmpty(target=P, margin="rows", select=I);
- ss = removeEmpty(target=ss, margin="rows", select=I);
- se = removeEmpty(target=se, margin="rows", select=I);
- sm = removeEmpty(target=sm, margin="rows", select=I);
-
- # prepare IDs for deduplication and pruning
- ID = matrix(0, nrow(P), 1);
- dom = foffe-foffb+1;
- for( j in 1:ncol(dom) ) {
- beg = as.scalar(foffb[1,j])+1;
- end = as.scalar(foffe[1,j]);
- I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]);
- prod = 1;
- if(j<ncol(dom))
- prod = prod(dom[1,(j+1):ncol(dom)])
- ID = ID + I * prod;
- }
-
- # ID transformation to avoid exceeding INT_MAX and
- # and to void creating huge sparse intermediates
- [ID, M] = transformencode(target=as.frame(ID),
spec="{ids:true,recode:[1]}")
-
- # size pruning, with rowMin-rowMax transform
- # to avoid densification (ignored zeros)
- map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
- ubSizes = 1/rowMaxs(map * (1/t(ss)));
- ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
- fSizes = (ubSizes >= minSup)
-
- # error pruning
- ubError = 1/rowMaxs(map * (1/t(se)));
- ubError = replace(target=ubError, pattern=Inf, replacement=0);
- ubMError = 1/rowMaxs(map * (1/t(sm)));
- ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
- ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
- [maxsc, minsc] = analyzeTopK(TKC);
- fScores = (ubScores > minsc & ubScores > 0)
-
- # missing parents pruning
- numParents = rowSums((map %*% P12) != 0)
- fParents = (numParents == level);
-
- # apply all pruning
- fall = (fSizes & fScores & fParents);
-
- # deduplication of join outputs
- Dedup = removeEmpty(target=map, margin="rows", select=fall) != 0
- #P = (Dedup %*% P) != 0, replaced by below (easier sparsity propagation)
- DeI = table(rowIndexMax(Dedup), 1, nrow(P), 1);
- P = removeEmpty(target=P, margin="rows", select=DeI);
- }
-}
-
-evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg,
- Matrix[Double] tS, Integer l, Double alpha)
- return(Matrix[Double] R)
-{
- I = (X %*% tS) == l; # slice indicator
- ss = t(colSums(I)); # absolute slice size (nnz)
- se = t(t(e) %*% I); # absolute slice error
- sm = t(colMaxs(I * e)); # maximum tuple error in slice
-
- # score of relative error and relative size
- sc = score(ss, se, eAvg, alpha, nrow(X));
- R = cbind(sc, se, sm, ss);
-}
-
-decodeTopK = function(Matrix[Double] TK, Matrix[Double] foffb, Matrix[Double]
foffe)
- return(Matrix[Double] TK)
-{
- R = matrix(1, nrow(TK), ncol(foffb));
- if( nrow(TK) > 0 ) {
- parfor( j in 1:ncol(foffb) ) {
- beg = as.scalar(foffb[1,j])+1;
- end = as.scalar(foffe[1,j]);
- I = rowSums(TK[,beg:end]) * rowIndexMax(TK[,beg:end]);
- R[, j] = I;
- }
- }
- TK = R;
-}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index 8baee4ec2f..d021597d7f 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -306,7 +306,8 @@ public enum Builtins {
SIGN("sign", false),
SIN("sin", false),
SINH("sinh", false),
- SLICEFINDER("slicefinder", true), //TODO rename
+ SLICEFINDER("slicefinder", true), //TODO remove
+ SLICELINE("sliceLine", true),
SLICELINE_DEBUG("sliceLineDebug", true),
SKEWNESS("skewness", true),
SMAPE("smape", true),