This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new e9dac368e3 [MINOR] Renaming of sliceLine builtin function (backwards-compatible) e9dac368e3 is described below commit e9dac368e371b9536e67a19899bcdb60ddda1873 Author: Matthias Boehm <mboe...@gmail.com> AuthorDate: Sun Sep 1 15:07:07 2024 +0200 [MINOR] Renaming of sliceLine builtin function (backwards-compatible) --- scripts/builtin/{slicefinder.dml => sliceLine.dml} | 13 +- scripts/builtin/slicefinder.dml | 310 +-------------------- .../java/org/apache/sysds/common/Builtins.java | 3 +- 3 files changed, 19 insertions(+), 307 deletions(-) diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/sliceLine.dml similarity index 98% copy from scripts/builtin/slicefinder.dml copy to scripts/builtin/sliceLine.dml index cddbe7f808..b63257be04 100644 --- a/scripts/builtin/slicefinder.dml +++ b/scripts/builtin/sliceLine.dml @@ -23,10 +23,11 @@ # ML model debugging technique for finding the top-k data slices where # a trained models performs significantly worse than on the overall # dataset. For a detailed description and experimental results, see: -# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging.(SIGMOD 2021) +# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based +# Slice Finding for ML Model Debugging.(SIGMOD 2021) # # INPUT: -# --------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # X Feature matrix in recoded/binned representation # e Error vector of trained model # k Number of subsets required @@ -39,16 +40,16 @@ # selFeat flag for removing one-hot-encoded features that don't satisfy # the initial minimum-support constraint and/or have zero error # verbose flag for verbose debug output -# --------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # # OUTPUT: -# ----------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # TK top-k slices (k x ncol(X) if successful) # TKC score, total/max error, size of slices (k x 4) # D debug matrix, populated with enumeration stats if verbose -# ----------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ -m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4, +m_sliceLine = function(Matrix[Double] X, Matrix[Double] e, Int k = 4, Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE, Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE) return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D) diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/slicefinder.dml index cddbe7f808..5b84a5d7bb 100644 --- a/scripts/builtin/slicefinder.dml +++ b/scripts/builtin/slicefinder.dml @@ -21,12 +21,13 @@ # This builtin function implements SliceLine, a linear-algebra-based # ML model debugging technique for finding the top-k data slices where -# a trained models performs significantly worse than on the overall +# a trained models performs significantly worse than on the overall # dataset. For a detailed description and experimental results, see: -# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging.(SIGMOD 2021) +# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based +# Slice Finding for ML Model Debugging.(SIGMOD 2021) # # INPUT: -# --------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # X Feature matrix in recoded/binned representation # e Error vector of trained model # k Number of subsets required @@ -39,313 +40,22 @@ # selFeat flag for removing one-hot-encoded features that don't satisfy # the initial minimum-support constraint and/or have zero error # verbose flag for verbose debug output -# --------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # # OUTPUT: -# ----------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # TK top-k slices (k x ncol(X) if successful) # TKC score, total/max error, size of slices (k x 4) # D debug matrix, populated with enumeration stats if verbose -# ----------------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4, Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE, Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE) return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D) { - t1 = time(); - - # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin - D = matrix(0, 0, 5); - - m = nrow(X); - n = ncol(X); - - # prepare offset vectors and one-hot encoded X - fdom = colMaxs(X); - foffb = t(cumsum(t(fdom))) - fdom; - foffe = t(cumsum(t(fdom))) - rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1) - cix = matrix(X + foffb, m*n, 1); - X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded - - # initialize statistics and basic slices - n2 = ncol(X2); # one-hot encoded features - eAvg = sum(e) / m; # average error - [S, R, selCols] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, verbose); - - # initialize top-k - [TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup); - - if( verbose ) { - [maxsc, minsc] = analyzeTopK(TKC); - print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc+" (time="+(time()-t1)+")") - D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc)))); - } - - # reduced dataset to relevant attributes (minSup, err>0), S reduced on-the-fly - if( selFeat ) - X2 = removeEmpty(target=X2, margin="cols", select=t(selCols)); - - # lattice enumeration w/ size/error pruning, one iteration per level - # termination condition (max #feature levels) - maxL = ifelse(maxL<=0, n, maxL) - level = 1; - while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) { - level = level + 1; - - # enumerate candidate join pairs, incl size/error pruning - nrS = nrow(S); - S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe); - S2 = S; - if(selFeat) - S2 = removeEmpty(target=S, margin="cols", select=t(selCols)); - - if(verbose) { - print("\nSliceFinder: level "+level+":") - print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S)); - } - - if( nrow(S) > 0 ) { - # extract and evaluate candidate slices - if( tpEval ) { # task-parallel - # hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 matrix-vector - R = matrix(0, nrow(S), 4) - parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) { - beg = (i-1)*tpBlksz + 1; - end = min(i*tpBlksz, nrow(R)); - R[beg:end,] = evalSlice(X2, e, eAvg, t(S2[beg:end,]), level, alpha); - } - } - else { # data-parallel - R = evalSlice(X2, e, eAvg, t(S2), level, alpha); - } - - # maintain top-k after evaluation - [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup); - - if(verbose) { - [maxsc, minsc] = analyzeTopK(TKC); - valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup)); - print(" -- valid slices after eval: "+valid+"/"+nrow(S)); - print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc); - print(" -- (time="+(time()-t1)+")") - D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc)))); - } - } - } - - TK = decodeTopK(TK, foffb, foffe); - - if( verbose ) { - print("SliceFinder: terminated at level "+level+":\n" - + toString(TK) + "\n" + toString(TKC)); - } -} - -createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e, - Double eAvg, Double minSup, Double alpha, Boolean verbose) - return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols) -{ - n2 = ncol(X2); - cCnts = t(colSums(X2)); # column counts - err = t(t(e) %*% X2); # total error vector - merr = t(colMaxs(X2 * e)); # maximum error vector - - if( verbose ) { - drop = as.integer(sum(cCnts < minSup | err == 0)); - print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = "+minSup+"."); - } - - # working set of active slices (#attr x #slices) and top k - selCols = (cCnts >= minSup & err > 0); - attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols); - ss = removeEmpty(target=cCnts, margin="rows", select=selCols); - se = removeEmpty(target=err, margin="rows", select=selCols); - sm = removeEmpty(target=merr, margin="rows", select=selCols); - S = table(seq(1,nrow(attr)), attr, nrow(attr), n2); - - # score 1-slices and create initial top-k - sc = score(ss, se, eAvg, alpha, nrow(X2)); - R = cbind(sc, se, sm, ss); -} - -score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double alpha, Integer n) - return(Matrix[Double] sc) -{ - sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1); - sc = replace(target=sc, pattern=NaN, replacement=-Inf); -} - -scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm, - Double eAvg, Integer minSup, Double alpha, Integer n) - return(Matrix[Double] sc) -{ - # Initial upper bound equation (with minSup and ss in pos/neg terms) - # sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1); - - # Since sc is either monotonically increasing or decreasing, we - # probe interesting points of sc in the interval [minSup, ss], - # and compute the maximum to serve as the upper bound - s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss) - sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 1)); - sc = replace(target=sc, pattern=NaN, replacement=-Inf); -} - - -maintainTopK = function(Matrix[Double] S, Matrix[Double] R, - Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) - return(Matrix[Double] TK, Matrix[Double] TKC) -{ - # prune invalid minSup and scores - I = (R[,1] > 0) & (R[,4] >= minSup); - - if( sum(I)!=0 ) { - S = removeEmpty(target=S, margin="rows", select=I); - R = removeEmpty(target=R, margin="rows", select=I); - - # evaluated candidated and previous top-k - slices = rbind(TK, S); - scores = rbind(TKC, R); - - # extract top-k - IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE); - IX = IX[1:min(k,nrow(IX)),]; - P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices)); - TK = P %*% slices; - TKC = P %*% scores; - } -} - -analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) { - maxsc = -Inf; - minsc = -Inf; - if( nrow(TKC)>0 ) { - maxsc = as.scalar(TKC[1,1]); - minsc = as.scalar(TKC[nrow(TKC),1]); - } + # rediction to sliceLine for backwards compatibility + [TK,TKC,D] = sliceLine(X=X, e=e, k=k, maxL=maxL, minSup=minSup, alpha=alpha, + tpEval=tpEval, tpBlksz=tpBlksz, selFeat=selFeat, verbose=verbose) } -getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, - Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level, - Double eAvg, Integer minSup, Double alpha, Integer n2, - Matrix[Double] foffb, Matrix[Double] foffe) - return(Matrix[Double] P) -{ - # prune invalid slices (possible without affecting overall - # pruning effectiveness due to handling of missing parents) - pI = (R[,4] >= minSup & R[,2] > 0); - S = removeEmpty(target=S, margin="rows", select=pI) - R = removeEmpty(target=R, margin="rows", select=pI) - - # join compatible slices (without self) - join = S %*% t(S) == (level-2) - I = upper.tri(target=join, diag=FALSE, values=TRUE); - - # pair construction - nr = nrow(I); nc = ncol(I); - rix = matrix(I * seq(1,nr), nr*nc, 1); - cix = matrix(I * t(seq(1,nc)), nr*nc, 1); - rix = removeEmpty(target=rix, margin="rows"); - cix = removeEmpty(target=cix, margin="rows"); - - P = matrix(0,0,ncol(S)) - if( sum(rix)!=0 ) { - P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S)); - P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S)); - P12 = P1 + P2; # combined slice - P = (P1 %*% S + P2 %*% S) != 0; - - se = min(P1 %*% R[,2], P2 %*% R[,2]) - sm = min(P1 %*% R[,3], P2 %*% R[,3]) - ss = min(P1 %*% R[,4], P2 %*% R[,4]) - - # prune invalid self joins (>1 bit per feature) - I = matrix(1, nrow(P), 1); - for( j in 1:ncol(foffb) ) { - beg = as.scalar(foffb[1,j])+1; - end = as.scalar(foffe[1,j]); - I = I & (rowSums(P[,beg:end]) <= 1); - } - P12 = removeEmpty(target=P12, margin="rows", select=I) - P = removeEmpty(target=P, margin="rows", select=I); - ss = removeEmpty(target=ss, margin="rows", select=I); - se = removeEmpty(target=se, margin="rows", select=I); - sm = removeEmpty(target=sm, margin="rows", select=I); - - # prepare IDs for deduplication and pruning - ID = matrix(0, nrow(P), 1); - dom = foffe-foffb+1; - for( j in 1:ncol(dom) ) { - beg = as.scalar(foffb[1,j])+1; - end = as.scalar(foffe[1,j]); - I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]); - prod = 1; - if(j<ncol(dom)) - prod = prod(dom[1,(j+1):ncol(dom)]) - ID = ID + I * prod; - } - - # ID transformation to avoid exceeding INT_MAX and - # and to void creating huge sparse intermediates - [ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}") - - # size pruning, with rowMin-rowMax transform - # to avoid densification (ignored zeros) - map = table(ID, seq(1,nrow(P)), max(ID), nrow(P)) - ubSizes = 1/rowMaxs(map * (1/t(ss))); - ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0); - fSizes = (ubSizes >= minSup) - - # error pruning - ubError = 1/rowMaxs(map * (1/t(se))); - ubError = replace(target=ubError, pattern=Inf, replacement=0); - ubMError = 1/rowMaxs(map * (1/t(sm))); - ubMError = replace(target=ubMError, pattern=Inf, replacement=0); - ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2); - [maxsc, minsc] = analyzeTopK(TKC); - fScores = (ubScores > minsc & ubScores > 0) - - # missing parents pruning - numParents = rowSums((map %*% P12) != 0) - fParents = (numParents == level); - - # apply all pruning - fall = (fSizes & fScores & fParents); - - # deduplication of join outputs - Dedup = removeEmpty(target=map, margin="rows", select=fall) != 0 - #P = (Dedup %*% P) != 0, replaced by below (easier sparsity propagation) - DeI = table(rowIndexMax(Dedup), 1, nrow(P), 1); - P = removeEmpty(target=P, margin="rows", select=DeI); - } -} - -evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, - Matrix[Double] tS, Integer l, Double alpha) - return(Matrix[Double] R) -{ - I = (X %*% tS) == l; # slice indicator - ss = t(colSums(I)); # absolute slice size (nnz) - se = t(t(e) %*% I); # absolute slice error - sm = t(colMaxs(I * e)); # maximum tuple error in slice - - # score of relative error and relative size - sc = score(ss, se, eAvg, alpha, nrow(X)); - R = cbind(sc, se, sm, ss); -} - -decodeTopK = function(Matrix[Double] TK, Matrix[Double] foffb, Matrix[Double] foffe) - return(Matrix[Double] TK) -{ - R = matrix(1, nrow(TK), ncol(foffb)); - if( nrow(TK) > 0 ) { - parfor( j in 1:ncol(foffb) ) { - beg = as.scalar(foffb[1,j])+1; - end = as.scalar(foffe[1,j]); - I = rowSums(TK[,beg:end]) * rowIndexMax(TK[,beg:end]); - R[, j] = I; - } - } - TK = R; -} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 8baee4ec2f..d021597d7f 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -306,7 +306,8 @@ public enum Builtins { SIGN("sign", false), SIN("sin", false), SINH("sinh", false), - SLICEFINDER("slicefinder", true), //TODO rename + SLICEFINDER("slicefinder", true), //TODO remove + SLICELINE("sliceLine", true), SLICELINE_DEBUG("sliceLineDebug", true), SKEWNESS("skewness", true), SMAPE("smape", true),