(systemds) branch main updated: [MINOR] Renaming of sliceLine builtin function (backwards-compatible)

mboehm7 Sun, 01 Sep 2024 06:09:35 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new e9dac368e3 [MINOR] Renaming of sliceLine builtin function 
(backwards-compatible)
e9dac368e3 is described below

commit e9dac368e371b9536e67a19899bcdb60ddda1873
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Sun Sep 1 15:07:07 2024 +0200

    [MINOR] Renaming of sliceLine builtin function (backwards-compatible)
---
 scripts/builtin/{slicefinder.dml => sliceLine.dml} |  13 +-
 scripts/builtin/slicefinder.dml                    | 310 +--------------------
 .../java/org/apache/sysds/common/Builtins.java     |   3 +-
 3 files changed, 19 insertions(+), 307 deletions(-)

diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/sliceLine.dml
similarity index 98%
copy from scripts/builtin/slicefinder.dml
copy to scripts/builtin/sliceLine.dml
index cddbe7f808..b63257be04 100644
--- a/scripts/builtin/slicefinder.dml
+++ b/scripts/builtin/sliceLine.dml
@@ -23,10 +23,11 @@
 # ML model debugging technique for finding the top-k data slices where
 # a trained models performs significantly worse than on the overall 
 # dataset. For a detailed description and experimental results, see:
-# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based 
Slice Finding for ML Model Debugging.(SIGMOD 2021)
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
+# Slice Finding for ML Model Debugging.(SIGMOD 2021)
 #
 # INPUT:
-# 
---------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 # X        Feature matrix in recoded/binned representation
 # e        Error vector of trained model
 # k        Number of subsets required
@@ -39,16 +40,16 @@
 # selFeat  flag for removing one-hot-encoded features that don't satisfy
 #          the initial minimum-support constraint and/or have zero error
 # verbose  flag for verbose debug output
-# 
---------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 #
 # OUTPUT:
-# 
-----------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 # TK    top-k slices (k x ncol(X) if successful)
 # TKC   score, total/max error, size of slices (k x 4)
 # D     debug matrix, populated with enumeration stats if verbose
-# 
-----------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 
-m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
+m_sliceLine = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
     Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
     Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE)
   return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D)
diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/slicefinder.dml
index cddbe7f808..5b84a5d7bb 100644
--- a/scripts/builtin/slicefinder.dml
+++ b/scripts/builtin/slicefinder.dml
@@ -21,12 +21,13 @@
 
 # This builtin function implements SliceLine, a linear-algebra-based
 # ML model debugging technique for finding the top-k data slices where
-# a trained models performs significantly worse than on the overall 
+# a trained models performs significantly worse than on the overall
 # dataset. For a detailed description and experimental results, see:
-# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based 
Slice Finding for ML Model Debugging.(SIGMOD 2021)
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based
+# Slice Finding for ML Model Debugging.(SIGMOD 2021)
 #
 # INPUT:
-# 
---------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 # X        Feature matrix in recoded/binned representation
 # e        Error vector of trained model
 # k        Number of subsets required
@@ -39,313 +40,22 @@
 # selFeat  flag for removing one-hot-encoded features that don't satisfy
 #          the initial minimum-support constraint and/or have zero error
 # verbose  flag for verbose debug output
-# 
---------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 #
 # OUTPUT:
-# 
-----------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 # TK    top-k slices (k x ncol(X) if successful)
 # TKC   score, total/max error, size of slices (k x 4)
 # D     debug matrix, populated with enumeration stats if verbose
-# 
-----------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 
 m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
     Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
     Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE)
   return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D)
 {
-  t1 = time();
-
-  # init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
-  D = matrix(0, 0, 5);
-
-  m = nrow(X);
-  n = ncol(X);
-
-  # prepare offset vectors and one-hot encoded X
-  fdom = colMaxs(X);
-  foffb = t(cumsum(t(fdom))) - fdom;
-  foffe = t(cumsum(t(fdom)))
-  rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
-  cix = matrix(X + foffb, m*n, 1);
-  X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
-
-  # initialize statistics and basic slices
-  n2 = ncol(X2);     # one-hot encoded features
-  eAvg = sum(e) / m; # average error
-  [S, R, selCols] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, 
verbose); 
-
-  # initialize top-k
-  [TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
-
-  if( verbose ) {
-    [maxsc, minsc] = analyzeTopK(TKC);
-    print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", 
min="+minsc+" (time="+(time()-t1)+")")
-    D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc, minsc))));
-  }
-
-  # reduced dataset to relevant attributes (minSup, err>0), S reduced 
on-the-fly
-  if( selFeat )
-    X2 = removeEmpty(target=X2, margin="cols", select=t(selCols));
-
-  # lattice enumeration w/ size/error pruning, one iteration per level
-  # termination condition (max #feature levels)
-  maxL = ifelse(maxL<=0, n, maxL)
-  level = 1;
-  while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
-    level = level + 1;
-
-    # enumerate candidate join pairs, incl size/error pruning 
-    nrS = nrow(S);
-    S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, 
foffb, foffe);
-    S2 = S;
-    if(selFeat)
-      S2 = removeEmpty(target=S, margin="cols", select=t(selCols));
-
-    if(verbose) {
-      print("\nSliceFinder: level "+level+":")
-      print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
-    }
-
-    if( nrow(S) > 0 ) {
-      # extract and evaluate candidate slices
-      if( tpEval ) { # task-parallel
-        # hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 
matrix-vector
-        R = matrix(0, nrow(S), 4)
-        parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
-          beg = (i-1)*tpBlksz + 1;
-          end = min(i*tpBlksz, nrow(R));
-          R[beg:end,] = evalSlice(X2, e, eAvg, t(S2[beg:end,]), level, alpha);
-        }
-      }
-      else { # data-parallel
-        R = evalSlice(X2, e, eAvg, t(S2), level, alpha);
-      }
-
-      # maintain top-k after evaluation
-      [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
-
-      if(verbose) {
-        [maxsc, minsc] = analyzeTopK(TKC);
-        valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
-        print(" -- valid slices after eval: "+valid+"/"+nrow(S));
-        print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
-        print(" -- (time="+(time()-t1)+")")
-        D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc, minsc))));
-      }
-    }
-  }
-
-  TK = decodeTopK(TK, foffb, foffe);
-
-  if( verbose ) {
-    print("SliceFinder: terminated at level "+level+":\n"
-      + toString(TK) + "\n" + toString(TKC));
-  }
-}
-
-createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
-    Double eAvg, Double minSup, Double alpha, Boolean verbose)
-  return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols)
-{
-  n2 = ncol(X2);
-  cCnts = t(colSums(X2));    # column counts
-  err = t(t(e) %*% X2);      # total error vector
-  merr = t(colMaxs(X2 * e)); # maximum error vector
-
-  if( verbose ) {
-    drop = as.integer(sum(cCnts < minSup | err == 0));
-    print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = 
"+minSup+".");
-  }
-
-  # working set of active slices (#attr x #slices) and top k
-  selCols = (cCnts >= minSup & err > 0);
-  attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols);
-  ss = removeEmpty(target=cCnts, margin="rows", select=selCols);
-  se = removeEmpty(target=err, margin="rows", select=selCols);
-  sm = removeEmpty(target=merr, margin="rows", select=selCols);
-  S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
-
-  # score 1-slices and create initial top-k
-  sc = score(ss, se, eAvg, alpha, nrow(X2));
-  R = cbind(sc, se, sm, ss);
-}
-
-score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double 
alpha, Integer n)
-  return(Matrix[Double] sc)
-{
-  sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
-  sc = replace(target=sc, pattern=NaN, replacement=-Inf);
-}
-
-scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm, 
-    Double eAvg, Integer minSup, Double alpha, Integer n)
-  return(Matrix[Double] sc)
-{
-  # Initial upper bound equation (with minSup and ss in pos/neg terms)
-  # sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
-
-  # Since sc is either monotonically increasing or decreasing, we
-  # probe interesting points of sc in the interval [minSup, ss],
-  # and compute the maximum to serve as the upper bound 
-  s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss) 
-  sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 
1));
-  sc = replace(target=sc, pattern=NaN, replacement=-Inf);
-}
-
-
-maintainTopK = function(Matrix[Double] S, Matrix[Double] R, 
-    Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) 
-  return(Matrix[Double] TK, Matrix[Double] TKC)
-{
-  # prune invalid minSup and scores
-  I = (R[,1] > 0) & (R[,4] >= minSup);
-
-  if( sum(I)!=0 ) {
-    S = removeEmpty(target=S, margin="rows", select=I);
-    R = removeEmpty(target=R, margin="rows", select=I);
-
-    # evaluated candidated and previous top-k
-    slices = rbind(TK, S);
-    scores = rbind(TKC, R);
-
-    # extract top-k
-    IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
-    IX = IX[1:min(k,nrow(IX)),];
-    P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
-    TK = P %*% slices;
-    TKC = P %*% scores;
-  }
-}
-
-analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
-  maxsc = -Inf;
-  minsc = -Inf;
-  if( nrow(TKC)>0 ) {
-    maxsc = as.scalar(TKC[1,1]);
-    minsc = as.scalar(TKC[nrow(TKC),1]);
-  }
+  # rediction to sliceLine for backwards compatibility
+  [TK,TKC,D] = sliceLine(X=X, e=e, k=k, maxL=maxL, minSup=minSup, alpha=alpha,
+                 tpEval=tpEval, tpBlksz=tpBlksz, selFeat=selFeat, 
verbose=verbose)
 }
 
-getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, 
-    Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level, 
-    Double eAvg, Integer minSup, Double alpha, Integer n2, 
-    Matrix[Double] foffb, Matrix[Double] foffe)
-  return(Matrix[Double] P)
-{
-  # prune invalid slices (possible without affecting overall
-  # pruning effectiveness due to handling of missing parents)
-  pI = (R[,4] >= minSup & R[,2] > 0);
-  S = removeEmpty(target=S, margin="rows", select=pI)
-  R = removeEmpty(target=R, margin="rows", select=pI)
-
-  # join compatible slices (without self)
-  join = S %*% t(S) == (level-2)
-  I = upper.tri(target=join, diag=FALSE, values=TRUE);
-
-  # pair construction
-  nr = nrow(I); nc = ncol(I);
-  rix = matrix(I * seq(1,nr), nr*nc, 1);
-  cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
-  rix = removeEmpty(target=rix, margin="rows");
-  cix = removeEmpty(target=cix, margin="rows");
-
-  P = matrix(0,0,ncol(S))
-  if( sum(rix)!=0 ) {
-    P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
-    P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
-    P12 = P1 + P2; # combined slice
-    P = (P1 %*% S + P2 %*% S) != 0;
-
-    se = min(P1 %*% R[,2], P2 %*% R[,2])
-    sm = min(P1 %*% R[,3], P2 %*% R[,3])
-    ss = min(P1 %*% R[,4], P2 %*% R[,4])
-
-    # prune invalid self joins (>1 bit per feature)
-    I = matrix(1, nrow(P), 1);
-    for( j in 1:ncol(foffb) ) {
-      beg = as.scalar(foffb[1,j])+1;
-      end = as.scalar(foffe[1,j]);
-      I = I & (rowSums(P[,beg:end]) <= 1);
-    }
-    P12 = removeEmpty(target=P12, margin="rows", select=I)
-    P = removeEmpty(target=P, margin="rows", select=I);
-    ss = removeEmpty(target=ss, margin="rows", select=I);
-    se = removeEmpty(target=se, margin="rows", select=I);
-    sm = removeEmpty(target=sm, margin="rows", select=I);
-
-    # prepare IDs for deduplication and pruning
-    ID = matrix(0, nrow(P), 1);
-    dom = foffe-foffb+1;
-    for( j in 1:ncol(dom) ) {
-      beg = as.scalar(foffb[1,j])+1;
-      end = as.scalar(foffe[1,j]);
-      I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]);
-      prod = 1;
-      if(j<ncol(dom))
-        prod = prod(dom[1,(j+1):ncol(dom)])
-      ID = ID + I * prod;
-    }
-
-    # ID transformation to avoid exceeding INT_MAX and
-    # and to void creating huge sparse intermediates
-    [ID, M] = transformencode(target=as.frame(ID), 
spec="{ids:true,recode:[1]}")
-
-    # size pruning, with rowMin-rowMax transform
-    # to avoid densification (ignored zeros)
-    map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
-    ubSizes = 1/rowMaxs(map * (1/t(ss)));
-    ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
-    fSizes = (ubSizes >= minSup)
-
-    # error pruning
-    ubError = 1/rowMaxs(map * (1/t(se)));
-    ubError = replace(target=ubError, pattern=Inf, replacement=0);
-    ubMError = 1/rowMaxs(map * (1/t(sm)));
-    ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
-    ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
-    [maxsc, minsc] = analyzeTopK(TKC);
-    fScores = (ubScores > minsc & ubScores > 0)
-
-    # missing parents pruning
-    numParents = rowSums((map %*% P12) != 0)
-    fParents = (numParents == level);
-
-    # apply all pruning
-    fall = (fSizes & fScores & fParents);
-
-    # deduplication of join outputs
-    Dedup = removeEmpty(target=map, margin="rows", select=fall) != 0
-    #P = (Dedup %*% P) != 0, replaced by below (easier sparsity propagation)
-    DeI = table(rowIndexMax(Dedup), 1, nrow(P), 1);
-    P = removeEmpty(target=P, margin="rows", select=DeI);
-  }
-}
-
-evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, 
-    Matrix[Double] tS, Integer l, Double alpha) 
-  return(Matrix[Double] R)
-{
-  I = (X %*% tS) == l;    # slice indicator
-  ss = t(colSums(I));     # absolute slice size (nnz)
-  se = t(t(e) %*% I);     # absolute slice error
-  sm = t(colMaxs(I * e)); # maximum tuple error in slice
-
-  # score of relative error and relative size
-  sc = score(ss, se, eAvg, alpha, nrow(X));
-  R = cbind(sc, se, sm, ss);
-}
-
-decodeTopK = function(Matrix[Double] TK, Matrix[Double] foffb, Matrix[Double] 
foffe)
-  return(Matrix[Double] TK) 
-{
-  R = matrix(1, nrow(TK), ncol(foffb));
-  if( nrow(TK) > 0 ) {
-    parfor( j in 1:ncol(foffb) ) {
-      beg = as.scalar(foffb[1,j])+1;
-      end = as.scalar(foffe[1,j]);
-      I = rowSums(TK[,beg:end]) * rowIndexMax(TK[,beg:end]);
-      R[, j] = I;
-    }
-  }
-  TK = R;
-}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 8baee4ec2f..d021597d7f 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -306,7 +306,8 @@ public enum Builtins {
        SIGN("sign", false),
        SIN("sin", false),
        SINH("sinh", false),
-       SLICEFINDER("slicefinder", true), //TODO rename
+       SLICEFINDER("slicefinder", true), //TODO remove
+       SLICELINE("sliceLine", true),
        SLICELINE_DEBUG("sliceLineDebug", true),
        SKEWNESS("skewness", true),
        SMAPE("smape", true),

(systemds) branch main updated: [MINOR] Renaming of sliceLine builtin function (backwards-compatible)

Reply via email to