(systemds) branch main updated: [SYSTEMDS-3696] Additional pruning strategy for incremental slice line

mboehm7 Sat, 14 Sep 2024 06:22:18 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 726d21d08a [SYSTEMDS-3696] Additional pruning strategy for incremental 
slice line
726d21d08a is described below

commit 726d21d08aa417764123221e2f5ae95ff92bb4f9
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Sat Sep 14 15:21:44 2024 +0200

    [SYSTEMDS-3696] Additional pruning strategy for incremental slice line
    
    This patch adds a very effective pruning strategy which yields up to
    two orders of magnitude runtime improvements on Adult, Covtype, KDD98,
    and USCenus. However, this strategy only gives high-probability
    guarantees. In detail, we evaluate previously evaluated slices by
    adding the contribution of added and removed tuples in order to
    determine feature-wise high-probability upper bound scores which are
    in turn used to eliminate basic (single-feature) slices early on.
    Due to edge cases that might be missed, this strategy should not be
    applied by default (even though the tests pass), which I will do
    when handling #2107 because it also touches the pruning selector.
---
 scripts/builtin/incSliceLine.dml | 92 ++++++++++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 12 deletions(-)

diff --git a/scripts/builtin/incSliceLine.dml b/scripts/builtin/incSliceLine.dml
index 0ef77bd5f2..212d12e553 100644
--- a/scripts/builtin/incSliceLine.dml
+++ b/scripts/builtin/incSliceLine.dml
@@ -116,11 +116,11 @@ m_incSliceLine = function(
     oldE = matrix(0,0,ncol(addedE));
   }
   ## remove all rows from oldX and oldE that are in indicesRemoved
-  removedTuples = matrix(0,0,ncol(oldX));
+  removedX = matrix(0,0,ncol(oldX));
   removedE = matrix(0,0,1);
   if(length(indicesRemoved) > 0 & nrow(oldX)  > 0){
-    [oldX, removedTuples] = removeRowsByIndices( oldX, indicesRemoved);
-    [oldE, removedE] = removeRowsByIndices( oldE, indicesRemoved);
+    [oldX, removedX] = removeRowsByIndices(oldX, indicesRemoved);
+    [oldE, removedE] = removeRowsByIndices(oldE, indicesRemoved);
     if(verbose){
       print("incSliceLine: removed "+nrow(indicesRemoved)+" tuples.");
     }
@@ -156,9 +156,12 @@ m_incSliceLine = function(
   # One-hot encoding of tuples that were removed from oldX
   # combining of addedX and oldX in changedX2 facilitates simple determination 
of unchanged slices for pruning
   changedX2 = addedX2;
-  if(nrow(removedTuples) > 0){
-    removedTuples2 = oneHotEncodeUsingOffsets(removedTuples, foffb, foffe);
-    changedX2 = rbind(addedX2, removedTuples2);
+  removedX2 = removedX;
+  if(nrow(removedX) > 0){
+    removedX2 = oneHotEncodeUsingOffsets(removedX, foffb, foffe);
+    changedX2 = rbind(addedX2, removedX2);
+  } else {
+    removedX2 = matrix(0, 0, ncol(addedX2));
   }
   changedE = rbind(addedE, removedE);
 
@@ -180,8 +183,11 @@ m_incSliceLine = function(
   # create and score basic slices (conjunctions of 1 feature)
   maxsc = getMaxScoreAllFeatures(nrow(X2), ncol(X2), prevLattice, 
metaPrevLattice,
      prevStats, encodeLat, differentOffsets, alpha, eAvg, prevFoffb, 
prevFoffe, foffb, foffe);
+  maxscub = getMaxChangedScoreAllFeatures(nrow(X2), ncol(X2),
+     addedX2, removedX2, addedE, removedE, prevLattice, metaPrevLattice, 
prevStats,
+     encodeLat, differentOffsets, alpha, eAvg, minSup, prevFoffb, prevFoffe, 
foffb, foffe);
   [S, R, selCols] = createAndScoreBasicSlicesInc(X2, changedX2, prevTK2, 
totalE, changedE,
-     eAvg, eAvgOld, eAvgNew, minSup, alpha, minsc, maxsc, verbose, 
disableIncScorePruning);
+     eAvg, eAvgOld, eAvgNew, minSup, alpha, minsc, maxsc, maxscub, verbose, 
disableIncScorePruning);
 
   # initialize lattice and statistics for incremental updates
   L1 = S;
@@ -302,7 +308,8 @@ m_incSliceLine = function(
 createAndScoreBasicSlicesInc = function(Matrix[Double] X2, Matrix[Double] X2p,
     Matrix[Double] prevTK2, Matrix[Double] e, Matrix[Double] ep,
     Double eAvg, Double eAvgOld, Double eAvgNew, Double minSup, Double alpha,
-    Double minsc, Matrix[Double] maxsc, Boolean verbose, Boolean 
disableIncScorePruning)
+    Double minsc, Matrix[Double] maxsc, Matrix[Double] maxscub, Boolean 
verbose,
+    Boolean disableIncScorePruning)
   return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols)
 {
   n2 = ncol(X2);
@@ -329,6 +336,15 @@ createAndScoreBasicSlicesInc = function(Matrix[Double] X2, 
Matrix[Double] X2p,
     print("incSliceLine: dropping "+drop+"/"+n+" unaffected features.");
   }
 
+  # c) max score changed pruning
+  n = as.integer(sum(selCols2));
+  selCols2 = selCols2 & (maxscub >= max(0, minsc) | maxscub==-Inf);
+
+  if( verbose ) {
+    drop = as.integer(n - sum(selCols2));
+    print("incSliceLine: dropping "+drop+"/"+n+" insufficiently affected 
features.");
+  }
+
   # working set of active slices (#attr x #slices) and top k
   attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols2);
   ss = removeEmpty(target=cCnts, margin="rows", select=selCols2);
@@ -341,7 +357,7 @@ createAndScoreBasicSlicesInc = function(Matrix[Double] X2, 
Matrix[Double] X2p,
   sc = scoreInc(ss, se, eAvg, alpha, nrow(X2));
   R = cbind(sc, se, sm, ss);
 
-  # c) score pruning
+  # d) score pruning
   # compute upper bound scores for all remaining slices
   if(minsc > -Inf & !disableIncScorePruning) {
     ubSc = scoreUBInc(ss, se, sm, eAvg, minSup, alpha, nrow(X2));
@@ -521,7 +537,6 @@ evalSliceInc = function(Matrix[Double] X, Matrix[Double] e, 
Double eAvg,
   I = (X %*% tS) == l;    # slice indicator
   ss = t(colSums(I));     # absolute slice size (nnz)
   se = t(t(e) %*% I);     # absolute slice error
-
   sm = t(colMaxs(I * e)); # maximum tuple error in slice
 
   # score of relative error and relative size
@@ -621,7 +636,7 @@ pruneUnchangedSlices = function(Matrix[Double] S, 
Matrix[Double] S2, Matrix[Doub
   # a) determine unchagned slices
   # only computing unchanged slices for levels 2 and above,
   # Imat has a 1 where a slice in changedX2 belongs to a slice in 
prevLatAtLevel
-  if(nrow(S2) > 0 & nrow(unchangedS) > 0) {
+  if(nrow(S2) > 0 & sum(S2) > 0 & nrow(unchangedS) > 0) {
     # b) select unchanged slices that can be pruned
     I = t(colSums((changedX2 %*% t(unchangedS)) == level) == 0) # change 
pushdown
         & unchangedR[,4] < minSup; # minSup pushdown
@@ -724,6 +739,60 @@ getMaxScoreAllFeatures = function(Int numRows, Int 
numFeatures, List[Unknown] pr
   }
 }
 
+getMaxChangedScoreAllFeatures = function(Int numRows, Int numFeatures, 
Matrix[Double] addedX2,
+  Matrix[Double] removedX2, Matrix[Double] addedE, Matrix[Double] removedE,
+  List[Unknown] prevLattice, List[Unknown] metaPrevLattice, List[Unknown] 
prevStats,
+  Boolean encodeLat, Boolean differentOffsets, Double alpha, Double eAvg, 
Double minSup,
+  Matrix[Double] prevFoffb, Matrix[Double] prevFoffe, Matrix[Double] foffb, 
Matrix[Double] foffe)
+  return(Matrix[Double] maxscub)
+{
+  maxscub = matrix(-Inf, numFeatures, 1);
+  if( length(prevLattice) > 0 & nrow(addedX2) < 0.05*numRows ) {
+    # compute upper bounds per feature for added subset
+    ss = t(colSums(addedX2));
+    se = t(t(addedE) %*% addedX2);
+    sm = t(colMaxs(addedX2 * addedE))
+    maxscub = scoreUBInc(ss, se, sm, eAvg, minSup, alpha, numRows);
+
+    # compute high-probability upper-bound scores per feature
+    for(level in 1:length(prevLattice)) {
+      if( length(prevStats) >= level ) {
+        S = as.matrix(prevLattice[level]);
+        if( encodeLat ) {
+          metaAtLevel = as.frame(metaPrevLattice[level]);
+          prevLatticeDec = transformIDsToSlices(S, prevFoffb, prevFoffe, 
metaAtLevel);
+          if( nrow(S) > 0 )
+            S = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
+          else
+            S = matrix(0, 1, numFeatures);
+        }
+        else if(differentOffsets) {
+          prevLatticeDec = decodeOneHot(S, prevFoffb, prevFoffe);
+          S = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
+        }
+        R = as.matrix(prevStats[level]); #(sc,se,sm,ss)
+        # compute exact deltas/slice scores, and update max scores per feature 
despite the
+        # exact deltas we need to consider upper bounds because new slices, 
previously not
+        # considered, might appear (only high probability, for hard guarantees 
we would
+        # need to consider previously pruned slices too)
+        dssp = rowSums((S%*%t(addedX2))==level);               # size delta 
plus
+        dssm = rowSums((S%*%t(removedX2))==level);             # size delta 
minus
+        dsep = rowSums((S%*%t(addedX2)==level)*t(addedE));     # error delta 
plus
+        dsem = rowSums((S%*%t(removedX2)==level)*t(removedE)); # error delta 
minus
+        # compute interesting extreme points
+        sc1 = scoreInc(R[,4] - dssm,        R[,2] + dsep,        eAvg, alpha, 
numRows);
+        sc2 = scoreInc(R[,4],               R[,2] + dsep,        eAvg, alpha, 
numRows);
+        sc3 = scoreInc(R[,4] + dssp,        R[,2] + dsep,        eAvg, alpha, 
numRows);
+        sc4 = scoreInc(R[,4] + dssp - dssm, R[,2] + dsep - dsem, eAvg, alpha, 
numRows);
+        sc = max(sc1, sc2, sc3, sc4);
+        # robustness for S * sc creating NaNs because 0 * -Inf = NaN
+        sc = replace(target=sc, pattern=-Inf, replacement=0);
+        maxscub = max(maxscub, t(colMaxs(S*sc)));
+      }
+    }
+  }
+}
+
 preparePrevLattice = function(list[unknown] prevLattice, list[unknown] 
metaPrevLattice,
   Matrix[Double] prevFoffb, Matrix[Double] prevFoffe, Matrix[Double] foffb,
   Matrix[Double] foffe, Integer level, Boolean encodeLat, Boolean 
differentOffsets)
@@ -756,4 +825,3 @@ removeRowsByIndices = function(Matrix[Double] M, 
Matrix[Double] indices)
   P2 = table(seq(1, nrow(CIX)), CIX, nrow(CIX), nrow(M))
   remain = P2 %*% M;
 }
-

(systemds) branch main updated: [SYSTEMDS-3696] Additional pruning strategy for incremental slice line

Reply via email to