This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new a9cabbc87f [SYSTEMDS-3149] Robustness decisionTree/randomForest (many feat values)
a9cabbc87f is described below

commit a9cabbc87f5fa73383137834007e22f6fba98ae8
Author: Matthias Boehm <[email protected]>
AuthorDate: Wed Apr 12 22:34:58 2023 +0200

    [SYSTEMDS-3149] Robustness decisionTree/randomForest (many feat values)
    
    In scenarios where the selected features of decisionTree/randomForest
    have many distinct items (>50K), the vectorized evaluation causes
    severe memory pressure (and non-vectorized evaluation would be very
    slow). This patch adds a new parameter: max_values which is a log
    measure similar to max_features which allows sampling the evaluated
    values per feature (e.g., max_values=0.5 -> 50K^0.5 = 223) without
    much accuracy degradation. The real-data test cases have been adapted
    accordingly.
---
 scripts/builtin/decisionTree.dml                   | 22 +++++++++++----
 scripts/builtin/randomForest.dml                   |  8 ++++--
 .../part1/BuiltinDecisionTreeRealDataTest.java     | 33 ++++++++++++++++------
 .../functions/builtin/decisionTreeRealData.dml     |  7 +++--
 4 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml
index 4d4e273c65..6a85367526 100644
--- a/scripts/builtin/decisionTree.dml
+++ b/scripts/builtin/decisionTree.dml
@@ -36,6 +36,8 @@
 # min_split       Minimum number of samples in leaf for attempting a split
 # max_features    Parameter controlling the number of features used as split
 #                 candidates at tree nodes: m = ceil(num_features^max_features)
+# max_values      Parameter controlling the number of values per feature used
+#                 as split candidates: nb = ceil(num_values^max_values)
 # impurity        Impurity measure: entropy, gini (default)
 # seed            Fixed seed for randomization of samples and split candidates
 # verbose         Flag indicating verbose debug output
@@ -59,7 +61,8 @@
 # ------------------------------------------------------------------------------
 
 m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
-    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50, Double max_features = 0.5,
+    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
+    Double max_features = 0.5, Double max_values = 1.0,
     String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
   return(Matrix[Double] M)
 {
@@ -113,7 +116,7 @@ m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cty
     # find best split attribute
     nSeed = ifelse(seed==-1, seed, seed*nID);
     [f, v, IDleft, Ileft, IDright, Iright] = findBestSplit(
-      X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, impurity, nSeed);
+      X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, max_values, impurity, nSeed);
     validSplit = sum(Ileft) >= min_leaf & sum(Iright) >= min_leaf;
     if(verbose)
       print("-- best split: f"+f+" <= "+v+" --> valid="+validSplit);
@@ -147,7 +150,7 @@ m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cty
 }
 
 findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] foffb, Matrix[Double] foffe,
-    Int ID, Matrix[Double] I, Int min_leaf, Double max_features, String impurity, Int seed)
+    Int ID, Matrix[Double] I, Int min_leaf, Double max_features, Double max_values, String impurity, Int seed)
   return(Int f, Int v, Int IDleft, Matrix[Double] Ileft, Int IDright, Matrix[Double] Iright)
 {
   # sample features iff max_features < 1
@@ -173,8 +176,15 @@ findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] fo
     # construct 0/1 predicate vectors with <= semantics
     # find rows that match at least one value and appear in I
     # (vectorized evaluation, each column in P is a split candidate)
-    P = matrix(0, ncol(X2), belen+1);
-    P[beg:end-1,1:belen+1] = upper.tri(target=matrix(1,belen,belen+1), diag=TRUE);
+    fP = upper.tri(target=matrix(1,belen,belen+1), diag=TRUE);
+    vI = seq(1,belen+1);
+    if( max_values < 1.0 & ncol(fP)>10 ) {
+      rI2 = rand(rows=ncol(fP), cols=1, seed=seed) <= (ncol(fP)^max_values/ncol(fP));
+      fP = removeEmpty(target=fP, margin="cols", select=t(rI2));
+      vI = removeEmpty(target=vI, margin="rows", select=rI2);
+    }
+    P = matrix(0, ncol(X2), ncol(fP));
+    P[beg:end-1,1:ncol(fP)] = fP;
     Ileft = (t(X2 %*% P) * I) != 0;
     Iright = (Ileft==0) * I;
 
@@ -188,6 +198,8 @@ findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] fo
     valid = (rowSums(Ileft)>=min_leaf) & (rowSums(Iright)>=min_leaf);
     bestig = max(valid*ig);
     bestv = ifelse(bestig>0, nrow(valid)-as.scalar(rowIndexMax(t(rev(valid*ig))))+beg, -1);
+    if( bestv >= 0 )
+      bestv = as.scalar(vI[bestv-beg+1,1])+beg-1;
     R[,i] = as.matrix(list(f, bestig, bestv));
   }
   ix = as.scalar(rowIndexMax(R[2,]));
diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml
index 37f7f64fb4..fda266a3f4 100644
--- a/scripts/builtin/randomForest.dml
+++ b/scripts/builtin/randomForest.dml
@@ -40,6 +40,8 @@
 # min_split       Minimum number of samples in leaf for attempting a split
 # max_features    Parameter controlling the number of features used as split
 #                 candidates at tree nodes: m = ceil(num_features^max_features)
+# max_values      Parameter controlling the number of values per feature used
+#                 as split candidates: nb = ceil(num_values^max_values)
 # impurity        Impurity measure: entropy, gini (default)
 # seed            Fixed seed for randomization of samples and split candidates
 # verbose         Flag indicating verbose debug output
@@ -69,7 +71,8 @@
 
 m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
     Int num_trees = 16, Double sample_frac = 0.1, Double feature_frac = 1.0,
-    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50, Double max_features = 0.5,
+    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
+    Double max_features = 0.5, Double max_values = 1.0,
     String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
   return(Matrix[Double] M)
 {
@@ -123,7 +126,8 @@ m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cty
     t2 = time();
     si3 = as.integer(as.scalar(randSeeds[3*(i-1)+3,1]));
     Mtemp = decisionTree(X=Xi, y=yi, ctypes=ctypes, max_depth=max_depth, min_split=min_split,
-      min_leaf=min_leaf, max_features=max_features, impurity=impurity, seed=si3, verbose=verbose);
+      min_leaf=min_leaf, max_features=max_features, max_values=max_values,
+      impurity=impurity, seed=si3, verbose=verbose);
     M[i,1:length(Mtemp)] = matrix(Mtemp, rows=1, cols=length(Mtemp));
     if( verbose )
       print("-- ["+i+"] trained decision tree in "+(time()-t2)/1e9+" seconds.");
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java
index f797cfff09..3776132eb5 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java
@@ -42,23 +42,40 @@ public class BuiltinDecisionTreeRealDataTest extends AutomatedTestBase {
        }
 
        @Test
-       public void testDecisionTreeTitanic() {
-               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 1, ExecType.CP);
+       public void testDecisionTreeTitanic_MaxV1() {
+               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 1, 1.0, ExecType.CP);
        }
        
        @Test
-       public void testRandomForestTitanic1() {
+       public void testRandomForestTitanic1_MaxV1() {
                //one tree with sample_frac=1 should be equivalent to decision tree
-               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 2, ExecType.CP);
+               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 2, 1.0, ExecType.CP);
        }
        
        @Test
-       public void testRandomForestTitanic8() {
+       public void testRandomForestTitanic8_MaxV1() {
                //8 trees with sample fraction 0.125 each, accuracy 0.785 due to randomness
-               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.793, 9, ExecType.CP);
+               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.793, 9, 1.0, ExecType.CP);
+       }
+       
+       @Test
+       public void testDecisionTreeTitanic_MaxV06() {
+               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.871, 1, 0.6, ExecType.CP);
+       }
+       
+       @Test
+       public void testRandomForestTitanic1_MaxV06() {
+               //one tree with sample_frac=1 should be equivalent to decision tree
+               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.871, 2, 0.6, ExecType.CP);
+       }
+       
+       @Test
+       public void testRandomForestTitanic8_MaxV06() {
+               //8 trees with sample fraction 0.125 each, accuracy 0.785 due to randomness
+               runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.793, 9, 0.6, ExecType.CP);
        }
 
-       private void runDecisionTree(String data, String tfspec, double minAcc, int dt, ExecType instType) {
+       private void runDecisionTree(String data, String tfspec, double minAcc, int dt, double maxV, ExecType instType) {
                Types.ExecMode platformOld = setExecMode(instType);
                try {
                        loadTestConfiguration(getTestConfiguration(TEST_NAME));
@@ -66,7 +83,7 @@ public class BuiltinDecisionTreeRealDataTest extends AutomatedTestBase {
                        String HOME = SCRIPT_DIR + TEST_DIR;
                        fullDMLScriptName = HOME + TEST_NAME + ".dml";
                        programArgs = new String[] {"-stats",
-                               "-args", data, tfspec, String.valueOf(dt), output("R")};
+                               "-args", data, tfspec, String.valueOf(dt), String.valueOf(maxV), output("R")};
 
                        runTest(true, false, null, -1);
 
diff --git a/src/test/scripts/functions/builtin/decisionTreeRealData.dml b/src/test/scripts/functions/builtin/decisionTreeRealData.dml
index f61b2de77e..446e50abf6 100644
--- a/src/test/scripts/functions/builtin/decisionTreeRealData.dml
+++ b/src/test/scripts/functions/builtin/decisionTreeRealData.dml
@@ -31,13 +31,14 @@ X = X[, 1:ncol(X)-1]
 X = imputeByMode(X);
 
 if( $3==1 ) {
-  M = decisionTree(X=X, y=Y, ctypes=R, max_features=1,
+  M = decisionTree(X=X, y=Y, ctypes=R, max_features=1, max_values=$4,
                    min_split=10, min_leaf=4, seed=7, verbose=TRUE);
   yhat = decisionTreePredict(X=X, y=Y, ctypes=R, M=M)
 }
 else {
   sf = 1.0/($3-1);
-  M = randomForest(X=X, y=Y, ctypes=R, sample_frac=sf, num_trees=$3-1, max_features=1,
+  M = randomForest(X=X, y=Y, ctypes=R, sample_frac=sf, num_trees=$3-1,
+                   max_features=1, max_values=$4,
                    min_split=10, min_leaf=4, seed=7, verbose=TRUE);
   yhat = randomForestPredict(X=X, y=Y, ctypes=R,  M=M)
 }
@@ -46,4 +47,4 @@ acc = as.matrix(mean(yhat == Y))
 err = 1-(acc);
 print("accuracy: "+as.scalar(acc))
 
-write(acc, $4);
+write(acc, $5);

Reply via email to