This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new a9cabbc87f [SYSTEMDS-3149] Robustness decisionTree/randomForest (many feat values)
a9cabbc87f is described below
commit a9cabbc87f5fa73383137834007e22f6fba98ae8
Author: Matthias Boehm <[email protected]>
AuthorDate: Wed Apr 12 22:34:58 2023 +0200
[SYSTEMDS-3149] Robustness decisionTree/randomForest (many feat values)
In scenarios where the selected features of decisionTree/randomForest
have many distinct values (>50K), the vectorized evaluation causes
severe memory pressure (and non-vectorized evaluation would be very
slow). This patch adds a new parameter max_values, a log measure
similar to max_features, which allows sampling the evaluated split
values per feature (e.g., max_values=0.5 -> ceil(50K^0.5) = 224
candidates) without much accuracy degradation. The real-data test
cases have been adapted accordingly.
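For intuition, a minimal standalone DML sketch of the sampling scheme
(the names n, max_values, and keepProb are chosen for exposition; the
patch itself applies this inside findBestSplit to the candidate matrix fP):

  # keep each of n candidate values with probability n^max_values / n,
  # retaining ~ceil(n^max_values) split candidates in expectation
  n = 50000;
  max_values = 0.5;
  keepProb = n^max_values / n;                    # ~0.0045 for n=50K
  rI = rand(rows=n, cols=1, seed=7) <= keepProb;  # 0/1 selection vector
  print("expected: " + ceil(n^max_values) + ", sampled: " + sum(rI));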
---
scripts/builtin/decisionTree.dml | 22 +++++++++++----
scripts/builtin/randomForest.dml | 8 ++++--
.../part1/BuiltinDecisionTreeRealDataTest.java | 33 ++++++++++++++++------
.../functions/builtin/decisionTreeRealData.dml | 7 +++--
4 files changed, 52 insertions(+), 18 deletions(-)
diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml
index 4d4e273c65..6a85367526 100644
--- a/scripts/builtin/decisionTree.dml
+++ b/scripts/builtin/decisionTree.dml
@@ -36,6 +36,8 @@
# min_split Minimum number of samples in leaf for attempting a split
# max_features Parameter controlling the number of features used as split
# candidates at tree nodes: m = ceil(num_features^max_features)
+# max_values Parameter controlling the number of values per feature used
+# as split candidates: nb = ceil(num_values^max_values)
# impurity Impurity measure: entropy, gini (default)
# seed Fixed seed for randomization of samples and split candidates
# verbose Flag indicating verbose debug output
@@ -59,7 +61,8 @@
# ------------------------------------------------------------------------------
m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
- Int max_depth = 10, Int min_leaf = 20, Int min_split = 50, Double max_features = 0.5,
+ Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
+ Double max_features = 0.5, Double max_values = 1.0,
String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
return(Matrix[Double] M)
{
@@ -113,7 +116,7 @@ m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cty
# find best split attribute
nSeed = ifelse(seed==-1, seed, seed*nID);
[f, v, IDleft, Ileft, IDright, Iright] = findBestSplit(
- X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, impurity, nSeed);
+ X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, max_values, impurity, nSeed);
validSplit = sum(Ileft) >= min_leaf & sum(Iright) >= min_leaf;
if(verbose)
print("-- best split: f"+f+" <= "+v+" --> valid="+validSplit);
@@ -147,7 +150,7 @@ m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cty
}
findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] foffb, Matrix[Double] foffe,
- Int ID, Matrix[Double] I, Int min_leaf, Double max_features, String impurity, Int seed)
+ Int ID, Matrix[Double] I, Int min_leaf, Double max_features, Double max_values, String impurity, Int seed)
return(Int f, Int v, Int IDleft, Matrix[Double] Ileft, Int IDright, Matrix[Double] Iright)
{
# sample features iff max_features < 1
@@ -173,8 +176,15 @@ findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] fo
# construct 0/1 predicate vectors with <= semantics
# find rows that match at least one value and appear in I
# (vectorized evaluation, each column in P is a split candidate)
- P = matrix(0, ncol(X2), belen+1);
- P[beg:end-1,1:belen+1] = upper.tri(target=matrix(1,belen,belen+1), diag=TRUE);
+ fP = upper.tri(target=matrix(1,belen,belen+1), diag=TRUE);
+ vI = seq(1,belen+1);
+ if( max_values < 1.0 & ncol(fP)>10 ) {
+ rI2 = rand(rows=ncol(fP), cols=1, seed=seed) <= (ncol(fP)^max_values/ncol(fP));
+ fP = removeEmpty(target=fP, margin="cols", select=t(rI2));
+ vI = removeEmpty(target=vI, margin="rows", select=rI2);
+ }
+ P = matrix(0, ncol(X2), ncol(fP));
+ P[beg:end-1,1:ncol(fP)] = fP;
Ileft = (t(X2 %*% P) * I) != 0;
Iright = (Ileft==0) * I;
@@ -188,6 +198,8 @@ findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] fo
valid = (rowSums(Ileft)>=min_leaf) & (rowSums(Iright)>=min_leaf);
bestig = max(valid*ig);
bestv = ifelse(bestig>0, nrow(valid)-as.scalar(rowIndexMax(t(rev(valid*ig))))+beg, -1);
+ if( bestv >= 0 )
+ bestv = as.scalar(vI[bestv-beg+1,1])+beg-1;
R[,i] = as.matrix(list(f, bestig, bestv));
}
ix = as.scalar(rowIndexMax(R[2,]));
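Aside (commentary, not part of the patch): the added vI re-mapping is
needed because, after column sampling, rowIndexMax positions refer to
the sampled candidate set rather than the original value range. A toy
DML illustration with hypothetical values:

  vI = matrix("1 3 6", rows=3, cols=1);     # surviving candidate offsets
  bestSampled = 2;                          # best split among sampled columns
  bestOrig = as.scalar(vI[bestSampled,1]);  # maps back to original offset 3
  print("original candidate offset: " + bestOrig);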
diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml
index 37f7f64fb4..fda266a3f4 100644
--- a/scripts/builtin/randomForest.dml
+++ b/scripts/builtin/randomForest.dml
@@ -40,6 +40,8 @@
# min_split Minimum number of samples in leaf for attempting a split
# max_features Parameter controlling the number of features used as split
# candidates at tree nodes: m = ceil(num_features^max_features)
+# max_values Parameter controlling the number of values per feature used
+# as split candidates: nb = ceil(num_values^max_values)
# impurity Impurity measure: entropy, gini (default)
# seed Fixed seed for randomization of samples and split candidates
# verbose Flag indicating verbose debug output
@@ -69,7 +71,8 @@
m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
Int num_trees = 16, Double sample_frac = 0.1, Double feature_frac = 1.0,
- Int max_depth = 10, Int min_leaf = 20, Int min_split = 50, Double max_features = 0.5,
+ Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
+ Double max_features = 0.5, Double max_values = 1.0,
String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
return(Matrix[Double] M)
{
@@ -123,7 +126,8 @@ m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] cty
t2 = time();
si3 = as.integer(as.scalar(randSeeds[3*(i-1)+3,1]));
Mtemp = decisionTree(X=Xi, y=yi, ctypes=ctypes, max_depth=max_depth, min_split=min_split,
- min_leaf=min_leaf, max_features=max_features, impurity=impurity, seed=si3, verbose=verbose);
+ min_leaf=min_leaf, max_features=max_features, max_values=max_values,
+ impurity=impurity, seed=si3, verbose=verbose);
M[i,1:length(Mtemp)] = matrix(Mtemp, rows=1, cols=length(Mtemp));
if( verbose )
print("-- ["+i+"] trained decision tree in "+(time()-t2)/1e9+"
seconds.");
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java
index f797cfff09..3776132eb5 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinDecisionTreeRealDataTest.java
@@ -42,23 +42,40 @@ public class BuiltinDecisionTreeRealDataTest extends AutomatedTestBase {
}
@Test
- public void testDecisionTreeTitanic() {
- runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 1, ExecType.CP);
+ public void testDecisionTreeTitanic_MaxV1() {
+ runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 1, 1.0, ExecType.CP);
}
@Test
- public void testRandomForestTitanic1() {
+ public void testRandomForestTitanic1_MaxV1() {
//one tree with sample_frac=1 should be equivalent to decision tree
- runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 2, ExecType.CP);
+ runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.875, 2, 1.0, ExecType.CP);
}
@Test
- public void testRandomForestTitanic8() {
+ public void testRandomForestTitanic8_MaxV1() {
//8 trees with sample fraction 0.125 each, accuracy 0.785 due to randomness
- runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.793, 9, ExecType.CP);
+ runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.793, 9, 1.0, ExecType.CP);
+ }
+
+ @Test
+ public void testDecisionTreeTitanic_MaxV06() {
+ runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.871, 1, 0.6, ExecType.CP);
+ }
+
+ @Test
+ public void testRandomForestTitanic1_MaxV06() {
+ //one tree with sample_frac=1 should be equivalent to decision tree
+ runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.871, 2, 0.6, ExecType.CP);
+ }
+
+ @Test
+ public void testRandomForestTitanic8_MaxV06() {
+ //8 trees with sample fraction 0.125 each, accuracy 0.785 due to randomness
+ runDecisionTree(TITANIC_DATA, TITANIC_TFSPEC, 0.793, 9, 0.6, ExecType.CP);
}
- private void runDecisionTree(String data, String tfspec, double minAcc, int dt, ExecType instType) {
+ private void runDecisionTree(String data, String tfspec, double minAcc, int dt, double maxV, ExecType instType) {
Types.ExecMode platformOld = setExecMode(instType);
try {
loadTestConfiguration(getTestConfiguration(TEST_NAME));
@@ -66,7 +83,7 @@ public class BuiltinDecisionTreeRealDataTest extends AutomatedTestBase {
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[] {"-stats",
- "-args", data, tfspec, String.valueOf(dt),
output("R")};
+ "-args", data, tfspec, String.valueOf(dt),
String.valueOf(maxV), output("R")};
runTest(true, false, null, -1);
diff --git a/src/test/scripts/functions/builtin/decisionTreeRealData.dml b/src/test/scripts/functions/builtin/decisionTreeRealData.dml
index f61b2de77e..446e50abf6 100644
--- a/src/test/scripts/functions/builtin/decisionTreeRealData.dml
+++ b/src/test/scripts/functions/builtin/decisionTreeRealData.dml
@@ -31,13 +31,14 @@ X = X[, 1:ncol(X)-1]
X = imputeByMode(X);
if( $3==1 ) {
- M = decisionTree(X=X, y=Y, ctypes=R, max_features=1,
+ M = decisionTree(X=X, y=Y, ctypes=R, max_features=1, max_values=$4,
min_split=10, min_leaf=4, seed=7, verbose=TRUE);
yhat = decisionTreePredict(X=X, y=Y, ctypes=R, M=M)
}
else {
sf = 1.0/($3-1);
- M = randomForest(X=X, y=Y, ctypes=R, sample_frac=sf, num_trees=$3-1, max_features=1,
+ M = randomForest(X=X, y=Y, ctypes=R, sample_frac=sf, num_trees=$3-1,
+ max_features=1, max_values=$4,
min_split=10, min_leaf=4, seed=7, verbose=TRUE);
yhat = randomForestPredict(X=X, y=Y, ctypes=R, M=M)
}
@@ -46,4 +47,4 @@ acc = as.matrix(mean(yhat == Y))
err = 1-(acc);
print("accuracy: "+as.scalar(acc))
-write(acc, $4);
+write(acc, $5);
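For reference, the adapted randomForest call from the test script, shown
standalone with a concrete sampling level (X, Y, and R prepared as in
decisionTreeRealData.dml; parameter values illustrative):

  M = randomForest(X=X, y=Y, ctypes=R, sample_frac=0.125, num_trees=8,
    max_features=1, max_values=0.6, min_split=10, min_leaf=4, seed=7,
    verbose=TRUE);
  yhat = randomForestPredict(X=X, y=Y, ctypes=R, M=M);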