This is an automated email from the ASF dual-hosted git repository. baunsgaard pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
commit ddcc1867788d21dc2da607e5b858109b217c1dc7 Author: baunsgaard <[email protected]> AuthorDate: Wed May 10 10:53:10 2023 +0200 [MINOR] Update DecisionTree and RandomForest docs This commit updates the documentation of Decision Tree and Random Forest to work with the python docs. --- scripts/builtin/decisionTree.dml | 31 +++++++++-------- scripts/builtin/randomForest.dml | 39 ++++++++++++---------- .../operator/algorithm/builtin/decisionTree.py | 29 ++++++++++------ .../operator/algorithm/builtin/randomForest.py | 36 +++++++++++--------- 4 files changed, 78 insertions(+), 57 deletions(-) diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml index 85e414c61d..41ba72e024 100644 --- a/scripts/builtin/decisionTree.dml +++ b/scripts/builtin/decisionTree.dml @@ -24,6 +24,23 @@ # regression tree) decision trees depending on the provided labels y, either # classification (majority vote per leaf) or regression (average per leaf). # +# .. code-block:: +# +# For example, give a feature matrix with features [a,b,c,d] +# and the following trees, M would look as follows: +# +# (L1) |d<5| +# / \ +# (L2) P1:2 |a<7| +# / \ +# (L3) P2:2 P3:1 +# +# --> M := +# [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]] +# |(L1)| | (L2) | | (L3) | +# +# +# # INPUT: # ------------------------------------------------------------------------------ # X Feature matrix in recoded/binned representation @@ -45,19 +62,7 @@ # # OUTPUT: # ------------------------------------------------------------------------------ -# M Matrix M containing the learne trees, in linearized form -# For example, give a feature matrix with features [a,b,c,d] -# and the following trees, M would look as follows: -# -# (L1) |d<5| -# / \ -# (L2) P1:2 |a<7| -# / \ -# (L3) P2:2 P3:1 -# -# --> M := -# [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]] -# |(L1)| | (L2) | | (L3) | +# M Matrix M containing the learned trees, in linearized form # 
------------------------------------------------------------------------------ m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes, diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml index 7e39c9064e..ccebd59d86 100644 --- a/scripts/builtin/randomForest.dml +++ b/scripts/builtin/randomForest.dml @@ -26,6 +26,26 @@ # and optionally subset of features (columns). During tree construction, split # candidates are additionally chosen on a sample of remaining features. # +# .. code-block:: +# +# For example, given a feature matrix with features [a,b,c,d] +# and the following two trees, M (the output) would look as follows: +# +# (L1) |a<7| |d<5| +# / \ / \ +# (L2) |c<3| |b<4| |a<7| P3:2 +# / \ / \ / \ +# (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1 +# --> M := +# [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) +# [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) +# |(L1)| | (L2) | | (L3) | +# +# With feature sampling (feature_frac < 1), each tree is +# prefixed by a one-hot vector of sampled features +# (e.g., [1,1,1,0] if we sampled a,b,c of the four features) +# +# # INPUT: # ------------------------------------------------------------------------------ # X Feature matrix in recoded/binned representation @@ -49,24 +69,7 @@ # # OUTPUT: # ------------------------------------------------------------------------------ -# M Matrix M containing the learned trees, in linearized form -# For example, give a feature matrix with features [a,b,c,d] -# and the following two trees, M would look as follows: -# -# (L1) |a<7| |d<5| -# / \ / \ -# (L2) |c<3| |b<4| |a<7| P3:2 -# / \ / \ / \ -# (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1 -# -# --> M := -# [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) -# [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) -# |(L1)| | (L2) | | (L3) | -# -# With feature sampling (feature_frac < 1), each tree is -# prefixed by a one-hot vector of sampled features -# (e.g., [1,1,1,0] if we 
sampled a,b,c of the four features) +# M Matrix M containing the learned trees, in linearized form. # ------------------------------------------------------------------------------ m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes, diff --git a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py index 399a21fd50..38ab517a8a 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py +++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py @@ -39,6 +39,23 @@ def decisionTree(X: Matrix, regression tree) decision trees depending on the provided labels y, either classification (majority vote per leaf) or regression (average per leaf). + .. code-block:: + + For example, give a feature matrix with features [a,b,c,d] + and the following trees, M would look as follows: + + (L1) |d<5| + / \ + (L2) P1:2 |a<7| + / \ + (L3) P2:2 P3:1 + + --> M := + [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]] + |(L1)| | (L2) | | (L3) | + + + :param X: Feature matrix in recoded/binned representation @@ -56,17 +73,7 @@ def decisionTree(X: Matrix, :param impurity: Impurity measure: entropy, gini (default), rss (regression) :param seed: Fixed seed for randomization of samples and split candidates :param verbose: Flag indicating verbose debug output - :return: Matrix M containing the learne trees, in linearized form - For example, give a feature matrix with features [a,b,c,d] - and the following trees, M would look as follows: - (L1) |d<5| - / \ - (L2) P1:2 |a<7| - / \ - (L3) P2:2 P3:1 - --> M := - [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]] - |(L1)| | (L2) | | (L3) | + :return: Matrix M containing the learned trees, in linearized form """ params_dict = {'X': X, 'y': y, 'ctypes': ctypes} diff --git a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py index 
5c4bb0438a..c0659d47bd 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py +++ b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py @@ -41,6 +41,26 @@ def randomForest(X: Matrix, and optionally subset of features (columns). During tree construction, split candidates are additionally chosen on a sample of remaining features. + .. code-block:: + + For example, given a feature matrix with features [a,b,c,d] + and the following two trees, M (the output) would look as follows: + + (L1) |a<7| |d<5| + / \ / \ + (L2) |c<3| |b<4| |a<7| P3:2 + / \ / \ / \ + (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1 + --> M := + [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) + [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) + |(L1)| | (L2) | | (L3) | + + With feature sampling (feature_frac < 1), each tree is + prefixed by a one-hot vector of sampled features + (e.g., [1,1,1,0] if we sampled a,b,c of the four features) + + :param X: Feature matrix in recoded/binned representation @@ -60,21 +80,7 @@ def randomForest(X: Matrix, :param impurity: Impurity measure: entropy, gini (default), rss (regression) :param seed: Fixed seed for randomization of samples and split candidates :param verbose: Flag indicating verbose debug output - :return: Matrix M containing the learned trees, in linearized form - For example, give a feature matrix with features [a,b,c,d] - and the following two trees, M would look as follows: - (L1) |a<7| |d<5| - / \ / \ - (L2) |c<3| |b<4| |a<7| P3:2 - / \ / \ / \ - (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1 - --> M := - [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) - [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) - |(L1)| | (L2) | | (L3) | - With feature sampling (feature_frac < 1), each tree is - prefixed by a one-hot vector of sampled features - (e.g., [1,1,1,0] if we sampled a,b,c of the four features) + :return: Matrix M containing the learned trees, in linearized form. 
""" params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
