Repository: incubator-madlib
Updated Branches:
  refs/heads/master 975d34e43 -> c82b9d0ad
Decision Tree: Multiple fixes - pruning, tree_depth, viz

This commit includes the following changes:
- Pruning is not performed when cp = 0 (the default behavior); a usage sketch appears at the end of this message.
- An off-by-one bug is fixed: the user input max_depth starts from 0, while the internal tree_depth starts from 1. This discrepancy was not taken into account when checking tree-train termination, leading to trees containing only two leaf nodes on the last level (see the note following the DT_impl.hpp diff below).
- Integer categorical variables are treated as ordered and hence are not re-ordered. If the original entropy-based ordering is desired, the integer column needs to be cast to TEXT; the sketch at the end of this message shows the cast.
- Visualization is improved: nodes that split on a categorical feature now show only the last value in the split instead of the complete list. This is consistent with the visualization in scikit-learn.

Closes #111

Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/c82b9d0a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/c82b9d0a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/c82b9d0a

Branch: refs/heads/master
Commit: c82b9d0ad844a03ef5f16111fa73e441a03850d5
Parents: 975d34e
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Apr 14 17:21:19 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Apr 14 17:23:30 2017 -0700

----------------------------------------------------------------------
 src/modules/recursive_partitioning/DT_impl.hpp  |  88 ++++++--
 .../recursive_partitioning/decision_tree.cpp    |   4 +-
 .../recursive_partitioning/feature_encoding.cpp |   6 +-
 .../recursive_partitioning/decision_tree.py_in  | 203 +++++++++++--------
 .../recursive_partitioning/decision_tree.sql_in |  19 +-
 .../recursive_partitioning/random_forest.py_in  |  25 ++-
 .../test/decision_tree.sql_in                   |   1 +
 .../modules/validation/cross_validation.py_in   |   1 -
 8 files changed, 217 insertions(+), 130 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/modules/recursive_partitioning/DT_impl.hpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp index f622f94..64d2b88 100644 --- a/src/modules/recursive_partitioning/DT_impl.hpp +++ b/src/modules/recursive_partitioning/DT_impl.hpp @@ -574,7 +574,6 @@ DecisionTree<Container>::expand(const Accumulator &state, feature_indices(i) = FINISHED_LEAF; } } - return training_finished; } // ------------------------------------------------------------------------- @@ -1025,11 +1024,10 @@ DecisionTree<Container>::shouldSplit(const ColumnVector &combined_stats, uint64_t thresh_min_bucket = (min_bucket == 0) ?
1u : min_bucket; uint64_t true_count = statCount(combined_stats.segment(0, stats_per_split)); uint64_t false_count = statCount(combined_stats.segment(stats_per_split, stats_per_split)); - return ((true_count + false_count) >= min_split && true_count >= thresh_min_bucket && false_count >= thresh_min_bucket && - tree_depth <= max_depth); + tree_depth <= max_depth + 1); } // ------------------------------------------------------------------------ @@ -1107,11 +1105,32 @@ DecisionTree<Container>::displayLeafNode( display_str << "\"" << id_prefix << id << "\" [label=\"" << predict_str.str(); if(verbose){ - display_str << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = "; + display_str << "\\n impurity = "<< impurity(predictions.row(id)) + << "\\n samples = " << statCount(predictions.row(id)) + << "\\n value = "; if (is_regression) display_str << statPredict(predictions.row(id)); else{ - display_str << "[" << predictions.row(id).head(n_y_labels)<< "]"; + display_str << "["; + // NUM_PER_LINE: inserting new lines at fixed intervals + // avoids a really long 'value' line + const uint16_t NUM_PER_LINE = 10; + // note: last element of predictions is 'statCount' and + // can be ignored + const Index pred_size = predictions.row(id).size() - 1; + for (Index i = 0; i < pred_size; i += NUM_PER_LINE){ + uint16_t n_elem; + if (i + NUM_PER_LINE <= pred_size) { + // not overflowing the vector + n_elem = NUM_PER_LINE; + } else { + // less than NUM_PER_LINE left, avoid reading past the end + n_elem = pred_size - i; + } + display_str << predictions.row(id).segment(i, n_elem) << "\n"; + } + display_str << "]"; + } } display_str << "\",shape=box]" << ";"; @@ -1143,23 +1162,46 @@ DecisionTree<Container>::displayInternalNode( label_str << escape_quotes(feature_name) << " <= " << feature_thresholds(id); } else { feature_name = get_text(cat_features_str, feature_indices(id)); - label_str << escape_quotes(feature_name) << " in " - << getCatLabels(feature_indices(id), - static_cast<Index>(0), - static_cast<Index>(feature_thresholds(id)), - cat_levels_text, cat_n_levels); + label_str << escape_quotes(feature_name) << " <= "; + + // Text for all categoricals are stored in a flat array (cat_levels_text); + // find the appropriate index for this node + size_t to_skip = 0; + for (Index i=0; i < feature_indices(id); i++) + to_skip += cat_n_levels[i]; + const size_t index = to_skip + feature_thresholds(id); + label_str << get_text(cat_levels_text, index); } std::stringstream display_str; display_str << "\"" << id_prefix << id << "\" [label=\"" << label_str.str(); if(verbose){ + display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)); - display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = "; + display_str << "\\n value = "; if (is_regression) display_str << statPredict(predictions.row(id)); else{ - display_str << "[" << predictions.row(id).head(n_y_labels)<< "]"; + display_str << "["; + // NUM_PER_LINE: inserting new lines at fixed interval + // avoids really long 'value' line + const uint16_t NUM_PER_LINE = 10; + // note: last element of predictions is just 'statCount' and needs to + // be ignored + const Index pred_size = predictions.row(id).size() - 1; + for (Index i = 0; i < pred_size; i += NUM_PER_LINE){ + uint16_t n_elem; + if (i + NUM_PER_LINE <= pred_size) { + // not overflowing the vector + n_elem = NUM_PER_LINE; + } else { + n_elem = pred_size - i; + } + 
display_str << predictions.row(id).segment(i, n_elem) << "\n"; + } + display_str << "]"; } + std::stringstream predict_str; if (static_cast<bool>(is_regression)){ predict_str << predict_response(id); @@ -1360,20 +1402,24 @@ DecisionTree<Container>::getCatLabels(Index cat_index, Index end_value, ArrayHandle<text*> &cat_levels_text, ArrayHandle<int> &cat_n_levels) { + Index MAX_LABELS = 5; size_t to_skip = 0; for (Index i=0; i < cat_index; i++) { to_skip += cat_n_levels[i]; } std::stringstream cat_levels; - size_t start_index; + size_t index; cat_levels << "{"; - for (start_index = to_skip + start_value; - start_index < to_skip + end_value && - start_index < cat_levels_text.size(); - start_index++) { - cat_levels << get_text(cat_levels_text, start_index) << ","; + for (index = to_skip + start_value; + index < to_skip + end_value && index < cat_levels_text.size(); + index++) { + cat_levels << get_text(cat_levels_text, index) << ","; + if (index > to_skip + start_value + MAX_LABELS){ + cat_levels << " ... "; + break; + } } - cat_levels << get_text(cat_levels_text, start_index) << "}"; + cat_levels << get_text(cat_levels_text, index) << "}"; return cat_levels.str(); } // ------------------------------------------------------------------------- @@ -1575,7 +1621,7 @@ TreeAccumulator<Container, DTree>::operator<<(const tuple_type& inTuple) { uint16_t n_non_leaf_nodes = static_cast<uint16_t>(n_leaf_nodes - 1); Index dt_search_index = dt.search(cat_features, con_features); if (dt.feature_indices(dt_search_index) != dt.FINISHED_LEAF && - dt.feature_indices(dt_search_index) != dt.NODE_NON_EXISTING) { + dt.feature_indices(dt_search_index) != dt.NODE_NON_EXISTING) { Index row_index = dt_search_index - n_non_leaf_nodes; assert(row_index >= 0); // add this row into the stats for the node @@ -1651,7 +1697,7 @@ TreeAccumulator<Container, DTree>::operator<<(const surr_tuple_type& inTuple) { double primary_val = is_primary_cat ? cat_features(primary_index) : con_features(primary_index); - // We only capture statistics for rows that: + // Only capture statistics for rows that: // 1. lead to leaf nodes in the last layer. Surrogates for other nodes // have already been trained. // 2. have non-null values for the primary split. 
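Note on the max_depth fix in the shouldSplit() hunk above: the user-facing max_depth counts levels below the root (root = depth 0), while the internal tree_depth starts at 1, hence the relaxed bound "tree_depth <= max_depth + 1". A minimal sketch of the user-visible effect through the SQL interface, reusing the dt_golf example table from the module tests; the exact tree_train overload with trailing optional arguments omitted is an assumption based on the module documentation, not part of this commit:

    -- Sketch: with the corrected bound, the trained tree can now contain
    -- splits at the last allowed level instead of terminating one early.
    DROP TABLE IF EXISTS train_output, train_output_summary;
    SELECT madlib.tree_train('dt_golf',               -- source table
                             'train_output',          -- output model table
                             'id',                    -- row id column
                             'class',                 -- response
                             '"OUTLOOK", humidity',   -- features
                             NULL,                    -- features to exclude
                             'gini',                  -- split criterion
                             NULL,                    -- no grouping
                             NULL,                    -- no weights
                             2);                      -- max_depth (root = 0)
    -- tree_depth in the model table now reflects the full trained depth:
    SELECT tree_depth FROM train_output;
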
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/modules/recursive_partitioning/decision_tree.cpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp index 7a8ec95..b298df8 100644 --- a/src/modules/recursive_partitioning/decision_tree.cpp +++ b/src/modules/recursive_partitioning/decision_tree.cpp @@ -237,7 +237,9 @@ dt_apply::run(AnyType & args){ } AnyType output_tuple; - output_tuple << dt.storage() << return_code << static_cast<uint16_t>(dt.tree_depth - 1); + output_tuple << dt.storage() + << return_code + << static_cast<uint16_t>(dt.tree_depth - 1); return output_tuple; } // apply function // ------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/modules/recursive_partitioning/feature_encoding.cpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/feature_encoding.cpp b/src/modules/recursive_partitioning/feature_encoding.cpp index 0b868df..20856e2 100644 --- a/src/modules/recursive_partitioning/feature_encoding.cpp +++ b/src/modules/recursive_partitioning/feature_encoding.cpp @@ -156,11 +156,11 @@ p_log2_p(const double &p) { AnyType dst_compute_entropy_final::run(AnyType &args){ MappedIntegerVector state = args[0].getAs<MappedIntegerVector>(); - double sum = static_cast<double>(state.sum()); - ColumnVector probilities = state.cast<double>() / sum; + double sum_of_dep_counts = static_cast<double>(state.sum()); + ColumnVector probs = state.cast<double>() / sum_of_dep_counts; // usage of unaryExpr with functor: // http://eigen.tuxfamily.org/dox/classEigen_1_1MatrixBase.html#a23fc4bf97168dee2516f85edcfd4cfe7 - return -(probilities.unaryExpr(std::ptr_fun(p_log2_p))).sum(); + return -(probs.unaryExpr(std::ptr_fun(p_log2_p))).sum(); } // ------------------------------------------------------------ http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 40f4b7e..fb18278 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -27,7 +27,6 @@ from utilities.validate_args import table_is_empty from utilities.validate_args import columns_exist_in_table from utilities.validate_args import is_var_valid from utilities.validate_args import unquote_ident -from utilities.validate_args import quote_ident from utilities.utilities import _assert from utilities.utilities import extract_keyvalue_params from utilities.utilities import unique_string @@ -137,14 +136,14 @@ def _get_features_to_use(schema_madlib, training_table_name, # ------------------------------------------------------------ -def _get_col_value(input_dict, col_name): +def _dict_get_quoted(input_dict, col_name): """Return value from dict where key could be quoted or unquoted name""" return input_dict.get( col_name, input_dict.get(unquote_ident(col_name))) # ------------------------------------------------------------------------- -def _classify_features(all_feature_names_types, features): +def 
_classify_features(feature_to_type, features): """ Returns 1) an array of categorical features (all casted to string) 2) an array of continuous features @@ -155,24 +154,28 @@ def _classify_features(all_feature_names_types, features): text_types = ['text', 'varchar', 'character varying', 'char', 'character'] boolean_types = ['boolean'] cat_types = int_types + text_types + boolean_types + ordered_cat_types = int_types + + cat_features = [c for c in features + if _dict_get_quoted(feature_to_type, c) in cat_types] + ordered_cat_features = [c for c in features if _dict_get_quoted( + feature_to_type, c) in ordered_cat_types] - cat_features = [col for col in features - if _get_col_value(all_feature_names_types, col) in cat_types] cat_features_set = set(cat_features) # continuous types - 'real' is cast to 'double precision' for uniformity con_types = ['real', 'float8', 'double precision'] - con_features = [col for col in features - if (not col in cat_features_set and - _get_col_value(all_feature_names_types, col) in con_types)] + con_features = [c for c in features + if (c not in cat_features_set and + _dict_get_quoted(feature_to_type, c) in con_types)] # In order to be able to form an array, all categorical variables - # will be casted into TEXT type, but GPDB cannot cast a boolean - # directly into a text. Thus, boolean type categorical variables - # needs special treatment: cast them into integers before casting + # will be cast into TEXT type, but GPDB cannot cast a boolean + # directly into a text. Thus, boolean categorical variables + # need special treatment: cast them into integers before casting # into text. - boolean_cats = [col for col in features - if _get_col_value(all_feature_names_types, col) in boolean_types] - return cat_features, con_features, boolean_cats + boolean_cats = [c for c in features + if _dict_get_quoted(feature_to_type, c) in boolean_types] + return cat_features, ordered_cat_features, con_features, boolean_cats # ------------------------------------------------------------ @@ -357,8 +360,9 @@ def _extract_pruning_params(pruning_params_str): def _get_tree_states(schema_madlib, is_classification, split_criterion, training_table_name, output_table_name, id_col_name, dependent_variable, dep_is_bool, - grouping_cols, cat_features, con_features, - n_bins, boolean_cats, min_split, min_bucket, weights, + grouping_cols, cat_features, ordered_cat_features, + con_features, n_bins, boolean_cats, + min_split, min_bucket, weights, max_depth, grp_key_to_cp, compute_cp_list=False, max_n_surr=0, **kwargs): """ @@ -407,8 +411,9 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # 3) Find the splitting bins, one dict containing two arrays: # categorical bins and continuous bins bins = _get_bins(schema_madlib, training_table_name, cat_features, - con_features, n_bins, dep_var_str, boolean_cats, n_rows, - is_classification, dep_n_levels, filter_null) + ordered_cat_features, con_features, n_bins, + dep_var_str, boolean_cats, + n_rows, is_classification, dep_n_levels, filter_null) # some features may be dropped if they have only one value cat_features = bins['cat_features'] @@ -429,7 +434,8 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # result in excessive memory usage. 
plpy.notice("Analyzing data to compute split boundaries for variables") bins = _get_bins_grps(schema_madlib, training_table_name, - cat_features, con_features, n_bins, + cat_features, ordered_cat_features, + con_features, n_bins, dep_var_str, boolean_cats, grouping_cols, grouping_array_str, n_rows, @@ -451,10 +457,14 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # cp values if cross-validation is required (cp_list = [] if not) for tree in tree_states: if 'cp' in tree: - pruned_tree = _prune_and_cplist(schema_madlib, tree['tree_state'], - tree['cp'], compute_cp_list=compute_cp_list) + pruned_tree = _prune_and_cplist(schema_madlib, tree, + tree['cp'], + compute_cp_list=compute_cp_list) tree['tree_state'] = pruned_tree['tree_state'] - tree['pruned_depth'] = pruned_tree['pruned_depth'] + if 'pruned_depth' in pruned_tree: + tree['pruned_depth'] = pruned_tree['pruned_depth'] + else: + tree['pruned_depth'] = pruned_tree['tree_depth'] if 'cp_list' in pruned_tree: tree['cp_list'] = pruned_tree['cp_list'] @@ -474,7 +484,7 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None): all_cols_types = dict(get_cols_and_types(table_name)) grouping_cols_list = [col.strip() for col in grouping_cols.split(',')] - grouping_cols_and_types = [(col, _get_col_value(all_cols_types, col)) + grouping_cols_and_types = [(col, _dict_get_quoted(all_cols_types, col)) for col in grouping_cols_list] grouping_array_str = 'array_to_string(array[' + \ ','.join("(case when " + col + " then 'True' else 'False' end)::text" @@ -489,7 +499,8 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None): def _build_tree(schema_madlib, is_classification, split_criterion, training_table_name, output_table_name, id_col_name, dependent_variable, dep_is_bool, - cat_features, boolean_cats, con_features, grouping_cols, + cat_features, ordered_cat_features, + boolean_cats, con_features, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, cp_table, max_n_surr=0, msg_level="warning", k=0, **kwargs): @@ -556,13 +567,13 @@ def tree_train(schema_madlib, training_table_name, output_table_name, """ msg_level = "notice" if verbose_mode else "warning" - #### Set default values for optional arguments + # Set default values for optional arguments min_split = 20 if (min_split is None and min_bucket is None) else min_split min_bucket = min_split // 3 if min_bucket is None else min_bucket min_split = min_bucket * 3 if min_split is None else min_split n_bins = 100 if n_bins is None else n_bins split_criterion = 'gini' if not split_criterion else split_criterion - plpy.notice("split_criterion:"+split_criterion) + plpy.notice("split_criterion:" + split_criterion) pruning_param_dict = _extract_pruning_params(pruning_params) cp = pruning_param_dict['cp'] n_folds = pruning_param_dict['n_folds'] @@ -591,8 +602,8 @@ def tree_train(schema_madlib, training_table_name, output_table_name, # 2) all_cols_types = dict(get_cols_and_types(training_table_name)) - cat_features, con_features, boolean_cats = _classify_features( - all_cols_types, features) + cat_features, ordered_cat_features, con_features, boolean_cats = \ + _classify_features(all_cols_types, features) # get all rows n_all_rows = plpy.execute("SELECT count(*) FROM {source_table}". 
format(source_table=training_table_name) @@ -630,7 +641,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name, if not grouping_cols: _create_result_table(schema_madlib, tree_states[0], bins['cat_origin'], bins['cat_n'], cat_features, - con_features, output_table_name, use_existing_tables, running_cv, k) + con_features, output_table_name, + use_existing_tables, running_cv, k) else: _create_grp_result_table( schema_madlib, tree_states, bins, cat_features, @@ -687,7 +699,8 @@ def _is_dep_categorical(training_table_name, dependent_variable): # ------------------------------------------------------------ -def _get_bins(schema_madlib, training_table_name, cat_features, +def _get_bins(schema_madlib, training_table_name, + cat_features, ordered_cat_features, con_features, n_bins, dependent_variable, boolean_cats, n_rows, is_classification, dep_n_levels, filter_null): """ Compute the bins of all features @@ -757,39 +770,37 @@ def _get_bins(schema_madlib, training_table_name, cat_features, else: con_splits = {'con_splits': ''} # no continuous features present - # For categorical variables, different from the continuous - # variable case, we scan the whole table to extract all the + # For categorical variables, scan the whole table to extract all the # levels of the categorical variables, and at the same time # sort the levels according to the entropy of the dependent # variable. # So this aggregate returns a composite type with two columns: # col 1 is the array of ordered levels; col 2 is the number of - # levels in col1. + # levels in col 1. # TODO When n_bins is larger than 2^k - 1, where k is the number # of levels of a given categrical feature, we can actually compute # all combinations of levels and obtain a complete set of splits - # instead of using sorting to get an approximate set of splits. This - # can also be done in the following aggregate, but we may not need it - # in the initial draft. Implement this optimization only if it is - # necessary. - + # instead of using sorting to get an approximate set of splits. + # # We will use integer to represent levels of categorical variables. # So before everything, we need to create a mapping from categorical # variable levels to integers, and keep this mapping in the memory. if len(cat_features) > 0: if is_classification: # For classifications - order_fun = ("{schema_madlib}._dst_compute_entropy(" - "{dependent_variable}, {n})". - format(schema_madlib=schema_madlib, - dependent_variable=dependent_variable, + order_fun = ("{madlib}._dst_compute_entropy({dep}, {n})". + format(madlib=schema_madlib, + dep=dependent_variable, n=dep_n_levels)) else: # For regressions - order_fun = \ - "AVG({dependent_variable})".format(dependent_variable=dependent_variable) + order_fun = "AVG({0})".format(dependent_variable) + # Note that 'sql_cat_levels' goes through two levels of formatting + # Try to obtain all the levels in one scan of the table. + # () are needed when casting the categorical variables because + # they can be expressions. 
sql_cat_levels = """ SELECT '{{col_name}}'::text AS colname, @@ -801,7 +812,7 @@ def _get_bins(schema_madlib, training_table_name, cat_features, FROM ( SELECT ({{col}})::text AS levels, - {order_fun} AS dep_avg + {{order_fun}} AS dep_avg FROM {training_table_name} WHERE {filter_null} AND {{col}} is not NULL @@ -810,22 +821,23 @@ def _get_bins(schema_madlib, training_table_name, cat_features, ) s1 WHERE array_upper(levels, 1) > 1 """.format(training_table_name=training_table_name, - order_fun=order_fun, filter_null=filter_null) + filter_null=filter_null) - # Try to obtain all the levels in one scan of the table. - # () are needed when casting the categorical variables because - # they can be expressions. - sql_all_cats = ' UNION ALL '.join( - sql_cat_levels.format(col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" - if col in boolean_cats else col, - col_name=col) for col in cat_features) + sql_all_cats = ' UNION '.join( + sql_cat_levels.format( + col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" + if col in boolean_cats else col, + col_name=col, + order_fun=col if col in ordered_cat_features else order_fun) + for col in cat_features) all_levels = plpy.execute(sql_all_cats) if len(all_levels) != len(cat_features): plpy.warning("Decision tree warning: Categorical columns with only " "one value are dropped from the tree model.") use_cat_features = [row['colname'] for row in all_levels] - cat_features = [feature for feature in cat_features if feature in use_cat_features] + cat_features = [feature for feature in cat_features + if feature in use_cat_features] col_to_row = dict((row['colname'], i) for i, row in enumerate(all_levels)) @@ -863,6 +875,8 @@ def _create_result_table(schema_madlib, tree_state, header = "insert into " + output_table_name + " " else: header = "create table " + output_table_name + " as " + depth = (tree_state['pruned_depth'] if 'pruned_depth' in tree_state + else tree_state['tree_depth']) if len(cat_features) > 0: sql = header + """ SELECT @@ -870,10 +884,11 @@ def _create_result_table(schema_madlib, tree_state, $1 as tree, $2 as cat_levels_in_text, $3 as cat_n_levels, - {tree_depth} as tree_depth + {depth} as tree_depth {fold} - """.format(tree_depth=tree_state['pruned_depth'], - cp=tree_state['cp'], fold=fold) + """.format(depth=depth, + cp=tree_state['cp'], + fold=fold) sql_plan = plpy.prepare(sql, ['{0}.bytea8'.format(schema_madlib), 'text[]', 'integer[]']) plpy.execute(sql_plan, [tree_state['tree_state'], cat_origin, cat_n]) @@ -884,10 +899,11 @@ def _create_result_table(schema_madlib, tree_state, $1 as tree, NULL::text[] as cat_levels_in_text, NULL::integer[] as cat_n_levels, - {tree_depth} as tree_depth + {depth} as tree_depth {fold} - """.format(tree_depth=tree_state['pruned_depth'], - cp=tree_state['cp'], fold=fold) + """.format(depth=depth, + cp=tree_state['cp'], + fold=fold) sql_plan = plpy.prepare(sql, ['{0}.bytea8'.format(schema_madlib)]) plpy.execute(sql_plan, [tree_state['tree_state']]) @@ -895,7 +911,7 @@ def _create_result_table(schema_madlib, tree_state, def _get_bins_grps( - schema_madlib, training_table_name, cat_features, + schema_madlib, training_table_name, cat_features, ordered_cat_features, con_features, n_bins, dependent_variable, boolean_cats, grouping_cols, grouping_array_str, n_rows, is_classification, dep_n_levels, filter_null): @@ -1012,7 +1028,7 @@ def _get_bins_grps( SELECT {grouping_array_str} as grp_key, ({{col}})::text as levels, - {order_fun} as dep_avg + {{order_fun}} as dep_avg FROM {training_table_name} WHERE 
{filter_null} AND {{col}} is not NULL @@ -1027,7 +1043,8 @@ def _get_bins_grps( sql_cat_levels.format( col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" if col in boolean_cats else col), - col_name=col) + col_name=col, + order_fun=col if col in ordered_cat_features else order_fun) for col in cat_features) all_levels = list(plpy.execute(sql_all_cats)) @@ -1454,7 +1471,8 @@ def _create_grp_result_table( plpy.execute(sql_plan, [ [t['grp_key'] for t in tree_states], [t['tree_state'] for t in tree_states], - [t['pruned_depth'] for t in tree_states], + [t['pruned_depth'] if 'pruned_depth' in t else t['tree_depth'] + for t in tree_states], [t['cp'] for t in tree_states], bins['grp_key_cat'], bins['cat_n'], @@ -1469,9 +1487,10 @@ def _create_grp_result_table( plpy.execute(sql_plan, [ [t['grp_key'] for t in tree_states], [t['tree_state'] for t in tree_states], - [t['pruned_depth'] for t in tree_states], + [t['pruned_depth'] if 'pruned_depth' in t else t['tree_depth'] + for t in tree_states], [t['cp'] for t in tree_states] - ]) + ]) # ------------------------------------------------------------ @@ -1511,7 +1530,7 @@ def _create_summary_table( "$dep_list$") else: dep_list_str = "NULL" - indep_type = ', '.join(_get_col_value(all_cols_types, col) + indep_type = ', '.join(_dict_get_quoted(all_cols_types, col) for col in cat_features + con_features) dep_type = _get_dep_type(training_table_name, dependent_variable) cat_features_str = ','.join(cat_features) @@ -1560,8 +1579,12 @@ def _create_summary_table( # ------------------------------------------------------------ -def _get_filter_str(schema_madlib, cat_features, con_features, boolean_cats, - dependent_variable, grouping_cols, max_n_surr=0): +def _get_filter_str(schema_madlib, cat_features, con_features, + boolean_cats, dependent_variable, + grouping_cols, max_n_surr=0): + """ Return a 'WHERE' clause string that filters out all rows that contain a + NULL. 
+ """ if grouping_cols: g_filter = ' and '.join('(' + s.strip() + ') is not NULL' for s in grouping_cols.split(',')) else: @@ -1592,7 +1615,7 @@ def _get_filter_str(schema_madlib, cat_features, con_features, boolean_cats, def _validate_predict(schema_madlib, model, source, output, use_existing_tables): - # validations for inputs + # validations for inputs _assert(source and source.strip().lower() not in ('null', ''), "Decision tree error: Invalid data table name: {0}".format(source)) _assert(table_exists(source), @@ -1961,13 +1984,13 @@ SELECT * FROM tree_predict_out; # ------------------------------------------------------------ -def _prune_and_cplist(schema_madlib, tree_state, cp, compute_cp_list=False): +def _prune_and_cplist(schema_madlib, tree, cp, compute_cp_list=False): """ Prune tree with given cost-complexity parameters and return a list of cp values at which tree can be pruned Args: @param schema_madlib: str, MADlib schema name - @param tree_state: schema_madlib.bytea8, tree to prune + @param tree: Tree data to prune @param cp: float, cost-complexity parameter, all splits that have a complexity lower than 'cp' will be pruned @param compute_cp_list: bool, optionally return a list of cp values that @@ -1982,20 +2005,22 @@ def _prune_and_cplist(schema_madlib, tree_state, cp, compute_cp_list=False): cp_list: list of cp values at which tree can be pruned (returned only if compute_cp_list=True) """ + if cp <= 0 and not compute_cp_list: + return tree sql = """ SELECT (pruned_tree).* FROM ( - SELECT {schema_madlib}._prune_and_cplist( + SELECT {madlib}._prune_and_cplist( $1, ({cp})::double precision, ({compute_cp_list})::boolean ) as pruned_tree ) q - """.format(schema_madlib=schema_madlib, cp=cp, + """.format(madlib=schema_madlib, cp=cp, compute_cp_list=bool(compute_cp_list)) - sql_plan = plpy.prepare(sql, ['{schema_madlib}.bytea8'.format(schema_madlib=schema_madlib)]) - pruned_tree = plpy.execute(sql_plan, [tree_state])[0] + sql_plan = plpy.prepare(sql, [schema_madlib + '.bytea8']) + pruned_tree = plpy.execute(sql_plan, [tree['tree_state']])[0] return pruned_tree # ------------------------------------------------------------------------- @@ -2003,7 +2028,7 @@ def _prune_and_cplist(schema_madlib, tree_state, cp, compute_cp_list=False): def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_name, id_col_name, dependent_variable, list_of_features, list_of_features_to_exclude, - cat_features, con_features, boolean_cats, + cat_features, ordered_cat_features, boolean_cats, con_features, split_criterion, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, is_classification, dep_is_bool, dep_n_levels, n_folds, n_rows, @@ -2068,24 +2093,26 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam plpy.execute(plan, [grp_list, cp_list]) # 2) call CV function to actually cross-validate _build_tree - # expect output table model_cv({grouping_cols), cp, avg, stddev) + # expects output table model_cv({grouping_cols), cp, avg, stddev) model_cv = output_table_name + "_cv" metric_function = "_tree_misclassified" if is_classification else "_tree_rmse" pred_name = '"estimated_{0}"'.format(dependent_variable.strip(' "')) grouping_str = 'NULL' if not grouping_cols else '"' + grouping_cols + '"' cat_feature_str = _array_to_string(cat_features) - con_feature_str = _array_to_string(con_features) + ordered_cat_feature_str = _array_to_string(ordered_cat_features) boolean_cat_str = _array_to_string(boolean_cats) + con_feature_str = 
_array_to_string(con_features) modeling_params = [str(i) for i in (is_classification, split_criterion, "%data%", "%model%", id_col_name, dependent_variable, dep_is_bool, - cat_feature_str, boolean_cat_str, con_feature_str, + cat_feature_str, ordered_cat_feature_str, + boolean_cat_str, con_feature_str, grouping_str, weights, max_depth, min_split, min_bucket, n_bins, "%explore%", max_n_surr, msg_level)] modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] + - ["VARCHAR[]"] * 3 + ["TEXT"] * 2 + ["INTEGER"] * 4 + + ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 + ["TEXT", "SMALLINT", "TEXT"]) cross_validation_grouping_w_params( @@ -2141,7 +2168,6 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam grp_key_to_best_cp = dict((row['grp_key'], row['cp']) for row in validation_result) - plpy.notice("Finished cross validation, final pruning ...") # 4) update tree_states to have the best cp cross-validated for tree in tree_states: best_cp = grp_key_to_best_cp[tree['grp_key']] @@ -2151,11 +2177,16 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam # giving the optimal pruned tree. # This time we don't need the cp_list. pruned_tree = _prune_and_cplist(schema_madlib, - tree['tree_state'], + tree, tree['cp'], compute_cp_list=False) tree['tree_state'] = pruned_tree['tree_state'] - tree['pruned_depth'] = pruned_tree['pruned_depth'] + if 'pruned_depth' in pruned_tree: + tree['pruned_depth'] = pruned_tree['pruned_depth'] + elif 'tree_depth' in pruned_tree: + tree['pruned_depth'] = pruned_tree['tree_depth'] + else: + tree['pruned_depth'] = 0 plpy.execute("DROP TABLE {group_to_param_list_table}".format(**locals())) # ------------------------------------------------------------ @@ -2184,9 +2215,9 @@ def _tree_train_using_bins( max_n_surr=max_n_surr))[0] plpy.notice("Starting tree building") tree_depth = -1 - while not tree_state['finished']: - tree_depth += 1 + while tree_state['finished'] == 0: # finished: 0 = running, 1 = finished training, 2 = terminated prematurely + tree_depth += 1 tree_state = _one_step( schema_madlib, training_table_name, cat_features, con_features, boolean_cats, bins, http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index b5ed4a2..97e8471 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -225,11 +225,11 @@ tree_train( double precision columns are considered continuous. The categorical variables are not encoded and used as is for the training. - There are no limitations on the number of levels in a categorical variable. - It is, however, important to note that we don't test for every combination of + It is important to note that we don't test for every combination of levels of a categorical variable when evaluating a split. We order the levels - of the variable by the entropy of the variable in predicting the response. The - split at each node is evaluated between these ordered levels. + of the non-integer categorical variable by the entropy of the variable in + predicting the response. The split at each node is evaluated between these + ordered levels. 
Integer categorical variables are ordered by their value. </DD> <DT>list_of_features_to_exclude</DT> @@ -337,7 +337,10 @@ tree_train( - Many of the parameters are designed to be similar to the popular R package 'rpart'. An important distinction between rpart and the MADlib function is that for both response and feature variables, MADlib considers integer values as -categorical values, while rpart considers them as continuous. +categorical values, while rpart considers them as continuous. To use integers as +continuous, please cast them to double precision. +- Integer values are ordered by value for computing the split boundaries. Please +cast to TEXT if the entropy-based ordering method is desired. - When using no surrogates (<em>max_surrogates</em>=0), all rows containing NULL values for any of the features used for training will be ignored from training and prediction. - When cross-validation is not used (<em>n_folds</em>=0), each tree output @@ -349,8 +352,9 @@ to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding to this optimal sub-tree is placed in the <em>output_table</em>, with the columns named as <em>tree</em> and <em>pruning_cp</em> respectively. - The main parameters that affect memory usage are: depth of tree, number -of features, and number of values per feature. If you are hitting VMEM limits, -consider reducing one or more of these parameters. +of features, number of values per categorical feature, and number of bins for +continuous features. If you are hitting VMEM limits, consider reducing one or +more of these parameters. @anchor predict @par Prediction Function @@ -986,6 +990,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree( dependent_variable TEXT, dep_is_bool BOOLEAN, cat_features VARCHAR[], + ordered_cat_features VARCHAR[], boolean_cats VARCHAR[], con_features VARCHAR[], grouping_cols TEXT, http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 0eb5985..930d916 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -34,7 +34,7 @@ from decision_tree import _is_dep_categorical from decision_tree import _get_n_and_deplist from decision_tree import _classify_features from decision_tree import _get_filter_str -from decision_tree import _get_col_value +from decision_tree import _dict_get_quoted from decision_tree import _get_display_header from decision_tree import get_feature_str # ------------------------------------------------------------ @@ -329,8 +329,8 @@ def forest_train( "is more than the actual number of features.") all_cols_types = dict(get_cols_and_types(training_table_name)) - cat_features, con_features, boolean_cats = _classify_features( - all_cols_types, features) + cat_features, ordered_cat_features, con_features, boolean_cats = \ + _classify_features(all_cols_types, features) filter_null = _get_filter_str(schema_madlib, cat_features, con_features, boolean_cats, @@ -382,7 +382,8 @@ def forest_train( # bins, and continuous bins num_groups = 1 bins = _get_bins(schema_madlib, training_table_name, - cat_features, con_features, num_bins, dep, + cat_features, ordered_cat_features, + con_features, num_bins, dep, 
boolean_cats, n_rows, is_classification, dep_n_levels, filter_null) # some features may be dropped because they have only one value @@ -390,13 +391,14 @@ def forest_train( bins['grp_key_cat'] = [''] else: grouping_cols_list = [col.strip() for col in grouping_cols.split(',')] - grouping_cols_and_types = [(col, _get_col_value(all_cols_types, col)) + grouping_cols_and_types = [(col, _dict_get_quoted(all_cols_types, col)) for col in grouping_cols_list] - grouping_array_str = "array_to_string(array[" + \ - ','.join("(case when " + col + " then 'True' else 'False' end)::text" + grouping_array_str = ( + "array_to_string(array[" + + ','.join("(case when " + col + " then 'True' else 'False' end)::text" if col_type == 'boolean' else '(' + col + ')::text' - for col, col_type in grouping_cols_and_types) + \ - "], ',')" + for col, col_type in grouping_cols_and_types) + + "], ',')") grouping_cols_str = ('' if grouping_cols is None else grouping_cols + ",") sql_grp_key_to_grp_cols = """ @@ -417,7 +419,8 @@ def forest_train( """.format(**locals()))[0]['count'] plpy.notice("Analyzing data to compute split boundaries for variables") bins = _get_bins_grps(schema_madlib, training_table_name, - cat_features, con_features, num_bins, dep, + cat_features, ordered_cat_features, + con_features, num_bins, dep, boolean_cats, grouping_cols, grouping_array_str, n_rows, is_classification, dep_n_levels, filter_null) @@ -1198,7 +1201,7 @@ def _create_summary_table(**kwargs): else: kwargs['dep_list_str'] = "NULL" - kwargs['indep_type'] = ', '.join(_get_col_value(kwargs['all_cols_types'], col) + kwargs['indep_type'] = ', '.join(_dict_get_quoted(kwargs['all_cols_types'], col) for col in kwargs['cat_features'] + kwargs['con_features']) kwargs['dep_type'] = _get_dep_type(kwargs['training_table_name'], kwargs['dependent_variable']) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in index 8f9168c..1863b64 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in @@ -370,6 +370,7 @@ select __build_tree( FALSE, ARRAY['"OUTLOOK"']::text[], '{}', + '{}', '{humidity}', 'class', '1', http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/validation/cross_validation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/validation/cross_validation.py_in b/src/ports/postgres/modules/validation/cross_validation.py_in index 7b39c90..f157ffa 100644 --- a/src/ports/postgres/modules/validation/cross_validation.py_in +++ b/src/ports/postgres/modules/validation/cross_validation.py_in @@ -402,7 +402,6 @@ def cross_validation_grouping_w_params( with MinWarning("warning"): if not data_cols: data_cols = get_cols(data_tbl, schema_madlib) - n_rows = _validate_cv_args(**locals()) explore_type_str = "::INTEGER"
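
Usage sketch for the behavior changes above. All table and column names here (patients, id, recovered, grade, age) are hypothetical, and the tree_train overloads with the pruning_params argument are assumed from the module documentation:

    -- 'grade' is an integer-coded categorical: its levels are now ordered
    -- by value when computing split boundaries.
    DROP TABLE IF EXISTS tree_int, tree_int_summary;
    SELECT madlib.tree_train('patients', 'tree_int', 'id', 'recovered',
                             'grade, age', NULL, 'gini', NULL, NULL,
                             5,        -- max_depth
                             20, 6,    -- min_split, min_bucket
                             100,      -- number of bins
                             'cp=0');  -- cp = 0 (the default): pruning is skipped

    -- Cast the integer column to TEXT to recover the entropy-based ordering:
    DROP TABLE IF EXISTS tree_txt, tree_txt_summary;
    SELECT madlib.tree_train('patients', 'tree_txt', 'id', 'recovered',
                             'grade::TEXT, age', NULL, 'gini', NULL, NULL,
                             5, 20, 6, 100,
                             'cp=0.005');  -- cp > 0: cost-complexity pruning runs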