Repository: incubator-madlib
Updated Branches:
  refs/heads/master 975d34e43 -> c82b9d0ad
Decision Tree: Multiple fixes - pruning, tree_depth, viz

This commit includes the following changes:
- Pruning is not performed when cp = 0 (the default behavior); a usage sketch appears at the end of this message.
- An off-by-one bug is fixed: the user input max_depth starts from 0, while the internal tree_depth starts from 1. This discrepancy was not taken into account when checking tree-train termination, leading to trees containing only two leaf nodes on the last level (see the note following the DT_impl.hpp diff below).
- Integer categorical variables are treated as ordered and hence are not re-ordered. If the original entropy-based ordering is desired, the integer column needs to be cast to TEXT; the sketch at the end of this message shows the cast.
- Visualization is improved: nodes that split on a categorical feature now show only the last value in the split instead of the complete list. This is consistent with the visualization in scikit-learn.

Closes #111

Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/c82b9d0a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/c82b9d0a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/c82b9d0a

Branch: refs/heads/master
Commit: c82b9d0ad844a03ef5f16111fa73e441a03850d5
Parents: 975d34e
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Apr 14 17:21:19 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Apr 14 17:23:30 2017 -0700

----------------------------------------------------------------------
 src/modules/recursive_partitioning/DT_impl.hpp  |  88 ++++++--
 .../recursive_partitioning/decision_tree.cpp    |   4 +-
 .../recursive_partitioning/feature_encoding.cpp |   6 +-
 .../recursive_partitioning/decision_tree.py_in  | 203 +++++++++++--------
 .../recursive_partitioning/decision_tree.sql_in |  19 +-
 .../recursive_partitioning/random_forest.py_in  |  25 ++-
 .../test/decision_tree.sql_in                   |   1 +
 .../modules/validation/cross_validation.py_in   |   1 -
 8 files changed, 217 insertions(+), 130 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/modules/recursive_partitioning/DT_impl.hpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp index f622f94..64d2b88 100644 --- a/src/modules/recursive_partitioning/DT_impl.hpp +++ b/src/modules/recursive_partitioning/DT_impl.hpp @@ -574,7 +574,6 @@ DecisionTree<Container>::expand(const Accumulator &state, feature_indices(i) = FINISHED_LEAF; } } - return training_finished; } // ------------------------------------------------------------------------- @@ -1025,11 +1024,10 @@ DecisionTree<Container>::shouldSplit(const ColumnVector &combined_stats, uint64_t thresh_min_bucket = (min_bucket == 0) ?
1u : min_bucket; uint64_t true_count = statCount(combined_stats.segment(0, stats_per_split)); uint64_t false_count = statCount(combined_stats.segment(stats_per_split, stats_per_split)); - return ((true_count + false_count) >= min_split && true_count >= thresh_min_bucket && false_count >= thresh_min_bucket && - tree_depth <= max_depth); + tree_depth <= max_depth + 1); } // ------------------------------------------------------------------------ @@ -1107,11 +1105,32 @@ DecisionTree<Container>::displayLeafNode( display_str << "\"" << id_prefix << id << "\" [label=\"" << predict_str.str(); if(verbose){ - display_str << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = "; + display_str << "\\n impurity = "<< impurity(predictions.row(id)) + << "\\n samples = " << statCount(predictions.row(id)) + << "\\n value = "; if (is_regression) display_str << statPredict(predictions.row(id)); else{ - display_str << "[" << predictions.row(id).head(n_y_labels)<< "]"; + display_str << "["; + // NUM_PER_LINE: inserting new lines at fixed intervals + // avoids a really long 'value' line + const uint16_t NUM_PER_LINE = 10; + // note: last element of predictions is 'statCount' and + // can be ignored + const Index pred_size = predictions.row(id).size() - 1; + for (Index i = 0; i < pred_size; i += NUM_PER_LINE){ + uint16_t n_elem; + if (i + NUM_PER_LINE <= pred_size) { + // not overflowing the vector + n_elem = NUM_PER_LINE; + } else { + // less than NUM_PER_LINE left, avoid reading past the end + n_elem = pred_size - i; + } + display_str << predictions.row(id).segment(i, n_elem) << "\n"; + } + display_str << "]"; + } } display_str << "\",shape=box]" << ";"; @@ -1143,23 +1162,46 @@ DecisionTree<Container>::displayInternalNode( label_str << escape_quotes(feature_name) << " <= " << feature_thresholds(id); } else { feature_name = get_text(cat_features_str, feature_indices(id)); - label_str << escape_quotes(feature_name) << " in " - << getCatLabels(feature_indices(id), - static_cast<Index>(0), - static_cast<Index>(feature_thresholds(id)), - cat_levels_text, cat_n_levels); + label_str << escape_quotes(feature_name) << " <= "; + + // Text for all categoricals are stored in a flat array (cat_levels_text); + // find the appropriate index for this node + size_t to_skip = 0; + for (Index i=0; i < feature_indices(id); i++) + to_skip += cat_n_levels[i]; + const size_t index = to_skip + feature_thresholds(id); + label_str << get_text(cat_levels_text, index); } std::stringstream display_str; display_str << "\"" << id_prefix << id << "\" [label=\"" << label_str.str(); if(verbose){ + display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)); - display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = "; + display_str << "\\n value = "; if (is_regression) display_str << statPredict(predictions.row(id)); else{ - display_str << "[" << predictions.row(id).head(n_y_labels)<< "]"; + display_str << "["; + // NUM_PER_LINE: inserting new lines at fixed interval + // avoids really long 'value' line + const uint16_t NUM_PER_LINE = 10; + // note: last element of predictions is just 'statCount' and needs to + // be ignored + const Index pred_size = predictions.row(id).size() - 1; + for (Index i = 0; i < pred_size; i += NUM_PER_LINE){ + uint16_t n_elem; + if (i + NUM_PER_LINE <= pred_size) { + // not overflowing the vector + n_elem = NUM_PER_LINE; + } else { + n_elem = pred_size - i; + } + 
display_str << predictions.row(id).segment(i, n_elem) << "\n"; + } + display_str << "]"; } + std::stringstream predict_str; if (static_cast<bool>(is_regression)){ predict_str << predict_response(id); @@ -1360,20 +1402,24 @@ DecisionTree<Container>::getCatLabels(Index cat_index, Index end_value, ArrayHandle<text*> &cat_levels_text, ArrayHandle<int> &cat_n_levels) { + Index MAX_LABELS = 5; size_t to_skip = 0; for (Index i=0; i < cat_index; i++) { to_skip += cat_n_levels[i]; } std::stringstream cat_levels; - size_t start_index; + size_t index; cat_levels << "{"; - for (start_index = to_skip + start_value; - start_index < to_skip + end_value && - start_index < cat_levels_text.size(); - start_index++) { - cat_levels << get_text(cat_levels_text, start_index) << ","; + for (index = to_skip + start_value; + index < to_skip + end_value && index < cat_levels_text.size(); + index++) { + cat_levels << get_text(cat_levels_text, index) << ","; + if (index > to_skip + start_value + MAX_LABELS){ + cat_levels << " ... "; + break; + } } - cat_levels << get_text(cat_levels_text, start_index) << "}"; + cat_levels << get_text(cat_levels_text, index) << "}"; return cat_levels.str(); } // ------------------------------------------------------------------------- @@ -1575,7 +1621,7 @@ TreeAccumulator<Container, DTree>::operator<<(const tuple_type& inTuple) { uint16_t n_non_leaf_nodes = static_cast<uint16_t>(n_leaf_nodes - 1); Index dt_search_index = dt.search(cat_features, con_features); if (dt.feature_indices(dt_search_index) != dt.FINISHED_LEAF && - dt.feature_indices(dt_search_index) != dt.NODE_NON_EXISTING) { + dt.feature_indices(dt_search_index) != dt.NODE_NON_EXISTING) { Index row_index = dt_search_index - n_non_leaf_nodes; assert(row_index >= 0); // add this row into the stats for the node @@ -1651,7 +1697,7 @@ TreeAccumulator<Container, DTree>::operator<<(const surr_tuple_type& inTuple) { double primary_val = is_primary_cat ? cat_features(primary_index) : con_features(primary_index); - // We only capture statistics for rows that: + // Only capture statistics for rows that: // 1. lead to leaf nodes in the last layer. Surrogates for other nodes // have already been trained. // 2. have non-null values for the primary split. 
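Note on the max_depth fix in the shouldSplit() hunk above: the user-facing max_depth counts levels below the root (root = depth 0), while the internal tree_depth starts at 1, hence the relaxed bound "tree_depth <= max_depth + 1". A minimal sketch of the user-visible effect through the SQL interface, reusing the dt_golf example table from the module tests; the exact tree_train overload with trailing optional arguments omitted is an assumption based on the module documentation, not part of this commit:

    -- Sketch: with the corrected bound, the trained tree can now contain
    -- splits at the last allowed level instead of terminating one early.
    DROP TABLE IF EXISTS train_output, train_output_summary;
    SELECT madlib.tree_train('dt_golf',               -- source table
                             'train_output',          -- output model table
                             'id',                    -- row id column
                             'class',                 -- response
                             '"OUTLOOK", humidity',   -- features
                             NULL,                    -- features to exclude
                             'gini',                  -- split criterion
                             NULL,                    -- no grouping
                             NULL,                    -- no weights
                             2);                      -- max_depth (root = 0)
    -- tree_depth in the model table now reflects the full trained depth:
    SELECT tree_depth FROM train_output;
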
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/modules/recursive_partitioning/decision_tree.cpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp index 7a8ec95..b298df8 100644 --- a/src/modules/recursive_partitioning/decision_tree.cpp +++ b/src/modules/recursive_partitioning/decision_tree.cpp @@ -237,7 +237,9 @@ dt_apply::run(AnyType & args){ } AnyType output_tuple; - output_tuple << dt.storage() << return_code << static_cast<uint16_t>(dt.tree_depth - 1); + output_tuple << dt.storage() + << return_code + << static_cast<uint16_t>(dt.tree_depth - 1); return output_tuple; } // apply function // ------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/modules/recursive_partitioning/feature_encoding.cpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/feature_encoding.cpp b/src/modules/recursive_partitioning/feature_encoding.cpp index 0b868df..20856e2 100644 --- a/src/modules/recursive_partitioning/feature_encoding.cpp +++ b/src/modules/recursive_partitioning/feature_encoding.cpp @@ -156,11 +156,11 @@ p_log2_p(const double &p) { AnyType dst_compute_entropy_final::run(AnyType &args){ MappedIntegerVector state = args[0].getAs<MappedIntegerVector>(); - double sum = static_cast<double>(state.sum()); - ColumnVector probilities = state.cast<double>() / sum; + double sum_of_dep_counts = static_cast<double>(state.sum()); + ColumnVector probs = state.cast<double>() / sum_of_dep_counts; // usage of unaryExpr with functor: // http://eigen.tuxfamily.org/dox/classEigen_1_1MatrixBase.html#a23fc4bf97168dee2516f85edcfd4cfe7 - return -(probilities.unaryExpr(std::ptr_fun(p_log2_p))).sum(); + return -(probs.unaryExpr(std::ptr_fun(p_log2_p))).sum(); } // ------------------------------------------------------------ http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 40f4b7e..fb18278 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -27,7 +27,6 @@ from utilities.validate_args import table_is_empty from utilities.validate_args import columns_exist_in_table from utilities.validate_args import is_var_valid from utilities.validate_args import unquote_ident -from utilities.validate_args import quote_ident from utilities.utilities import _assert from utilities.utilities import extract_keyvalue_params from utilities.utilities import unique_string @@ -137,14 +136,14 @@ def _get_features_to_use(schema_madlib, training_table_name, # ------------------------------------------------------------ -def _get_col_value(input_dict, col_name): +def _dict_get_quoted(input_dict, col_name): """Return value from dict where key could be quoted or unquoted name""" return input_dict.get( col_name, input_dict.get(unquote_ident(col_name))) # ------------------------------------------------------------------------- -def _classify_features(all_feature_names_types, features): +def 
_classify_features(feature_to_type, features): """ Returns 1) an array of categorical features (all casted to string) 2) an array of continuous features @@ -155,24 +154,28 @@ def _classify_features(all_feature_names_types, features): text_types = ['text', 'varchar', 'character varying', 'char', 'character'] boolean_types = ['boolean'] cat_types = int_types + text_types + boolean_types + ordered_cat_types = int_types + + cat_features = [c for c in features + if _dict_get_quoted(feature_to_type, c) in cat_types] + ordered_cat_features = [c for c in features if _dict_get_quoted( + feature_to_type, c) in ordered_cat_types] - cat_features = [col for col in features - if _get_col_value(all_feature_names_types, col) in cat_types] cat_features_set = set(cat_features) # continuous types - 'real' is cast to 'double precision' for uniformity con_types = ['real', 'float8', 'double precision'] - con_features = [col for col in features - if (not col in cat_features_set and - _get_col_value(all_feature_names_types, col) in con_types)] + con_features = [c for c in features + if (c not in cat_features_set and + _dict_get_quoted(feature_to_type, c) in con_types)] # In order to be able to form an array, all categorical variables - # will be casted into TEXT type, but GPDB cannot cast a boolean - # directly into a text. Thus, boolean type categorical variables - # needs special treatment: cast them into integers before casting + # will be cast into TEXT type, but GPDB cannot cast a boolean + # directly into a text. Thus, boolean categorical variables + # need special treatment: cast them into integers before casting # into text. - boolean_cats = [col for col in features - if _get_col_value(all_feature_names_types, col) in boolean_types] - return cat_features, con_features, boolean_cats + boolean_cats = [c for c in features + if _dict_get_quoted(feature_to_type, c) in boolean_types] + return cat_features, ordered_cat_features, con_features, boolean_cats # ------------------------------------------------------------ @@ -357,8 +360,9 @@ def _extract_pruning_params(pruning_params_str): def _get_tree_states(schema_madlib, is_classification, split_criterion, training_table_name, output_table_name, id_col_name, dependent_variable, dep_is_bool, - grouping_cols, cat_features, con_features, - n_bins, boolean_cats, min_split, min_bucket, weights, + grouping_cols, cat_features, ordered_cat_features, + con_features, n_bins, boolean_cats, + min_split, min_bucket, weights, max_depth, grp_key_to_cp, compute_cp_list=False, max_n_surr=0, **kwargs): """ @@ -407,8 +411,9 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # 3) Find the splitting bins, one dict containing two arrays: # categorical bins and continuous bins bins = _get_bins(schema_madlib, training_table_name, cat_features, - con_features, n_bins, dep_var_str, boolean_cats, n_rows, - is_classification, dep_n_levels, filter_null) + ordered_cat_features, con_features, n_bins, + dep_var_str, boolean_cats, + n_rows, is_classification, dep_n_levels, filter_null) # some features may be dropped if they have only one value cat_features = bins['cat_features'] @@ -429,7 +434,8 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # result in excessive memory usage. 
plpy.notice("Analyzing data to compute split boundaries for variables") bins = _get_bins_grps(schema_madlib, training_table_name, - cat_features, con_features, n_bins, + cat_features, ordered_cat_features, + con_features, n_bins, dep_var_str, boolean_cats, grouping_cols, grouping_array_str, n_rows, @@ -451,10 +457,14 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, # cp values if cross-validation is required (cp_list = [] if not) for tree in tree_states: if 'cp' in tree: - pruned_tree = _prune_and_cplist(schema_madlib, tree['tree_state'], - tree['cp'], compute_cp_list=compute_cp_list) + pruned_tree = _prune_and_cplist(schema_madlib, tree, + tree['cp'], + compute_cp_list=compute_cp_list) tree['tree_state'] = pruned_tree['tree_state'] - tree['pruned_depth'] = pruned_tree['pruned_depth'] + if 'pruned_depth' in pruned_tree: + tree['pruned_depth'] = pruned_tree['pruned_depth'] + else: + tree['pruned_depth'] = pruned_tree['tree_depth'] if 'cp_list' in pruned_tree: tree['cp_list'] = pruned_tree['cp_list'] @@ -474,7 +484,7 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None): all_cols_types = dict(get_cols_and_types(table_name)) grouping_cols_list = [col.strip() for col in grouping_cols.split(',')] - grouping_cols_and_types = [(col, _get_col_value(all_cols_types, col)) + grouping_cols_and_types = [(col, _dict_get_quoted(all_cols_types, col)) for col in grouping_cols_list] grouping_array_str = 'array_to_string(array[' + \ ','.join("(case when " + col + " then 'True' else 'False' end)::text" @@ -489,7 +499,8 @@ def get_grouping_array_str(table_name, grouping_cols, qualifier=None): def _build_tree(schema_madlib, is_classification, split_criterion, training_table_name, output_table_name, id_col_name, dependent_variable, dep_is_bool, - cat_features, boolean_cats, con_features, grouping_cols, + cat_features, ordered_cat_features, + boolean_cats, con_features, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, cp_table, max_n_surr=0, msg_level="warning", k=0, **kwargs): @@ -556,13 +567,13 @@ def tree_train(schema_madlib, training_table_name, output_table_name, """ msg_level = "notice" if verbose_mode else "warning" - #### Set default values for optional arguments + # Set default values for optional arguments min_split = 20 if (min_split is None and min_bucket is None) else min_split min_bucket = min_split // 3 if min_bucket is None else min_bucket min_split = min_bucket * 3 if min_split is None else min_split n_bins = 100 if n_bins is None else n_bins split_criterion = 'gini' if not split_criterion else split_criterion - plpy.notice("split_criterion:"+split_criterion) + plpy.notice("split_criterion:" + split_criterion) pruning_param_dict = _extract_pruning_params(pruning_params) cp = pruning_param_dict['cp'] n_folds = pruning_param_dict['n_folds'] @@ -591,8 +602,8 @@ def tree_train(schema_madlib, training_table_name, output_table_name, # 2) all_cols_types = dict(get_cols_and_types(training_table_name)) - cat_features, con_features, boolean_cats = _classify_features( - all_cols_types, features) + cat_features, ordered_cat_features, con_features, boolean_cats = \ + _classify_features(all_cols_types, features) # get all rows n_all_rows = plpy.execute("SELECT count(*) FROM {source_table}". 
format(source_table=training_table_name) @@ -630,7 +641,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name, if not grouping_cols: _create_result_table(schema_madlib, tree_states[0], bins['cat_origin'], bins['cat_n'], cat_features, - con_features, output_table_name, use_existing_tables, running_cv, k) + con_features, output_table_name, + use_existing_tables, running_cv, k) else: _create_grp_result_table( schema_madlib, tree_states, bins, cat_features, @@ -687,7 +699,8 @@ def _is_dep_categorical(training_table_name, dependent_variable): # ------------------------------------------------------------ -def _get_bins(schema_madlib, training_table_name, cat_features, +def _get_bins(schema_madlib, training_table_name, + cat_features, ordered_cat_features, con_features, n_bins, dependent_variable, boolean_cats, n_rows, is_classification, dep_n_levels, filter_null): """ Compute the bins of all features @@ -757,39 +770,37 @@ def _get_bins(schema_madlib, training_table_name, cat_features, else: con_splits = {'con_splits': ''} # no continuous features present - # For categorical variables, different from the continuous - # variable case, we scan the whole table to extract all the + # For categorical variables, scan the whole table to extract all the # levels of the categorical variables, and at the same time # sort the levels according to the entropy of the dependent # variable. # So this aggregate returns a composite type with two columns: # col 1 is the array of ordered levels; col 2 is the number of - # levels in col1. + # levels in col 1. # TODO When n_bins is larger than 2^k - 1, where k is the number # of levels of a given categrical feature, we can actually compute # all combinations of levels and obtain a complete set of splits - # instead of using sorting to get an approximate set of splits. This - # can also be done in the following aggregate, but we may not need it - # in the initial draft. Implement this optimization only if it is - # necessary. - + # instead of using sorting to get an approximate set of splits. + # # We will use integer to represent levels of categorical variables. # So before everything, we need to create a mapping from categorical # variable levels to integers, and keep this mapping in the memory. if len(cat_features) > 0: if is_classification: # For classifications - order_fun = ("{schema_madlib}._dst_compute_entropy(" - "{dependent_variable}, {n})". - format(schema_madlib=schema_madlib, - dependent_variable=dependent_variable, + order_fun = ("{madlib}._dst_compute_entropy({dep}, {n})". + format(madlib=schema_madlib, + dep=dependent_variable, n=dep_n_levels)) else: # For regressions - order_fun = \ - "AVG({dependent_variable})".format(dependent_variable=dependent_variable) + order_fun = "AVG({0})".format(dependent_variable) + # Note that 'sql_cat_levels' goes through two levels of formatting + # Try to obtain all the levels in one scan of the table. + # () are needed when casting the categorical variables because + # they can be expressions. 
sql_cat_levels = """ SELECT '{{col_name}}'::text AS colname, @@ -801,7 +812,7 @@ def _get_bins(schema_madlib, training_table_name, cat_features, FROM ( SELECT ({{col}})::text AS levels, - {order_fun} AS dep_avg + {{order_fun}} AS dep_avg FROM {training_table_name} WHERE {filter_null} AND {{col}} is not NULL @@ -810,22 +821,23 @@ def _get_bins(schema_madlib, training_table_name, cat_features, ) s1 WHERE array_upper(levels, 1) > 1 """.format(training_table_name=training_table_name, - order_fun=order_fun, filter_null=filter_null) + filter_null=filter_null) - # Try to obtain all the levels in one scan of the table. - # () are needed when casting the categorical variables because - # they can be expressions. - sql_all_cats = ' UNION ALL '.join( - sql_cat_levels.format(col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" - if col in boolean_cats else col, - col_name=col) for col in cat_features) + sql_all_cats = ' UNION '.join( + sql_cat_levels.format( + col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" + if col in boolean_cats else col, + col_name=col, + order_fun=col if col in ordered_cat_features else order_fun) + for col in cat_features) all_levels = plpy.execute(sql_all_cats) if len(all_levels) != len(cat_features): plpy.warning("Decision tree warning: Categorical columns with only " "one value are dropped from the tree model.") use_cat_features = [row['colname'] for row in all_levels] - cat_features = [feature for feature in cat_features if feature in use_cat_features] + cat_features = [feature for feature in cat_features + if feature in use_cat_features] col_to_row = dict((row['colname'], i) for i, row in enumerate(all_levels)) @@ -863,6 +875,8 @@ def _create_result_table(schema_madlib, tree_state, header = "insert into " + output_table_name + " " else: header = "create table " + output_table_name + " as " + depth = (tree_state['pruned_depth'] if 'pruned_depth' in tree_state + else tree_state['tree_depth']) if len(cat_features) > 0: sql = header + """ SELECT @@ -870,10 +884,11 @@ def _create_result_table(schema_madlib, tree_state, $1 as tree, $2 as cat_levels_in_text, $3 as cat_n_levels, - {tree_depth} as tree_depth + {depth} as tree_depth {fold} - """.format(tree_depth=tree_state['pruned_depth'], - cp=tree_state['cp'], fold=fold) + """.format(depth=depth, + cp=tree_state['cp'], + fold=fold) sql_plan = plpy.prepare(sql, ['{0}.bytea8'.format(schema_madlib), 'text[]', 'integer[]']) plpy.execute(sql_plan, [tree_state['tree_state'], cat_origin, cat_n]) @@ -884,10 +899,11 @@ def _create_result_table(schema_madlib, tree_state, $1 as tree, NULL::text[] as cat_levels_in_text, NULL::integer[] as cat_n_levels, - {tree_depth} as tree_depth + {depth} as tree_depth {fold} - """.format(tree_depth=tree_state['pruned_depth'], - cp=tree_state['cp'], fold=fold) + """.format(depth=depth, + cp=tree_state['cp'], + fold=fold) sql_plan = plpy.prepare(sql, ['{0}.bytea8'.format(schema_madlib)]) plpy.execute(sql_plan, [tree_state['tree_state']]) @@ -895,7 +911,7 @@ def _create_result_table(schema_madlib, tree_state, def _get_bins_grps( - schema_madlib, training_table_name, cat_features, + schema_madlib, training_table_name, cat_features, ordered_cat_features, con_features, n_bins, dependent_variable, boolean_cats, grouping_cols, grouping_array_str, n_rows, is_classification, dep_n_levels, filter_null): @@ -1012,7 +1028,7 @@ def _get_bins_grps( SELECT {grouping_array_str} as grp_key, ({{col}})::text as levels, - {order_fun} as dep_avg + {{order_fun}} as dep_avg FROM {training_table_name} WHERE 
{filter_null} AND {{col}} is not NULL @@ -1027,7 +1043,8 @@ def _get_bins_grps( sql_cat_levels.format( col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" if col in boolean_cats else col), - col_name=col) + col_name=col, + order_fun=col if col in ordered_cat_features else order_fun) for col in cat_features) all_levels = list(plpy.execute(sql_all_cats)) @@ -1454,7 +1471,8 @@ def _create_grp_result_table( plpy.execute(sql_plan, [ [t['grp_key'] for t in tree_states], [t['tree_state'] for t in tree_states], - [t['pruned_depth'] for t in tree_states], + [t['pruned_depth'] if 'pruned_depth' in t else t['tree_depth'] + for t in tree_states], [t['cp'] for t in tree_states], bins['grp_key_cat'], bins['cat_n'], @@ -1469,9 +1487,10 @@ def _create_grp_result_table( plpy.execute(sql_plan, [ [t['grp_key'] for t in tree_states], [t['tree_state'] for t in tree_states], - [t['pruned_depth'] for t in tree_states], + [t['pruned_depth'] if 'pruned_depth' in t else t['tree_depth'] + for t in tree_states], [t['cp'] for t in tree_states] - ]) + ]) # ------------------------------------------------------------ @@ -1511,7 +1530,7 @@ def _create_summary_table( "$dep_list$") else: dep_list_str = "NULL" - indep_type = ', '.join(_get_col_value(all_cols_types, col) + indep_type = ', '.join(_dict_get_quoted(all_cols_types, col) for col in cat_features + con_features) dep_type = _get_dep_type(training_table_name, dependent_variable) cat_features_str = ','.join(cat_features) @@ -1560,8 +1579,12 @@ def _create_summary_table( # ------------------------------------------------------------ -def _get_filter_str(schema_madlib, cat_features, con_features, boolean_cats, - dependent_variable, grouping_cols, max_n_surr=0): +def _get_filter_str(schema_madlib, cat_features, con_features, + boolean_cats, dependent_variable, + grouping_cols, max_n_surr=0): + """ Return a 'WHERE' clause string that filters out all rows that contain a + NULL. 
+ """ if grouping_cols: g_filter = ' and '.join('(' + s.strip() + ') is not NULL' for s in grouping_cols.split(',')) else: @@ -1592,7 +1615,7 @@ def _get_filter_str(schema_madlib, cat_features, con_features, boolean_cats, def _validate_predict(schema_madlib, model, source, output, use_existing_tables): - # validations for inputs + # validations for inputs _assert(source and source.strip().lower() not in ('null', ''), "Decision tree error: Invalid data table name: {0}".format(source)) _assert(table_exists(source), @@ -1961,13 +1984,13 @@ SELECT * FROM tree_predict_out; # ------------------------------------------------------------ -def _prune_and_cplist(schema_madlib, tree_state, cp, compute_cp_list=False): +def _prune_and_cplist(schema_madlib, tree, cp, compute_cp_list=False): """ Prune tree with given cost-complexity parameters and return a list of cp values at which tree can be pruned Args: @param schema_madlib: str, MADlib schema name - @param tree_state: schema_madlib.bytea8, tree to prune + @param tree: Tree data to prune @param cp: float, cost-complexity parameter, all splits that have a complexity lower than 'cp' will be pruned @param compute_cp_list: bool, optionally return a list of cp values that @@ -1982,20 +2005,22 @@ def _prune_and_cplist(schema_madlib, tree_state, cp, compute_cp_list=False): cp_list: list of cp values at which tree can be pruned (returned only if compute_cp_list=True) """ + if cp <= 0 and not compute_cp_list: + return tree sql = """ SELECT (pruned_tree).* FROM ( - SELECT {schema_madlib}._prune_and_cplist( + SELECT {madlib}._prune_and_cplist( $1, ({cp})::double precision, ({compute_cp_list})::boolean ) as pruned_tree ) q - """.format(schema_madlib=schema_madlib, cp=cp, + """.format(madlib=schema_madlib, cp=cp, compute_cp_list=bool(compute_cp_list)) - sql_plan = plpy.prepare(sql, ['{schema_madlib}.bytea8'.format(schema_madlib=schema_madlib)]) - pruned_tree = plpy.execute(sql_plan, [tree_state])[0] + sql_plan = plpy.prepare(sql, [schema_madlib + '.bytea8']) + pruned_tree = plpy.execute(sql_plan, [tree['tree_state']])[0] return pruned_tree # ------------------------------------------------------------------------- @@ -2003,7 +2028,7 @@ def _prune_and_cplist(schema_madlib, tree_state, cp, compute_cp_list=False): def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_name, id_col_name, dependent_variable, list_of_features, list_of_features_to_exclude, - cat_features, con_features, boolean_cats, + cat_features, ordered_cat_features, boolean_cats, con_features, split_criterion, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, is_classification, dep_is_bool, dep_n_levels, n_folds, n_rows, @@ -2068,24 +2093,26 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam plpy.execute(plan, [grp_list, cp_list]) # 2) call CV function to actually cross-validate _build_tree - # expect output table model_cv({grouping_cols), cp, avg, stddev) + # expects output table model_cv({grouping_cols), cp, avg, stddev) model_cv = output_table_name + "_cv" metric_function = "_tree_misclassified" if is_classification else "_tree_rmse" pred_name = '"estimated_{0}"'.format(dependent_variable.strip(' "')) grouping_str = 'NULL' if not grouping_cols else '"' + grouping_cols + '"' cat_feature_str = _array_to_string(cat_features) - con_feature_str = _array_to_string(con_features) + ordered_cat_feature_str = _array_to_string(ordered_cat_features) boolean_cat_str = _array_to_string(boolean_cats) + con_feature_str = 
_array_to_string(con_features) modeling_params = [str(i) for i in (is_classification, split_criterion, "%data%", "%model%", id_col_name, dependent_variable, dep_is_bool, - cat_feature_str, boolean_cat_str, con_feature_str, + cat_feature_str, ordered_cat_feature_str, + boolean_cat_str, con_feature_str, grouping_str, weights, max_depth, min_split, min_bucket, n_bins, "%explore%", max_n_surr, msg_level)] modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] + - ["VARCHAR[]"] * 3 + ["TEXT"] * 2 + ["INTEGER"] * 4 + + ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 + ["TEXT", "SMALLINT", "TEXT"]) cross_validation_grouping_w_params( @@ -2141,7 +2168,6 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam grp_key_to_best_cp = dict((row['grp_key'], row['cp']) for row in validation_result) - plpy.notice("Finished cross validation, final pruning ...") # 4) update tree_states to have the best cp cross-validated for tree in tree_states: best_cp = grp_key_to_best_cp[tree['grp_key']] @@ -2151,11 +2177,16 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam # giving the optimal pruned tree. # This time we don't need the cp_list. pruned_tree = _prune_and_cplist(schema_madlib, - tree['tree_state'], + tree, tree['cp'], compute_cp_list=False) tree['tree_state'] = pruned_tree['tree_state'] - tree['pruned_depth'] = pruned_tree['pruned_depth'] + if 'pruned_depth' in pruned_tree: + tree['pruned_depth'] = pruned_tree['pruned_depth'] + elif 'tree_depth' in pruned_tree: + tree['pruned_depth'] = pruned_tree['tree_depth'] + else: + tree['pruned_depth'] = 0 plpy.execute("DROP TABLE {group_to_param_list_table}".format(**locals())) # ------------------------------------------------------------ @@ -2184,9 +2215,9 @@ def _tree_train_using_bins( max_n_surr=max_n_surr))[0] plpy.notice("Starting tree building") tree_depth = -1 - while not tree_state['finished']: - tree_depth += 1 + while tree_state['finished'] == 0: # finished: 0 = running, 1 = finished training, 2 = terminated prematurely + tree_depth += 1 tree_state = _one_step( schema_madlib, training_table_name, cat_features, con_features, boolean_cats, bins, http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index b5ed4a2..97e8471 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -225,11 +225,11 @@ tree_train( double precision columns are considered continuous. The categorical variables are not encoded and used as is for the training. - There are no limitations on the number of levels in a categorical variable. - It is, however, important to note that we don't test for every combination of + It is important to note that we don't test for every combination of levels of a categorical variable when evaluating a split. We order the levels - of the variable by the entropy of the variable in predicting the response. The - split at each node is evaluated between these ordered levels. + of the non-integer categorical variable by the entropy of the variable in + predicting the response. The split at each node is evaluated between these + ordered levels. 
Integer categorical variables are ordered by their value. </DD> <DT>list_of_features_to_exclude</DT> @@ -337,7 +337,10 @@ tree_train( - Many of the parameters are designed to be similar to the popular R package 'rpart'. An important distinction between rpart and the MADlib function is that for both response and feature variables, MADlib considers integer values as -categorical values, while rpart considers them as continuous. +categorical values, while rpart considers them as continuous. To use integers as +continuous, please cast them to double precision. +- Integer values are ordered by value for computing the split boundaries. Please +cast to TEXT if the entropy-based ordering method is desired. - When using no surrogates (<em>max_surrogates</em>=0), all rows containing NULL values for any of the features used for training will be ignored from training and prediction. - When cross-validation is not used (<em>n_folds</em>=0), each tree output @@ -349,8 +352,9 @@ to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding to this optimal sub-tree is placed in the <em>output_table</em>, with the columns named as <em>tree</em> and <em>pruning_cp</em> respectively. - The main parameters that affect memory usage are: depth of tree, number -of features, and number of values per feature. If you are hitting VMEM limits, -consider reducing one or more of these parameters. +of features, number of values per categorical feature, and number of bins for +continuous features. If you are hitting VMEM limits, consider reducing one or +more of these parameters. @anchor predict @par Prediction Function @@ -986,6 +990,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree( dependent_variable TEXT, dep_is_bool BOOLEAN, cat_features VARCHAR[], + ordered_cat_features VARCHAR[], boolean_cats VARCHAR[], con_features VARCHAR[], grouping_cols TEXT, http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 0eb5985..930d916 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -34,7 +34,7 @@ from decision_tree import _is_dep_categorical from decision_tree import _get_n_and_deplist from decision_tree import _classify_features from decision_tree import _get_filter_str -from decision_tree import _get_col_value +from decision_tree import _dict_get_quoted from decision_tree import _get_display_header from decision_tree import get_feature_str # ------------------------------------------------------------ @@ -329,8 +329,8 @@ def forest_train( "is more than the actual number of features.") all_cols_types = dict(get_cols_and_types(training_table_name)) - cat_features, con_features, boolean_cats = _classify_features( - all_cols_types, features) + cat_features, ordered_cat_features, con_features, boolean_cats = \ + _classify_features(all_cols_types, features) filter_null = _get_filter_str(schema_madlib, cat_features, con_features, boolean_cats, @@ -382,7 +382,8 @@ def forest_train( # bins, and continuous bins num_groups = 1 bins = _get_bins(schema_madlib, training_table_name, - cat_features, con_features, num_bins, dep, + cat_features, ordered_cat_features, + con_features, num_bins, dep, 
boolean_cats, n_rows, is_classification, dep_n_levels, filter_null) # some features may be dropped because they have only one value @@ -390,13 +391,14 @@ def forest_train( bins['grp_key_cat'] = [''] else: grouping_cols_list = [col.strip() for col in grouping_cols.split(',')] - grouping_cols_and_types = [(col, _get_col_value(all_cols_types, col)) + grouping_cols_and_types = [(col, _dict_get_quoted(all_cols_types, col)) for col in grouping_cols_list] - grouping_array_str = "array_to_string(array[" + \ - ','.join("(case when " + col + " then 'True' else 'False' end)::text" + grouping_array_str = ( + "array_to_string(array[" + + ','.join("(case when " + col + " then 'True' else 'False' end)::text" if col_type == 'boolean' else '(' + col + ')::text' - for col, col_type in grouping_cols_and_types) + \ - "], ',')" + for col, col_type in grouping_cols_and_types) + + "], ',')") grouping_cols_str = ('' if grouping_cols is None else grouping_cols + ",") sql_grp_key_to_grp_cols = """ @@ -417,7 +419,8 @@ def forest_train( """.format(**locals()))[0]['count'] plpy.notice("Analyzing data to compute split boundaries for variables") bins = _get_bins_grps(schema_madlib, training_table_name, - cat_features, con_features, num_bins, dep, + cat_features, ordered_cat_features, + con_features, num_bins, dep, boolean_cats, grouping_cols, grouping_array_str, n_rows, is_classification, dep_n_levels, filter_null) @@ -1198,7 +1201,7 @@ def _create_summary_table(**kwargs): else: kwargs['dep_list_str'] = "NULL" - kwargs['indep_type'] = ', '.join(_get_col_value(kwargs['all_cols_types'], col) + kwargs['indep_type'] = ', '.join(_dict_get_quoted(kwargs['all_cols_types'], col) for col in kwargs['cat_features'] + kwargs['con_features']) kwargs['dep_type'] = _get_dep_type(kwargs['training_table_name'], kwargs['dependent_variable']) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in index 8f9168c..1863b64 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in @@ -370,6 +370,7 @@ select __build_tree( FALSE, ARRAY['"OUTLOOK"']::text[], '{}', + '{}', '{humidity}', 'class', '1', http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c82b9d0a/src/ports/postgres/modules/validation/cross_validation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/validation/cross_validation.py_in b/src/ports/postgres/modules/validation/cross_validation.py_in index 7b39c90..f157ffa 100644 --- a/src/ports/postgres/modules/validation/cross_validation.py_in +++ b/src/ports/postgres/modules/validation/cross_validation.py_in @@ -402,7 +402,6 @@ def cross_validation_grouping_w_params( with MinWarning("warning"): if not data_cols: data_cols = get_cols(data_tbl, schema_madlib) - n_rows = _validate_cv_args(**locals()) explore_type_str = "::INTEGER"
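
Usage sketch for the behavior changes above. All table and column names here (patients, id, recovered, grade, age) are hypothetical, and the tree_train overloads with the pruning_params argument are assumed from the module documentation:

    -- 'grade' is an integer-coded categorical: its levels are now ordered
    -- by value when computing split boundaries.
    DROP TABLE IF EXISTS tree_int, tree_int_summary;
    SELECT madlib.tree_train('patients', 'tree_int', 'id', 'recovered',
                             'grade, age', NULL, 'gini', NULL, NULL,
                             5,        -- max_depth
                             20, 6,    -- min_split, min_bucket
                             100,      -- number of bins
                             'cp=0');  -- cp = 0 (the default): pruning is skipped

    -- Cast the integer column to TEXT to recover the entropy-based ordering:
    DROP TABLE IF EXISTS tree_txt, tree_txt_summary;
    SELECT madlib.tree_train('patients', 'tree_txt', 'id', 'recovered',
                             'grade::TEXT, age', NULL, 'gini', NULL, NULL,
                             5, 20, 6, 100,
                             'cp=0.005');  -- cp > 0: cost-complexity pruning runs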