Repository: incubator-madlib Updated Branches: refs/heads/master 41a439f47 -> 80410635a
Multiple: Fix array to string conversion for DT with CV This change is made to allow double quotes in names of feature arrays input to DT/RF. The double quotes were leading to failures when cast into a text array for cross validation. Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/80410635 Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/80410635 Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/80410635 Branch: refs/heads/master Commit: 80410635a0d4e0ce8eb95a85d9dbb7fc551eb2e7 Parents: 41a439f Author: Rahul Iyer <ri...@apache.org> Authored: Wed May 17 16:31:45 2017 -0700 Committer: Rahul Iyer <ri...@apache.org> Committed: Wed May 17 16:31:45 2017 -0700 ---------------------------------------------------------------------- .../recursive_partitioning/decision_tree.py_in | 120 +++++++++++++++---- .../modules/validation/cross_validation.py_in | 34 ++++-- 2 files changed, 118 insertions(+), 36 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/80410635/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 32c484d..7b724c1 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -2088,37 +2088,30 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam pred_name = '"estimated_{0}"'.format(dependent_variable.strip(' "')) grouping_str = 'NULL' if not grouping_cols else '"' + grouping_cols + '"' - cat_feature_str = _array_to_string(cat_features) - ordered_cat_feature_str = _array_to_string(ordered_cat_features) - boolean_cat_str = _array_to_string(boolean_cats) - con_feature_str = _array_to_string(con_features) - - modeling_params = [str(i) for i in - (is_classification, - split_criterion, "%data%", "%model%", id_col_name, - dependent_variable, dep_is_bool, - cat_feature_str, ordered_cat_feature_str, - boolean_cat_str, con_feature_str, - grouping_str, weights, max_depth, - min_split, min_bucket, n_bins, - "%explore%", max_n_surr, msg_level)] - modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] + - ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 + - ["TEXT", "SMALLINT", "TEXT"]) - + all_features = [cat_features, ordered_cat_features, boolean_cats, con_features] + + # _get_xvalidate_params builds the parameters used in + # DT train, predict, distance functions. Single quotes are added in these + # parameters (except for the feature arrays) since we run + # cross_validation_grouping_w_params with `add_param_quotes=False'. + # This special handling is put in place to ensure the feature arrays are + # treated as arrays instead of strings. + xvalidate_params = _get_xvalidate_params(**locals()) cross_validation_grouping_w_params( schema_madlib, schema_madlib + '.__build_tree', - modeling_params, - modeling_param_types, + xvalidate_params[0], + xvalidate_params[1], schema_madlib + '.__tree_predict', - ["%model%", "%data%", "%prediction%", "response", "True"], - ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "BOOLEAN"], + xvalidate_params[2], + xvalidate_params[3], schema_madlib + "." + metric_function, - ["%data%", dependent_variable, "%prediction%", pred_name, id_col_name, grouping_cols, "%error%", "True"], - ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "TEXT", "VARCHAR", "BOOLEAN"], + xvalidate_params[4], + xvalidate_params[5], group_to_param_list_table, param_list_name, grouping_cols, - training_table_name, id_col_name, False, model_cv, 'cp', None, n_folds) + training_table_name, id_col_name, False, + model_cv, 'cp', None, n_folds, + add_param_quotes=False) # 3) find the best cp for each group from table {model_cv} if not grouping_cols: @@ -2183,6 +2176,83 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam # ------------------------------------------------------------ +def _get_xvalidate_params(**kwargs): + """ Build train, predict, and metric parameters for cross_validation + + Args: + @param all_features + + Returns: + + """ + def _list_to_string_to_array(array_input): + """ Return a string that can interpreted by postgresql as text[] containing + the names in array_input + + Example: + Input: ['"Cont_features"[1]', '"Cont_features"[2]'] + Output: string_to_array('"Cont_features"[1]~^~"Cont_features"[2]'::text, '~^~'); + + When this output is executed by Postgresql it creates a text array: + madlib=# select string_to_array('"Cont_features"[1]~^~"Cont_features"[2]'::text, '~^~')::VARCHAR[] as t; + t + ------------------------------------------------- + {"\"Cont_features\"[1]","\"Cont_features\"[2]"} + (1 row) + """ + if not array_input: + return "'{}'" + return "string_to_array('{0}', '~^~')".format('~^~'.join(array_input)) + + all_feature_str = [_list_to_string_to_array(i) for i in kwargs['all_features']] + + def _add_quote(s): + if s is None: + return None + s = str(s) + return "NULL" if s.upper() == 'NULL' else "'{0}'".format(s) + + quoted_args = {} + for k, v in kwargs.items(): + quoted_args[k] = _add_quote(v) + + modeling_params = [quoted_args['is_classification'], + quoted_args['split_criterion'], + "%data%", + "%model%", + quoted_args['id_col_name'], + quoted_args['dependent_variable'], + quoted_args['dep_is_bool'], + all_feature_str[0], + all_feature_str[1], + all_feature_str[2], + all_feature_str[3], + quoted_args['grouping_str'], + quoted_args['weights'], + quoted_args['max_depth'], + quoted_args['min_split'], quoted_args['min_bucket'], quoted_args['n_bins'], + "%explore%", quoted_args['max_n_surr'], quoted_args['msg_level'] + ] + modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] + + ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 + + ["TEXT", "SMALLINT", "TEXT"]) + predict_params = ["%model%", "%data%", "%prediction%", "'response'", "True"] + predict_param_types = ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "BOOLEAN"] + metric_params = ["%data%", + quoted_args['dependent_variable'], + "%prediction%", + quoted_args['pred_name'], + quoted_args['id_col_name'], + quoted_args['grouping_cols'], + "%error%", + "True"] + metric_param_types = ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "TEXT", "VARCHAR", "BOOLEAN"] + return [modeling_params, modeling_param_types, + predict_params, predict_param_types, + metric_params, metric_param_types] +# ---------------------------------------------------------------------- + + def _tree_train_using_bins( schema_madlib, bins, training_table_name, cat_features, con_features, boolean_cats, n_bins, weights, http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/80410635/src/ports/postgres/modules/validation/cross_validation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/validation/cross_validation.py_in b/src/ports/postgres/modules/validation/cross_validation.py_in index f157ffa..3b3dfe1 100644 --- a/src/ports/postgres/modules/validation/cross_validation.py_in +++ b/src/ports/postgres/modules/validation/cross_validation.py_in @@ -20,7 +20,8 @@ mad_vec = version_wrapper.select_vecfunc() def __cv_combine_params_type_general(params, params_type, tbl_data, col_random_id, param_explored, explore_value, tbl_input, tbl_output, - grp_to_param_tbl=None): + grp_to_param_tbl=None, + add_param_quotes=True): """ Create argument list for SQL functions for training, validation and metric measuring @@ -71,7 +72,9 @@ def __cv_combine_params_type_general(params, params_type, tbl_data, elif p == "%group_param_tbl%": rst.append("\'" + tbl_output + "\'::" + p_type) else: - rst.append("\'" + p + "\'::" + p_type) + if add_param_quotes: + p = "'{0}'".format(p) + rst.append(p + "::" + p_type) return ','.join(rst) # ------------------------------------------------------------------------ @@ -106,7 +109,7 @@ def _replace_explore(params, param_explored, explore_value): def __cv_funcall_general(func, params, params_type, tbl_data, col_random_id, param_explored, explore_value, tbl_input, - tbl_output): + tbl_output, add_param_quotes=True): """ Call training, validation or metric measuring function @@ -116,9 +119,10 @@ def __cv_funcall_general(func, params, params_type, tbl_data, col_random_id, """ arg_string = __cv_combine_params_type_general( params, params_type, tbl_data, col_random_id, - param_explored, explore_value, tbl_input, tbl_output - ) + param_explored, explore_value, tbl_input, tbl_output, + add_param_quotes=add_param_quotes) sql = "SELECT {func}({arg_string})".format(func=func, arg_string=arg_string) + plpy.info(sql) plpy.execute(sql) # ------------------------------------------------------------------------ @@ -207,7 +211,7 @@ def _one_step_cv(tbl_output_model, tbl_output_pred, tbl_output_error, tbl_accum_ modelling_func, modelling_params, modelling_params_type, tbl_train, predict_func, predict_params, predict_params_type, tbl_valid, metric_func, metric_params, metric_params_type, k=0, append_k=False, - use_existing_tables=False, **kwargs): + use_existing_tables=False, add_param_quotes=True, **kwargs): """ Args: @param schema_madlib @@ -251,19 +255,22 @@ def _one_step_cv(tbl_output_model, tbl_output_pred, tbl_output_error, tbl_accum_ __cv_funcall_general( modelling_func, modelling_params, modelling_params_type, tbl_train, col_random_id, temp_param_explored, - explore_value, None, tbl_output_model) + explore_value, None, tbl_output_model, + add_param_quotes=add_param_quotes) # validation __cv_funcall_general( predict_func, predict_params, predict_params_type, tbl_valid, col_random_id, temp_param_explored, - explore_value, tbl_output_model, tbl_output_pred) + explore_value, tbl_output_model, tbl_output_pred, + add_param_quotes=add_param_quotes) # measure the performance metric __cv_funcall_general( metric_func, metric_params, metric_params_type, tbl_valid, col_random_id, temp_param_explored, - explore_value, tbl_output_pred, tbl_output_error) + explore_value, tbl_output_pred, tbl_output_error, + add_param_quotes=add_param_quotes) # accumulate the measured metric result if not output_created: @@ -380,9 +387,10 @@ def cross_validation_general( return None # ------------------------------------------------------------------------ -# XXX Currently, this function is used only by the decision tree, +# XXX Currently, this function is built specifically for the decision tree, # which appends the outputs from different cv folds to a single # table to reduce catalog changes and thus improve the performance. +# FIXME: this should either be made general enough or moved to decision_tree.py_in def cross_validation_grouping_w_params( @@ -436,7 +444,8 @@ def cross_validation_grouping_w_params( param_list_name=param_list_name, group_to_param_list_table=group_to_param_list_table)) - _replace_explore(modelling_params, unique_string(), grp_to_param_tbl) + _replace_explore(modelling_params, unique_string(), + "'" + grp_to_param_tbl + "'") # get the maximum size of the param list among all groups max_len = plpy.execute(""" SELECT max(array_upper({0}, 1)) as max_len @@ -477,6 +486,9 @@ def cross_validation_grouping_w_params( # to speed up the search in the prediction table. append_k = True # k is fold ID to distinguish different sets of results. + # XXX Important to not add single quotes around param + # as a special-handling for decision trees + add_param_quotes = False output_created = _one_step_cv(**locals()) # compute the averages and standard deviations of cv_error for each group and explore value