Repository: madlib Updated Branches: refs/heads/master 3c762e68f -> ea73f546a
CV: Simplify and fix internal CV requirements This commit ensures internal cross validation API is consistent and simplifies the arguments for CV parameters. Closes #307 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/64fbcaee Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/64fbcaee Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/64fbcaee Branch: refs/heads/master Commit: 64fbcaee3ec95b96c1cf8c7b113db7605f4d4cf4 Parents: 40c2071 Author: Rahul Iyer <ri...@apache.org> Authored: Sun Aug 5 19:18:32 2018 -0700 Committer: Rahul Iyer <ri...@apache.org> Committed: Mon Aug 6 15:34:17 2018 -0700 ---------------------------------------------------------------------- .../modules/elastic_net/test/elastic_net.sql_in | 51 +++++++++--------- src/ports/postgres/modules/svm/svm.py_in | 55 ++++++++++---------- src/ports/postgres/modules/svm/test/svm.sql_in | 4 -- .../validation/internal/cross_validation.py_in | 50 ++++++++++-------- 4 files changed, 79 insertions(+), 81 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/64fbcaee/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in index 28a6fe6..e3765ee 100644 --- a/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in +++ b/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in @@ -814,29 +814,28 @@ end; $$ language plpgsql volatile; select check_elastic_net(); --- This test has been temporarily removed for GPDB5 alpha support - --- DROP TABLE if exists house_en, house_en_summary, house_en_cv; --- SELECT elastic_net_train( --- 'lin_housing_wi', --- 'house_en', --- 'y', --- 'x', --- 'gaussian', --- 0.1, --- 0.2, --- True, --- NULL, --- 'fista', --- $$ eta = 2, max_stepsize = 0.5, use_active_set = f, --- n_folds = 3, validation_result=house_en_cv, --- n_lambdas = 3, alpha = {0, 0.1, 1}, --- warmup = True, warmup_lambdas = {10, 1, 0.1} --- $$, --- NULL, --- 100, --- 1e-2 --- ); --- SELECT * FROM house_en; --- SELECT * FROM house_en_summary; --- SELECT * FROM house_en_cv; + +DROP TABLE if exists house_en, house_en_summary, house_en_cv; +SELECT elastic_net_train( + 'lin_housing_wi', + 'house_en', + 'y', + 'x', + 'gaussian', + 0.1, + 0.2, + True, + NULL, + 'fista', + $$ eta = 2, max_stepsize = 0.5, use_active_set = f, + n_folds = 3, validation_result=house_en_cv, + n_lambdas = 3, alpha = {0, 0.1, 1}, + warmup = True, warmup_lambdas = {10, 1, 0.1} + $$, + NULL, + 100, + 1e-2 +); +SELECT * FROM house_en; +SELECT * FROM house_en_summary; +SELECT * FROM house_en_cv; http://git-wip-us.apache.org/repos/asf/madlib/blob/64fbcaee/src/ports/postgres/modules/svm/svm.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 4760f36..b8780ab 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -384,9 +384,9 @@ def svm_one_class(schema_madlib, source_table, model_table, independent_varname, _verify_table(source_table, model_table, dependent_varname, independent_varname, verify_dep=False) reserved_cols =['coef', 'random_feature_data', - 'random_feature_data', 'loss' - 'num_rows_processed', 'num_rows_skipped', - 'norm_of_gradient', 'num_iterations'] + 'random_feature_data', 'loss' + 'num_rows_processed', 'num_rows_skipped', + 'norm_of_gradient', 'num_iterations'] grouping_str, grouping_col = get_grouping_col_str(schema_madlib, 'SVM', reserved_cols, source_table, @@ -403,15 +403,16 @@ def svm_one_class(schema_madlib, source_table, model_table, independent_varname, dependent_varname, independent_varname, kernel_func, kernel_params, grouping_col, override_fit_intercept=True) - params_dict = _extract_params(schema_madlib, params) - if not params_dict['class_weight']: - params_dict['class_weight'] = 'balanced' source_table = transformer.transformed_table['source_table'] independent_varname = transformer.transformed_table['independent_varname'] dependent_varname = transformer.transformed_table['dependent_varname'] update_source_for_one_class = True args = locals() + + args.update(_extract_params(schema_madlib, params)) + if not args['class_weight']: + args['class_weight'] = 'balanced' _cross_validate_svm(args) _svm_parsed_params(use_transformer_for_output=True, **args) transformer.clear() @@ -952,9 +953,9 @@ def svm(schema_madlib, source_table, model_table, _verify_table(source_table, model_table, dependent_varname, independent_varname) reserved_cols =['coef', 'random_feature_data', - 'random_feature_data', 'loss' - 'num_rows_processed', 'num_rows_skipped', - 'norm_of_gradient', 'num_iterations'] + 'random_feature_data', 'loss' + 'num_rows_processed', 'num_rows_skipped', + 'norm_of_gradient', 'num_iterations'] grouping_str, grouping_col = \ get_grouping_col_str(schema_madlib, 'SVM', reserved_cols, source_table, grouping_col) @@ -963,11 +964,10 @@ def svm(schema_madlib, source_table, model_table, dependent_varname, independent_varname, kernel_func, kernel_params, grouping_col) - params_dict = _extract_params(schema_madlib, params) args = locals() + args.update(_extract_params(schema_madlib, params)) if transformer.transformed_table: args.update(transformer.transformed_table) - _cross_validate_svm(args) _svm_parsed_params(use_transformer_for_output=True, **args) transformer.clear() @@ -976,7 +976,7 @@ def svm(schema_madlib, source_table, model_table, def _cross_validate_svm(args): # updating params_dict will also update args['params_dict'] - params_dict = args['params_dict'] + params_dict = args if params_dict['n_folds'] > 1 and args['grouping_col']: plpy.error('SVM Error: cross validation ' @@ -1018,10 +1018,9 @@ def _cross_validate_svm(args): return scorer = 'classification' if args['is_svc'] else 'regression' - sub_args = {'params_dict': cv_params} - # we want svm in cross validation to not transform the data again, + # svm in cross validation should not transform the data, # since test data in cross validation comes from the transformed source table. - # A linear transformer without adding intercept is a no-op transformer. + # A linear transformer without intercept is a no-op transformer. no_op_kernel = create_kernel(args['schema_madlib'], 0, 'linear', {'fit_intercept': False}) no_op_transformer = no_op_kernel.transform(args['source_table'], @@ -1030,9 +1029,9 @@ def _cross_validate_svm(args): transformer = args.get('transformer', no_op_transformer) args.update(dict(transformer=no_op_transformer)) cv = CrossValidator(_svm_parsed_params, svm_predict, scorer, args) - val_res = cv.validate(sub_args, params_dict['n_folds']).sorted() + val_res = cv.validate(cv_params, params_dict['n_folds']) val_res.output_tbl(params_dict['validation_result']) - params_dict.update(val_res.first('sub_args')['params_dict']) + params_dict.update(val_res.top('sub_args')) args.update(dict(transformer=transformer)) # ------------------------------------------------------------------------------ @@ -1129,8 +1128,9 @@ def _compute_class_weight_sql(source_table, dependent_varname, def _svm_parsed_params(schema_madlib, source_table, model_table, - dependent_varname, independent_varname, transformer, - grouping_str, grouping_col, params_dict, is_svc, + dependent_varname, independent_varname, + transformer, grouping_str, + grouping_col, is_svc, use_transformer_for_output=False, update_source_for_one_class=False, verbose=False, **kwargs): @@ -1186,7 +1186,7 @@ def _svm_parsed_params(schema_madlib, source_table, model_table, class_weight_sql = _compute_class_weight_sql(source_table, dependent_varname, is_svc, - params_dict['class_weight']) + kwargs['class_weight']) args = locals() args.update({ @@ -1202,7 +1202,7 @@ def _svm_parsed_params(schema_madlib, source_table, model_table, 'col_dep_var': args['dependent_varname'], }) - args.update(_verify_get_params_dict(params_dict)) + args.update(_verify_get_params_dict(kwargs)) args.update(_process_epsilon(is_svc, args)) args.update(_svc_or_svr(is_svc, source_table, dependent_varname)) @@ -1254,13 +1254,12 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, input_tbl_valid(new_data_table, 'SVM') reserved_cols =['coef', 'random_feature_data', - 'random_feature_data', 'loss' - 'num_rows_processed', 'num_rows_skipped', - 'norm_of_gradient', 'num_iterations'] - grouping_str, grouping_col = get_grouping_col_str(schema_madlib, - 'SVM', reserved_cols, - new_data_table, - grouping_col) + 'random_feature_data', 'loss' + 'num_rows_processed', 'num_rows_skipped', + 'norm_of_gradient', 'num_iterations'] + grouping_str, grouping_col = get_grouping_col_str( + schema_madlib, 'SVM', reserved_cols, + new_data_table, grouping_col) _assert(is_var_valid(new_data_table, independent_varname), "SVM Error: independent_varname ('" + independent_varname + "') is invalid for new_data_table (" + new_data_table + ")!") http://git-wip-us.apache.org/repos/asf/madlib/blob/64fbcaee/src/ports/postgres/modules/svm/test/svm.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in index cba370c..217b9a0 100644 --- a/src/ports/postgres/modules/svm/test/svm.sql_in +++ b/src/ports/postgres/modules/svm/test/svm.sql_in @@ -822,8 +822,6 @@ FROM svm_unbalanced JOIN svm_predict_out using (index) WHERE y = prediction and y = 1; -/* Disabling cross validation tests due to a DROP CASCADE issue - -- Cross validation tests SELECT svm_one_class( 'svm_normalized', @@ -841,7 +839,6 @@ SELECT * FROM svm_model_expression2_summary; \x off SELECT svm_predict('svm_model_expression2', 'svm_test_normalized', 'id', 'svm_test_model_expression2'); -SELECT * FROM svm_test_model_expression2; SELECT svm_regression( 'svr_train_data', 'm1', @@ -908,4 +905,3 @@ SELECT array_upper(coef, 1) = 3, 'The dimension of the coefficients must be equal to n_components (3)!') FROM m9; -*/ http://git-wip-us.apache.org/repos/asf/madlib/blob/64fbcaee/src/ports/postgres/modules/validation/internal/cross_validation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in index c173533..1cf3463 100644 --- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in +++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in @@ -16,13 +16,12 @@ # under the License. import plpy -from validation.cv_utils import __cv_produce_col_name_string as _cv_col_string -from validation.cv_utils import __cv_validation_rows as _cv_validation_rows from utilities.utilities import __mad_version from utilities.utilities import unique_string from utilities.utilities import num_samples - -from utilities.validate_args import get_cols_and_types +from utilities.utilities import add_postfix +from validation.cv_utils import __cv_produce_col_name_string as _cv_col_string +from validation.cv_utils import __cv_validation_rows as _cv_validation_rows from math import sqrt from collections import namedtuple @@ -103,7 +102,9 @@ class ValidationResult(object): def sorted(self): """Sort the history w.r.t. mean value and return a new ValidationResult object""" - ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean_score')) + ch = sorted(self._cv_history, + reverse=True, + key=itemgetter('mean_score')) return ValidationResult(ch) def first(self, attr=None): @@ -138,8 +139,7 @@ class ValidationResult(object): """ if not tbl_name or not str(tbl_name).strip(): return - - header = (self._cv_history[0]['sub_args']['params_dict'].keys() + + header = (self._cv_history[0]['sub_args'].keys() + ['mean_score', 'std_dev_score']) header_str = ','.join(map(str, header)) @@ -163,7 +163,7 @@ class _ValidationArgs(object): @classmethod def grid(cls, sub_args): - def comb_dict(dicts): + def combine_dict(dicts): # same as dict((k, v) for d in dicts for k, v in d.iteritems()) return dict(chain.from_iterable(d.iteritems() for d in dicts)) @@ -177,7 +177,7 @@ class _ValidationArgs(object): elif isinstance(v, dict): a.append(make_dicts(k, cls.grid(v))) tuples = product(*a) - return map(comb_dict, tuples) + return map(combine_dict, tuples) def make_from(self, **kwargs): def _update_dict(d1, d2): @@ -186,7 +186,9 @@ class _ValidationArgs(object): if not isinstance(d2, dict): raise TypeError("{0} is not dict".format(type(d2))) for k, v in d2.iteritems(): - if isinstance(v, dict) and isinstance(d1[k], dict): + if (isinstance(v, dict) and + k in d1 and + isinstance(d1[k], dict)): _update_dict(d1[k], v) else: d1[k] = v @@ -233,14 +235,14 @@ def _cv_split_data(rel_source, col_data, col_id, row_num, # which corresponds to rows outside of [start_row, end_row). # Extract the validation part of data, # which corresponds to rows inside of [start_row, end_row). - plpy.execute("drop view if exists {rel_train}".format(**kwargs)) + plpy.execute("drop view if exists {0}".format(rel_train)) plpy.execute(""" create temp view {rel_train} as select {col_id}, {col_string} from {rel_source} where {col_id} < {start_row} or {col_id} >= {end_row} """.format(**kwargs)) - plpy.execute("drop view if exists {rel_valid}".format(**kwargs)) + plpy.execute("drop view if exists {0}".format(rel_valid)) plpy.execute(""" create temp view {rel_valid} as select {col_id}, {col_string} from {rel_source} @@ -251,6 +253,12 @@ def _cv_split_data(rel_source, col_data, col_id, row_num, # ------------------------------------------------------------------------------ +def _cv_del_split_data(rel_train, rel_valid): + plpy.execute("drop view if exists {0} cascade".format(rel_train)) + plpy.execute("drop view if exists {0} cascade".format(rel_valid)) +# ------------------------------------------------------------------------------ + + class CrossValidator(object): """ Cross validation for estimator @@ -325,7 +333,6 @@ class CrossValidator(object): scorer = self._scorer rel_train = split_data.rel_train rel_valid = split_data.rel_valid - args = self._cv_args.make_from(source_table=rel_train, dependent_varname=self._target_col, independent_varname=self._features_col, @@ -339,10 +346,9 @@ class CrossValidator(object): predictor(schema_madlib, model_table, rel_valid, col_id, output_table) score = self._score(output_table, rel_valid, scorer) - plpy.execute("DROP TABLE IF EXISTS {model_table}, {model_table}_summary;". - format(model_table=model_table)) - plpy.execute("DROP TABLE IF EXISTS {output_table};". - format(output_table=output_table)) + plpy.execute("DROP TABLE IF EXISTS {0}, {1}". + format(model_table, add_postfix(model_table, "_summary"))) + plpy.execute("DROP TABLE IF EXISTS {0}".format(output_table)) return score def _score(self, pred, orig, method): @@ -390,11 +396,8 @@ class CrossValidator(object): Arguments to be validated. Multiple values are provided in a list, e.g., sub_args = { - 'params_dict': - { - 'lambda': [0.1, 1, 10], - 'epsilon': [0.1, 1, 10] - } + 'lambda': [0.1, 1, 10], + 'epsilon': [0.1, 1, 10] } Before running estimator, sub_args_single is generated from sub_args replacing the lists with single value for each argument and args is updated recursively using sub_args_single. @@ -417,7 +420,6 @@ class CrossValidator(object): if not sub_args: return [] - cv_history = ValidationResult() all_split_data = list(self._gen_split_data(n_folds)) tof = self._test_one_fold @@ -425,4 +427,6 @@ class CrossValidator(object): scores = [tof(i, sub_args=sa) for i in all_split_data] a, s = _stats(scores) cv_history.add_one(mean=a, std=s, sub_args=sa) + for each_split in all_split_data: + _cv_del_split_data(each_split.rel_train, each_split.rel_valid) return cv_history