Repository: incubator-madlib Updated Branches: refs/heads/master 6025c4b0d -> ceefae4f4
Elastic Net: Fix normalization issue MADLIB-1094 and MADLIB-1146 avg in psql is numerically unstable Data scaling was not occurring when grouping is true. Closes #164 Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ceefae4f Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ceefae4f Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ceefae4f Branch: refs/heads/master Commit: ceefae4f4180b88a1aa5712d0e43f0b00573c378 Parents: 6025c4b Author: Cooper Sloan <cooper.sl...@gmail.com> Authored: Thu Aug 10 12:04:04 2017 -0700 Committer: Orhan Kislal <okis...@pivotal.io> Committed: Fri Aug 11 11:31:12 2017 -0700 ---------------------------------------------------------------------- .../elastic_net_generate_result.py_in | 6 +-- .../elastic_net/elastic_net_optimizer_igd.py_in | 4 +- .../modules/elastic_net/elastic_net_utils.py_in | 42 ++++++++++++++------ 3 files changed, 35 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in index df5489f..7a87ef6 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in @@ -38,13 +38,13 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): select_mean_and_std = '' inner_join_x = '' inner_join_y = '' + grouping_cols_list = split_quoted_delimited_str(grouping_column) + select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp) + for grp in grouping_cols_list]) + ',' if 
data_scaled: - grouping_cols_list = split_quoted_delimited_str(grouping_column) select_grouping_info = ','.join([ grp_col.strip()+"\t"+cols_types[grp_col.strip()] for grp_col in grouping_column.split(',')]) + "," - select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp) - for grp in grouping_cols_list]) + ',' x_grp_cols = ' AND '.join([ 'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp, args["x_mean_table"], grp) for grp in grouping_cols_list]) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in index d73a754..c5d21c2 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in @@ -4,7 +4,7 @@ from utilities.utilities import unique_string from utilities.in_mem_group_control import GroupIterationController from elastic_net_utils import _compute_means from elastic_net_utils import _normalize_data -from elastic_net_utils import _compute_data_scales +from elastic_net_utils import _compute_scales from elastic_net_utils import _tbl_dimension_rownum from elastic_net_utils import _elastic_net_validate_args from utilities.utilities import _array_to_string @@ -216,7 +216,7 @@ def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate, args["col_ind_var_new"] = args["col_ind_var_norm_new"] args["col_dep_var_new"] = args["col_dep_var_norm_new"] else: - _compute_data_scales(args) + _compute_scales(args) tbl_used = tbl_source args["col_ind_var_new"] = col_ind_var args["col_dep_var_new"] = col_dep_var http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in 
---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in index b2f2505..154ac31 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in @@ -129,18 +129,27 @@ def _compute_log_likelihood(coef, intercept, **args): Compute the log-likelihood at the end of calculation """ if args["family"] == "gaussian": # linear models + loss_query = """ + select + {method}(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict( + '{coefficients}'::double precision[], + {intercept}::double precision, + {col_ind_var_new}))^2)/({denominator}) + as loss + from + {tbl_used} + """ + # See jira 1094, avg experiences numerical instability + denominator = "2." + method = "avg" + if not args["normalization"]: + method = "sum" + denominator = "count(*) * 2." loss = plpy.execute( - """ - select - avg(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict( - '{coefficients}'::double precision[], - {intercept}::double precision, - {col_ind_var_new}))^2) / 2. 
- as loss - from - {tbl_used} - """.format(coefficients=_array_to_string(coef), + loss_query.format(coefficients=_array_to_string(coef), intercept=intercept, + method=method, + denominator=denominator, **args))[0]["loss"] elif args["family"] == "binomial": # logistic models loss = plpy.execute( @@ -192,8 +201,18 @@ def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, return None # ------------------------------------------------------------------------ +def _compute_scales(args): + if args["grouping_col"]: + _compute_data_scales_grouping(args) + else: + _compute_data_scales(args) def _compute_data_scales_grouping(args): + # When grouping_col is defined, we must find an array containing + # the mean of every dimension in the independent variable (x), the + # mean of dependent variable (y) and the standard deviation for them + # specific to groups. Store these results in temp tables x_mean_table + # and y_mean_table. __utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"], args["dimension"], args["schema_madlib"], args["grouping_col"], args["x_mean_table"]) @@ -227,13 +246,13 @@ def _normalize_data(args): The output is stored in tbl_data_scaled """ y_decenter = True if args["family"] == "gaussian" else False + _compute_scales(args) if args["grouping_col"]: # When grouping_col is defined, we must find an array containing # the mean of every dimension in the independent variable (x), the # mean of dependent variable (y) and the standard deviation for them # specific to groups. Store these results in temp tables x_mean_table # and y_mean_table. - _compute_data_scales_grouping(args) # __utils_normalize_data_grouping reads the various means and stds # from the tables. __utils_normalize_data_grouping(y_decenter=y_decenter, @@ -251,7 +270,6 @@ def _normalize_data(args): # When no grouping_col is defined, the mean and std for both 'x' and # 'y' can be defined using strings, stored in x_mean_str, x_std_str # etc. 
We don't need a table like how we needed for grouping. - _compute_data_scales(args) __utils_normalize_data(y_decenter=y_decenter, tbl_data=args["tbl_source"], col_ind_var=args["col_ind_var"],