This is an automated email from the ASF dual-hosted git repository. okislal pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push: new afd408d Linear Regression: Support for JSON and special characters afd408d is described below commit afd408d0f73cb849bbbdc651642d8aa39e51ffcf Author: Orhan Kislal <okis...@apache.org> AuthorDate: Tue Jan 22 12:50:54 2019 +0300 Linear Regression: Support for JSON and special characters JIRA: MADLIB-1284 - Modified code in linear.py_in to support special characters and JSON data type - Written test case for JSON and Speacial characters - Test case checks for JSON and special characters in GPDB 5.x and Special characters in GPDB 4.x Co-authored-by: Orhan Kislal <okis...@apache.org> Closes #343 --- src/ports/postgres/modules/regress/linear.py_in | 78 ++++++++--- .../postgres/modules/regress/test/linear.sql_in | 150 +++++++++++++++++++++ 2 files changed, 211 insertions(+), 17 deletions(-) diff --git a/src/ports/postgres/modules/regress/linear.py_in b/src/ports/postgres/modules/regress/linear.py_in index 70fe78a..2be78e9 100644 --- a/src/ports/postgres/modules/regress/linear.py_in +++ b/src/ports/postgres/modules/regress/linear.py_in @@ -11,19 +11,53 @@ from utilities.validate_args import table_is_empty from utilities.utilities import add_postfix from utilities.utilities import _assert from utilities.control import MinWarning +from utilities.utilities import get_table_qualified_col_str +from utilities.utilities import strip_end_quotes +from utilities.utilities import _string_to_array +from utilities.validate_args import get_expr_type +from utilities.utilities import _string_to_sql_array +from utilities.validate_args import quote_ident + + # ---------------------------------------------------------------------- def linregr_train(schema_madlib, source_table, out_table, dependent_varname, independent_varname, grouping_cols, heteroskedasticity_option, **kwargs): + """ + Args: + @param source_table, -- name of input table + @param out_table, -- name of output table + @param dependent_varname, -- name of dependent variable + @param independent_varname, -- name of independent variables + @param grouping_cols, -- names of columns to group-by + @param heteroskedasticity_option -- perform heteroskedasticity test? + + """ + with MinWarning('warning'): _validate_args(schema_madlib, source_table, out_table, dependent_varname, independent_varname, grouping_cols, heteroskedasticity_option) - group_str = '' if grouping_cols is None else 'GROUP BY %s' % grouping_cols - group_str_sel = '' if grouping_cols is None else grouping_cols + ',' + group_col_list = ''if grouping_cols is None else _string_to_array_with_quotes( + grouping_cols) + + group_str = '' if grouping_cols is None else 'GROUP BY %s' % get_table_qualified_col_str(source_table, + group_col_list) + # For json expressions like data->>'ID', this will quote them to + # "data->>'ID'" + cols_wo_quotes = ''if grouping_cols is None else ' '.join( + strip_end_quotes(col).format(**locals()) for col in group_col_list) + group_cols_w_quotes = ''if grouping_cols is None else ' ,'.join( + " \"" + strip_end_quotes(col) + "\" ".format(**locals()) for col in group_col_list) + + # For json expressions like data->>'ID' it will alias the columns to + # "data->>'ID'" + group_str_sel = ''if grouping_cols is None else ' , '.join("{source_table}.{col} as \"".format(source_table=source_table, col=col) + strip_end_quotes(col) + "\" " + for col in group_col_list) + "," + join_str = ',' if grouping_cols is None else 'JOIN' using_str = '' if grouping_cols is None else 'USING (%s)' % grouping_cols @@ -50,6 +84,13 @@ def linregr_train(schema_madlib, source_table, out_table, independent_varname=independent_varname, source_table=source_table)) + # USING Clause does not support expressions. So modifying to the regular + # join clause instead. + join_clause = '' if grouping_cols is None else " ON " + ' AND '.join("{source_table}.{col} = {temp_lin_rst}.\"" + .format(source_table=source_table, col=col, + temp_lin_rst=temp_lin_rst) + strip_end_quotes(col) + "\"" + for col in group_col_list) + # Run heteroskedasticity test if heteroskedasticity_option: temp_hsk_rst = unique_string() @@ -64,15 +105,15 @@ def linregr_train(schema_madlib, source_table, out_table, {independent_varname}, {temp_lin_rst}.coef) AS hsk_rst FROM - {source_table} {join_str} {temp_lin_rst} {using_str} - {group_str} + {source_table} {join_str} {temp_lin_rst} {join_clause} + {group_str} """.format(schema_madlib=schema_madlib, temp_hsk_rst=temp_hsk_rst, dependent_varname=dependent_varname, independent_varname=independent_varname, group_str_sel=group_str_sel, group_str=group_str, - join_str=join_str, using_str=using_str, - source_table=source_table, temp_lin_rst=temp_lin_rst)) + join_str=join_str, source_table=source_table, + temp_lin_rst=temp_lin_rst, join_clause=join_clause)) # Output the results join_str = '' @@ -80,17 +121,19 @@ def linregr_train(schema_madlib, source_table, out_table, if heteroskedasticity_option: if grouping_cols is not None: join_str = 'JOIN %s AS hsk' % temp_hsk_rst - using_str = 'USING (%s)' % (grouping_cols) + using_str = 'USING (%s)' % group_cols_w_quotes else: join_str = ', %s AS hsk' % temp_hsk_rst bp_stats = '(hsk.hsk_rst).bp_stats,' if heteroskedasticity_option else '' bp_p_value = '(hsk.hsk_rst).bp_p_value,' if heteroskedasticity_option else '' + group_cols_w_quotes = '' if grouping_cols is None else group_cols_w_quotes + "," + plpy.execute( """ CREATE TABLE {out_table} AS SELECT - {group_str_sel} + {group_cols_w_quotes} coef, r2, std_err, @@ -109,10 +152,11 @@ def linregr_train(schema_madlib, source_table, out_table, END AS num_missing_rows_skipped, vcov as variance_covariance FROM - {temp_lin_rst} AS lin {join_str} {using_str} - """.format(out_table=out_table, group_str_sel=group_str_sel, + {temp_lin_rst} AS lin {join_str} {using_str} + """.format(out_table=out_table, bp_stats=bp_stats, bp_p_value=bp_p_value, - temp_lin_rst=temp_lin_rst, join_str=join_str, using_str=using_str)) + temp_lin_rst=temp_lin_rst, join_str=join_str, using_str=using_str, + group_cols_w_quotes=group_cols_w_quotes)) num_rows = plpy.execute( """ @@ -134,8 +178,8 @@ def linregr_train(schema_madlib, source_table, out_table, 'linregr'::varchar as method , '{source_table}'::varchar as source_table , '{out_table}'::varchar as out_table - , '{dependent_varname}'::varchar as dependent_varname - , '{independent_varname}'::varchar as independent_varname + , $__madlib__${dependent_varname}$__madlib__$::varchar as dependent_varname + , $__madlib__${independent_varname}$__madlib__$::varchar as independent_varname , {num_rows_processed}::integer as num_rows_processed , {num_rows_skipped}::integer as num_missing_rows_skipped , {grouping_col}::text as grouping_col @@ -143,7 +187,7 @@ def linregr_train(schema_madlib, source_table, out_table, out_table_summary=out_table_summary, dependent_varname=dependent_varname, independent_varname=independent_varname, - grouping_col="'" + grouping_cols + "'" if grouping_cols else "NULL", + grouping_col="$__madlib__$"+grouping_cols+"$__madlib__$" if grouping_cols else "NULL", **num_rows)) return None # ---------------------------------------------------------------------- @@ -185,10 +229,9 @@ def _validate_args(schema_madlib, source_table, out_table, dependent_varname, if grouping_cols is not None: _assert(grouping_cols != '', "Linregr error: Invalid grouping columns name!") + # grouping columns can be a valid expression as well, for eg. + # a json expression (data->>'id'), so commenting this part. grouping_list = _string_to_array_with_quotes(grouping_cols) - _assert(columns_exist_in_table( - source_table, grouping_list, schema_madlib), - "Linregr error: Grouping column does not exist!") predefined = set(('coef', 'r2', 'std_err', 't_stats', 'p_values', 'condition_no', @@ -278,6 +321,7 @@ def linregr_help_message(schema_madlib, message, **kwargs): return help_string.format(schema_madlib=schema_madlib) + def linregr_predict_help_message(schema_madlib, message, **kwargs): """ Help message for Prediction in Linear Regression diff --git a/src/ports/postgres/modules/regress/test/linear.sql_in b/src/ports/postgres/modules/regress/test/linear.sql_in index 9013108..612914a 100644 --- a/src/ports/postgres/modules/regress/test/linear.sql_in +++ b/src/ports/postgres/modules/regress/test/linear.sql_in @@ -213,3 +213,153 @@ select linregr_train('example'); select linregr_predict(); select linregr_predict('usage'); select linregr_predict('example'); + + + + + +------------------------------------------------------------------------ + + +-- Test case for JSON Data Type and Special characters. +-- This function checks special characters for GPDB version = 4.x and JSON for 5.x and above + +create or replace function linereg_expr_test() RETURNS VOID AS +$$ +DECLARE col_type TEXT; + +begin +select typname into col_type from pg_type where typname = 'json' ; +if col_type = 'json' THEN + +DROP TABLE IF EXISTS houses_json; +CREATE TABLE houses_json ( + id SERIAL NOT NULL, + data json +); + + +INSERT INTO houses_json VALUES +( 2 ,'{ "ta,x": 590, "bedroom":2, "ba$th":1, "pr''ice": 50000, "size": 770, "lot":22100 }'), +( 4 ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2, "pr''ice": 85000, "size":1410, "lot":12000 }'), +( 1 ,'{ "ta,x": 20, "bedroom":3, "ba$th":1, "pr''ice": 22500, "size":1060, "lot":3500 }'), +( 6 ,'{ "ta,x": 870, "bedroom":2, "ba$th":2, "pr''ice": 90000, "size":1300, "lot":17500 }'), +( 3 ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2, "pr''ice": 133000, "size":1500, "lot":30000 }'), +( 5 ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1, "pr''ice": 90500, "size": 820, "lot":25700 }'), +( 7 ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000, "size":2130, "lot":25000 }'), +( 10 ,'{ "ta,x": 680, "bedroom":2, "ba$th":1, "pr''ice": 142500, "size":1170, "lot":22000 }'), +( 9 ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2, "pr''ice": 160000, "size":1500, "lot":19000 }'), +( 8 ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2, "pr''ice": 240000, "size":2790, "lot":20000 }'), +( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1, "pr''ice": 87000, "size":1030, "lot":17500 }'), +( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2, "pr''ice": 118600, "size":1250, "lot":20000 }'), +( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2, "pr''ice": 140000, "size":1760, "lot":38000 }'), +( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3, "pr''ice": 148000, "size":1550, "lot":14000 }'), +( 15 ,'{ "ta,x": 650, "bedroom":3, "ba$th":1.5, "pr''ice": 65000, "size":1450, "lot":12000 }'); + +drop table if exists result_lin_houses_json; +drop table if exists result_lin_houses_json_summary; +PERFORM linregr_train('houses_json', 'result_lin_houses_json', '(data->>''pr''''ice'')::integer', + 'array[1, (data->>''ta,x'')::integer, (data->>''ba$th'')::double precision, (data->>''size'')::integer]', + 'data->>''bedroom''', True); +PERFORM assert( + relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and + relative_error(r2, 0.9688) < 1e-2 and + relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and + relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and + relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and + relative_error(condition_no, 10086.1) < 1e-2 and + relative_error(bp_stats, 2.5451) < 1e-2 and + relative_error(bp_p_value, 0.467192) < 1e-2, + 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' +) +from result_lin_houses_json +where "data->>'bedroom'"::integer = 2; + +else + + + +DROP TABLE IF EXISTS houses_spcl; +CREATE TABLE houses_spcl ( + id SERIAL NOT NULL, + "ta,x" INTEGER, + "bed,room" REAL, + "ba$th" REAL, + "pr'ice" INTEGER, + size INTEGER, + lot INTEGER +); + +INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot) VALUES +( 590, 2, 1, 50000, 770, 22100), +(1050, 3, 2, 85000, 1410, 12000), +( 20, 3, 1, 22500, 1060, 3500 ), +( 870, 2, 2, 90000, 1300, 17500), +(1320, 3, 2, 133000, 1500, 30000), +(1350, 2, 1, 90500, 820, 25700), +(2790, 3, 2.5, 260000, 2130, 25000), +( 680, 2, 1, 142500, 1170, 22000), +(1840, 3, 2, 160000, 1500, 19000), +(3680, 4, 2, 240000, 2790, 20000), +(1660, 3, 1, 87000, 1030, 17500), +(1620, 3, 2, 118600, 1250, 20000), +(3100, 3, 2, 140000, 1760, 38000), +(2070, 2, 3, 148000, 1550, 14000), +( 650, 3, 1.5, 65000, 1450, 12000); + +drop table if exists result_lin_houses_spcl; +drop table if exists result_lin_houses_spcl_summary; +PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"', + 'array[1, "ta,x", "ba$th", size]', + '"bed,room"', True); +PERFORM assert( + relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and + relative_error(r2, 0.9688) < 1e-2 and + relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and + relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and + relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and + relative_error(condition_no, 10086.1) < 1e-2 and + relative_error(bp_stats, 2.5451) < 1e-2 and + relative_error(bp_p_value, 0.467192) < 1e-2, + 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' +) +from result_lin_houses_spcl +where result_lin_houses_spcl."bed,room" = 2; + + +end IF; + +end; +$$ LANGUAGE plpgsql; + + +select linereg_expr_test(); + + + +------------------------------------------------------------------------ + +-- Test Case for : when grouping_cols is NULL and the heteroskedasticity_option is True + + + + +drop table if exists result_lin_houses; +drop table if exists result_lin_houses_summary; +select linregr_train('houses', 'result_lin_houses', 'price', + 'array[1, tax, bath, size]', + NULL, True); + +select assert( + relative_error(coef, array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) < 1e-2 and + relative_error(r2, 0.768577580597443) < 1e-2 and + relative_error(std_err, array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) < 1e-2 and + relative_error(t_stats, array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) < 1e-2 and + relative_error(p_values, array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186]) < 1e-2 and + relative_error(condition_no, 9002.50) < 1e-2 and + relative_error(bp_stats, 1.22605243985138) < 1e-2 and + relative_error(bp_p_value, 0.746762880478034) < 1e-2, + 'Linear regression with heteroskedasticity with no grouping: Wrong results' +) +from result_lin_houses; +