[madlib] branch master updated: Linear Regression: Support for JSON and special characters

okislal Tue, 22 Jan 2019 01:55:32 -0800

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git



The following commit(s) were added to refs/heads/master by this push:
     new afd408d  Linear Regression: Support for JSON and special characters
afd408d is described below

commit afd408d0f73cb849bbbdc651642d8aa39e51ffcf
Author: Orhan Kislal <okis...@apache.org>
AuthorDate: Tue Jan 22 12:50:54 2019 +0300

    Linear Regression: Support for JSON and special characters
    
    JIRA: MADLIB-1284
    
    - Modified code in linear.py_in to support special characters and JSON data 
type
    - Written test case for JSON and Speacial characters
    - Test case checks for JSON and special characters in GPDB 5.x and Special 
characters in GPDB 4.x
    
    Co-authored-by: Orhan Kislal <okis...@apache.org>
    
    Closes #343
---
 src/ports/postgres/modules/regress/linear.py_in    |  78 ++++++++---
 .../postgres/modules/regress/test/linear.sql_in    | 150 +++++++++++++++++++++
 2 files changed, 211 insertions(+), 17 deletions(-)

diff --git a/src/ports/postgres/modules/regress/linear.py_in 
b/src/ports/postgres/modules/regress/linear.py_in
index 70fe78a..2be78e9 100644
--- a/src/ports/postgres/modules/regress/linear.py_in
+++ b/src/ports/postgres/modules/regress/linear.py_in
@@ -11,19 +11,53 @@ from utilities.validate_args import table_is_empty
 from utilities.utilities import add_postfix
 from utilities.utilities import _assert
 from utilities.control import MinWarning
+from utilities.utilities import get_table_qualified_col_str
+from utilities.utilities import strip_end_quotes
+from utilities.utilities import _string_to_array
+from utilities.validate_args import get_expr_type
+from utilities.utilities import _string_to_sql_array
+from utilities.validate_args import quote_ident
+
+
 # ----------------------------------------------------------------------
 
 
 def linregr_train(schema_madlib, source_table, out_table,
                   dependent_varname, independent_varname, grouping_cols,
                   heteroskedasticity_option, **kwargs):
+    """
+        Args:
+        @param source_table,                -- name of input table
+        @param out_table,                   -- name of output table
+        @param dependent_varname,           -- name of dependent variable
+        @param independent_varname,         -- name of independent variables
+        @param grouping_cols,               -- names of columns to group-by
+        @param heteroskedasticity_option    -- perform heteroskedasticity test?
+
+    """
+
     with MinWarning('warning'):
         _validate_args(schema_madlib, source_table, out_table,
                        dependent_varname, independent_varname,
                        grouping_cols, heteroskedasticity_option)
 
-        group_str = '' if grouping_cols is None else 'GROUP BY %s' % 
grouping_cols
-        group_str_sel = '' if grouping_cols is None else grouping_cols + ','
+        group_col_list = ''if grouping_cols is None else 
_string_to_array_with_quotes(
+            grouping_cols)
+
+        group_str = '' if grouping_cols is None else 'GROUP BY %s' % 
get_table_qualified_col_str(source_table,
+                                                                               
                  group_col_list)
+        # For json expressions like data->>'ID', this will quote them to
+        # "data->>'ID'"
+        cols_wo_quotes = ''if grouping_cols is None else ' '.join(
+            strip_end_quotes(col).format(**locals()) for col in group_col_list)
+        group_cols_w_quotes = ''if grouping_cols is None else ' ,'.join(
+            " \"" + strip_end_quotes(col) + "\" ".format(**locals()) for col 
in group_col_list)
+
+        # For json expressions like data->>'ID' it will alias the columns to
+        # "data->>'ID'"
+        group_str_sel = ''if grouping_cols is None else ' , 
'.join("{source_table}.{col} as \"".format(source_table=source_table, col=col) 
+ strip_end_quotes(col) + "\" "
+                                                                   for col in 
group_col_list) + ","
+
         join_str = ',' if grouping_cols is None else 'JOIN'
         using_str = '' if grouping_cols is None else 'USING (%s)' % 
grouping_cols
 
@@ -50,6 +84,13 @@ def linregr_train(schema_madlib, source_table, out_table,
                        independent_varname=independent_varname,
                        source_table=source_table))
 
+        # USING Clause does not support expressions. So modifying to the 
regular
+        # join clause instead.
+        join_clause = '' if grouping_cols is None else " ON " + ' AND 
'.join("{source_table}.{col} = {temp_lin_rst}.\""
+                                                                             
.format(source_table=source_table, col=col,
+                                                                               
      temp_lin_rst=temp_lin_rst) + strip_end_quotes(col) + "\""
+                                                                             
for col in group_col_list)
+
         # Run heteroskedasticity test
         if heteroskedasticity_option:
             temp_hsk_rst = unique_string()
@@ -64,15 +105,15 @@ def linregr_train(schema_madlib, source_table, out_table,
                         {independent_varname},
                         {temp_lin_rst}.coef) AS hsk_rst
                 FROM
-                    {source_table} {join_str} {temp_lin_rst} {using_str}
-                {group_str}
+                    {source_table} {join_str} {temp_lin_rst} {join_clause}
+               {group_str}
                 """.format(schema_madlib=schema_madlib,
                            temp_hsk_rst=temp_hsk_rst,
                            dependent_varname=dependent_varname,
                            independent_varname=independent_varname,
                            group_str_sel=group_str_sel, group_str=group_str,
-                           join_str=join_str, using_str=using_str,
-                           source_table=source_table, 
temp_lin_rst=temp_lin_rst))
+                           join_str=join_str, source_table=source_table,
+                           temp_lin_rst=temp_lin_rst, join_clause=join_clause))
 
         # Output the results
         join_str = ''
@@ -80,17 +121,19 @@ def linregr_train(schema_madlib, source_table, out_table,
         if heteroskedasticity_option:
             if grouping_cols is not None:
                 join_str = 'JOIN %s AS hsk' % temp_hsk_rst
-                using_str = 'USING (%s)' % (grouping_cols)
+                using_str = 'USING (%s)' % group_cols_w_quotes
             else:
                 join_str = ', %s AS hsk' % temp_hsk_rst
         bp_stats = '(hsk.hsk_rst).bp_stats,' if heteroskedasticity_option else 
''
         bp_p_value = '(hsk.hsk_rst).bp_p_value,' if heteroskedasticity_option 
else ''
 
+        group_cols_w_quotes = '' if grouping_cols is None else 
group_cols_w_quotes + ","
+
         plpy.execute(
             """
             CREATE TABLE {out_table} AS
             SELECT
-                {group_str_sel}
+               {group_cols_w_quotes}
                 coef,
                 r2,
                 std_err,
@@ -109,10 +152,11 @@ def linregr_train(schema_madlib, source_table, out_table,
                 END AS num_missing_rows_skipped,
                 vcov as variance_covariance
             FROM
-                {temp_lin_rst} AS lin {join_str} {using_str}
-           """.format(out_table=out_table, group_str_sel=group_str_sel,
+                {temp_lin_rst} AS lin {join_str}  {using_str}
+           """.format(out_table=out_table,
                       bp_stats=bp_stats, bp_p_value=bp_p_value,
-                      temp_lin_rst=temp_lin_rst, join_str=join_str, 
using_str=using_str))
+                      temp_lin_rst=temp_lin_rst, join_str=join_str, 
using_str=using_str,
+                      group_cols_w_quotes=group_cols_w_quotes))
 
         num_rows = plpy.execute(
             """
@@ -134,8 +178,8 @@ def linregr_train(schema_madlib, source_table, out_table,
                       'linregr'::varchar                  as method
                     , '{source_table}'::varchar           as source_table
                     , '{out_table}'::varchar              as out_table
-                    , '{dependent_varname}'::varchar      as dependent_varname
-                    , '{independent_varname}'::varchar    as 
independent_varname
+                    , $__madlib__${dependent_varname}$__madlib__$::varchar     
 as dependent_varname
+                    , $__madlib__${independent_varname}$__madlib__$::varchar   
 as independent_varname
                     , {num_rows_processed}::integer       as num_rows_processed
                     , {num_rows_skipped}::integer         as 
num_missing_rows_skipped
                     , {grouping_col}::text                as grouping_col
@@ -143,7 +187,7 @@ def linregr_train(schema_madlib, source_table, out_table,
                        out_table_summary=out_table_summary,
                        dependent_varname=dependent_varname,
                        independent_varname=independent_varname,
-                       grouping_col="'" + grouping_cols + "'" if grouping_cols 
else "NULL",
+                       
grouping_col="$__madlib__$"+grouping_cols+"$__madlib__$" if grouping_cols else 
"NULL",
                        **num_rows))
     return None
 # ----------------------------------------------------------------------
@@ -185,10 +229,9 @@ def _validate_args(schema_madlib, source_table, out_table, 
dependent_varname,
     if grouping_cols is not None:
         _assert(grouping_cols != '',
                 "Linregr error: Invalid grouping columns name!")
+        # grouping columns can be a valid expression as well, for eg.
+        # a json expression (data->>'id'), so commenting this part.
         grouping_list = _string_to_array_with_quotes(grouping_cols)
-        _assert(columns_exist_in_table(
-            source_table, grouping_list, schema_madlib),
-            "Linregr error: Grouping column does not exist!")
 
         predefined = set(('coef', 'r2', 'std_err', 't_stats',
                           'p_values', 'condition_no',
@@ -278,6 +321,7 @@ def linregr_help_message(schema_madlib, message, **kwargs):
 
     return help_string.format(schema_madlib=schema_madlib)
 
+
 def linregr_predict_help_message(schema_madlib, message, **kwargs):
     """ Help message for Prediction in Linear Regression
 
diff --git a/src/ports/postgres/modules/regress/test/linear.sql_in 
b/src/ports/postgres/modules/regress/test/linear.sql_in
index 9013108..612914a 100644
--- a/src/ports/postgres/modules/regress/test/linear.sql_in
+++ b/src/ports/postgres/modules/regress/test/linear.sql_in
@@ -213,3 +213,153 @@ select linregr_train('example');
 select linregr_predict();
 select linregr_predict('usage');
 select linregr_predict('example');
+
+
+
+
+
+------------------------------------------------------------------------
+
+
+-- Test case for JSON Data Type and Special characters. 
+-- This function checks special characters for GPDB version = 4.x and JSON for 
5.x and above
+
+create or replace function linereg_expr_test() RETURNS VOID AS
+$$ 
+DECLARE col_type TEXT;
+
+begin
+select typname into col_type from pg_type where typname = 'json' ;
+if col_type = 'json' THEN
+
+DROP TABLE IF EXISTS houses_json;
+CREATE TABLE houses_json (
+    id SERIAL NOT NULL,
+    data json
+);
+
+
+INSERT INTO houses_json VALUES
+( 2  ,'{ "ta,x":  590, "bedroom":2, "ba$th":1,   "pr''ice":  50000, "size": 
770, "lot":22100 }'),
+( 4  ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2,   "pr''ice":  85000, 
"size":1410, "lot":12000 }'),
+( 1  ,'{ "ta,x":   20, "bedroom":3, "ba$th":1,   "pr''ice":  22500, 
"size":1060, "lot":3500  }'),
+( 6  ,'{ "ta,x":  870, "bedroom":2, "ba$th":2,   "pr''ice":  90000, 
"size":1300, "lot":17500 }'),
+( 3  ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2,   "pr''ice": 133000, 
"size":1500, "lot":30000 }'),
+( 5  ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1,   "pr''ice":  90500, "size": 
820, "lot":25700 }'),
+( 7  ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000, 
"size":2130, "lot":25000 }'),
+( 10 ,'{ "ta,x":  680, "bedroom":2, "ba$th":1,   "pr''ice": 142500, 
"size":1170, "lot":22000 }'),
+( 9  ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2,   "pr''ice": 160000, 
"size":1500, "lot":19000 }'),
+( 8  ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2,   "pr''ice": 240000, 
"size":2790, "lot":20000 }'),
+( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1,   "pr''ice":  87000, 
"size":1030, "lot":17500 }'),
+( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2,   "pr''ice": 118600, 
"size":1250, "lot":20000 }'),
+( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2,   "pr''ice": 140000, 
"size":1760, "lot":38000 }'),
+( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3,   "pr''ice": 148000, 
"size":1550, "lot":14000 }'),
+( 15 ,'{ "ta,x":  650, "bedroom":3, "ba$th":1.5, "pr''ice":  65000, 
"size":1450, "lot":12000 }');
+
+drop table if exists result_lin_houses_json;
+drop table if exists result_lin_houses_json_summary;
+PERFORM linregr_train('houses_json', 'result_lin_houses_json', 
'(data->>''pr''''ice'')::integer',
+                    'array[1, (data->>''ta,x'')::integer, 
(data->>''ba$th'')::double precision, (data->>''size'')::integer]',
+                    'data->>''bedroom''', True);
+PERFORM assert(
+    relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 
1e-2 and
+    relative_error(r2, 0.9688) < 1e-2 and
+    relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
+    relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
+    relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
+    relative_error(condition_no, 10086.1) < 1e-2 and
+    relative_error(bp_stats, 2.5451) < 1e-2 and
+    relative_error(bp_p_value, 0.467192) < 1e-2,
+    'Linear regression with heteroskedasticity & grouping (houses): Wrong 
results'
+)
+from result_lin_houses_json
+where "data->>'bedroom'"::integer = 2;
+
+else 
+
+
+
+DROP TABLE IF EXISTS houses_spcl;
+CREATE TABLE houses_spcl (
+    id SERIAL NOT NULL,
+    "ta,x" INTEGER,
+    "bed,room" REAL,
+    "ba$th" REAL,
+    "pr'ice" INTEGER,
+    size INTEGER,
+    lot INTEGER
+);
+
+INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot) 
VALUES
+( 590, 2, 1,    50000,  770, 22100),
+(1050, 3, 2,    85000, 1410, 12000),
+(  20, 3, 1,    22500, 1060, 3500 ),
+( 870, 2, 2,    90000, 1300, 17500),
+(1320, 3, 2,   133000, 1500, 30000),
+(1350, 2, 1,    90500,  820, 25700),
+(2790, 3, 2.5, 260000, 2130, 25000),
+( 680, 2, 1,   142500, 1170, 22000),
+(1840, 3, 2,   160000, 1500, 19000),
+(3680, 4, 2,   240000, 2790, 20000),
+(1660, 3, 1,    87000, 1030, 17500),
+(1620, 3, 2,   118600, 1250, 20000),
+(3100, 3, 2,   140000, 1760, 38000),
+(2070, 2, 3,   148000, 1550, 14000),
+( 650, 3, 1.5,  65000, 1450, 12000);
+
+drop table if exists result_lin_houses_spcl;
+drop table if exists result_lin_houses_spcl_summary;
+PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"',
+                    'array[1, "ta,x", "ba$th", size]',
+                    '"bed,room"', True);
+PERFORM assert(
+    relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 
1e-2 and
+    relative_error(r2, 0.9688) < 1e-2 and
+    relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
+    relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
+    relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
+    relative_error(condition_no, 10086.1) < 1e-2 and
+    relative_error(bp_stats, 2.5451) < 1e-2 and
+    relative_error(bp_p_value, 0.467192) < 1e-2,
+    'Linear regression with heteroskedasticity & grouping (houses): Wrong 
results'
+)
+from result_lin_houses_spcl
+where result_lin_houses_spcl."bed,room" = 2;
+
+
+end IF;
+
+end;
+$$ LANGUAGE plpgsql;
+
+
+select linereg_expr_test();
+
+
+
+------------------------------------------------------------------------
+
+-- Test Case for : when grouping_cols is NULL and the 
heteroskedasticity_option is True
+
+
+
+
+drop table if exists result_lin_houses;
+drop table if exists result_lin_houses_summary;
+select linregr_train('houses', 'result_lin_houses', 'price',
+                    'array[1, tax, bath, size]',
+                    NULL, True);
+
+select assert(
+    relative_error(coef, 
array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) < 
1e-2 and
+    relative_error(r2, 0.768577580597443) < 1e-2 and
+    relative_error(std_err, 
array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) < 
1e-2 and
+    relative_error(t_stats, 
array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) < 
1e-2 and
+    relative_error(p_values, 
array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186])
 < 1e-2 and
+    relative_error(condition_no, 9002.50) < 1e-2 and
+    relative_error(bp_stats, 1.22605243985138) < 1e-2 and
+    relative_error(bp_p_value, 0.746762880478034) < 1e-2,
+    'Linear regression with heteroskedasticity with no grouping: Wrong results'
+)
+from result_lin_houses;
+

[madlib] branch master updated: Linear Regression: Support for JSON and special characters

Reply via email to