Utilities: Add module transform_vec_cols for column-vector conversion JIRA: MADLIB-1240
This commit adds a new SQL function called vec2cols and refactors the current function cols2vec, providing greater integration between the two modules. We now have a single Python file with separate classes for each feature. We also have unified unit-tests and dev-check/install-check tests. The vec2cols function enables users to split up a single column into multiple columns, given that the input column contains array entries. For example, if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table will contain 3 different columns, one for each element of the array. Co-authored-by: Nandish Jayaram <njaya...@apache.org> Co-authored-by: Rahul Iyer <ri...@apache.org> Co-authored-by: Nikhil Kak <n...@pivotal.io> Co-authored-by: Orhan Kislal <okis...@pivotal.io> Co-authored-by: Frank McQuillan <fmcquil...@pivotal.io> Closes #291 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/20f95b33 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/20f95b33 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/20f95b33 Branch: refs/heads/master Commit: 20f95b33bcbd05b154a566c81958091c66258858 Parents: a0cfcf8 Author: Arvind Sridhar <asrid...@pivotal.io> Authored: Wed Aug 1 11:22:27 2018 -0700 Committer: Orhan Kislal <okis...@pivotal.io> Committed: Wed Aug 1 11:22:27 2018 -0700 ---------------------------------------------------------------------- doc/mainpage.dox.in | 1 + .../postgres/modules/internal/db_utils.py_in | 9 + .../postgres/modules/utilities/cols2vec.py_in | 128 ----- .../postgres/modules/utilities/cols2vec.sql_in | 345 ++++++++++--- .../modules/utilities/test/cols2vec.sql_in | 91 ---- .../utilities/test/transform_vec_cols.ic.sql_in | 68 +++ .../utilities/test/transform_vec_cols.sql_in | 470 ++++++++++++++++++ .../unit_tests/test_transform_vec_cols.py_in | 226 +++++++++ .../modules/utilities/transform_vec_cols.py_in | 496 +++++++++++++++++++ .../postgres/modules/utilities/utilities.py_in | 14 +- .../postgres/modules/utilities/vec2cols.sql_in | 348 +++++++++++++ 11 files changed, 1908 insertions(+), 288 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/doc/mainpage.dox.in ---------------------------------------------------------------------- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index 8f97491..d174ab7 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -276,6 +276,7 @@ complete matrix stored as a distributed table. @defgroup grp_minibatch_preprocessing Mini-Batch Preprocessor @defgroup grp_pmml PMML Export @defgroup grp_text_utilities Term Frequency + @defgroup grp_vec2cols Vector to Columns @} @defgroup grp_early_stage Early Stage Development http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/internal/db_utils.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/internal/db_utils.py_in b/src/ports/postgres/modules/internal/db_utils.py_in index c75babf..45477ef 100644 --- a/src/ports/postgres/modules/internal/db_utils.py_in +++ b/src/ports/postgres/modules/internal/db_utils.py_in @@ -79,3 +79,12 @@ def quote_literal(input_str): return "{qd}{input_str}{qd}".format(qd=QUOTE_DELIMITER, input_str=input_str) # ------------------------------------------------------------------------------ + +def is_col_1d_array(source_table, col_name): + query = """ + SELECT array_upper({0}, 2) IS NULL AS n_y + FROM {1} + LIMIT 1 + """.format(col_name, source_table) + result = plpy.execute(query) + return result[0]["n_y"] http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/cols2vec.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in b/src/ports/postgres/modules/utilities/cols2vec.py_in deleted file mode 100644 index 4f2b1c9..0000000 --- a/src/ports/postgres/modules/utilities/cols2vec.py_in +++ /dev/null @@ -1,128 +0,0 @@ -# coding=utf-8 -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -@file cols2vec.py_in - -@brief Utility to convert Columns to array - -""" - -import plpy -from control import MinWarning -from internal.db_utils import quote_literal -from utilities import split_quoted_delimited_str -from utilities import _string_to_array -from utilities import _assert -from utilities import add_postfix -from validate_args import columns_exist_in_table -from validate_args import is_var_valid -from validate_args import get_cols -from validate_args import quote_ident -from utilities import py_list_to_sql_string - - -m4_changequote(`<!', `!>') - - -def validate_cols2vec_args(source_table, output_table, - list_of_features, list_of_features_to_exclude, cols_to_output, **kwargs): - """ - Function to validate input parameters - """ - if list_of_features.strip() != '*': - if not (list_of_features and list_of_features.strip()): - plpy.error("Features to include is empty") - _assert( - columns_exist_in_table( - source_table, split_quoted_delimited_str(list_of_features)), - "Invalid columns in list_of_features {0}".format(list_of_features)) - - if cols_to_output and cols_to_output.strip() != '*': - _assert( - columns_exist_in_table( - source_table, split_quoted_delimited_str(cols_to_output)), - "Invalid columns to output list {0}".format(cols_to_output)) - - -def cols2vec(schema_madlib, source_table, output_table, list_of_features, - list_of_features_to_exclude=None, cols_to_output=None, **kwargs): - """ - Args: - @param schema_madlib: Name of MADlib schema - @param model: Name of table containing the tree model - @param source_table: Name of table containing prediction data - @param output_table: Name of table to output the results - @param list_of_features: Comma-separated string of column names or - expressions to put into feature array. - Can also be a '*' implying all columns - are to be put into feature array. - @param list_of_features_to_exclude: Comma-separated string of column names - to exclude from the feature array - @param cols_to_output: Comma-separated string of column names - from the source table to keep in the output table, - in addition to the feature array. - - Returns: - None - - """ - with MinWarning('warning'): - validate_cols2vec_args(source_table, output_table, list_of_features, - list_of_features_to_exclude, - cols_to_output, **kwargs) - - all_cols = get_cols(source_table, schema_madlib) - if list_of_features.strip() == '*': - exclude_set = set(split_quoted_delimited_str(list_of_features_to_exclude)) - feature_list = [c for c in all_cols if c not in exclude_set] - else: - feature_list = split_quoted_delimited_str(list_of_features) - - if cols_to_output: - additional_cols = (all_cols if cols_to_output == '*' else - split_quoted_delimited_str(cols_to_output)) - additional_cols_str = ', '.join(additional_cols) + "," - else: - additional_cols_str = '' - - feature_list_str = py_list_to_sql_string(feature_list, "TEXT[]", True) - plpy.execute(""" - CREATE TABLE {output_table} AS - SELECT {additional_cols_str} - {feature_list_str} AS feature_vector - FROM {source_table} - """.format(**locals())) - - feature_cols = py_list_to_sql_string( - [quote_literal(f) for f in feature_list], "TEXT", True) - - output_table_summary = add_postfix(output_table, "_summary") - # Dollar-quote the text to allow single-quotes without escaping - dq = "$__MADLIB_OUTER__$" - feature_exclude_str = ("NULL" if not list_of_features_to_exclude else - list_of_features_to_exclude) - plpy.execute(""" - CREATE TABLE {output_table_summary} AS - SELECT - {dq}{source_table}{dq}::TEXT AS source_table, - {dq}{list_of_features}{dq}::TEXT AS list_of_features, - {dq}{feature_exclude_str}{dq}::TEXT AS list_of_features_to_exclude, - {feature_cols} AS feature_names - """.format(**locals())) http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/cols2vec.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in b/src/ports/postgres/modules/utilities/cols2vec.sql_in index a70a953..82a1f94 100644 --- a/src/ports/postgres/modules/utilities/cols2vec.sql_in +++ b/src/ports/postgres/modules/utilities/cols2vec.sql_in @@ -28,7 +28,6 @@ m4_include(`SQLCommon.m4') - /** @addtogroup grp_cols2vec @@ -38,7 +37,7 @@ m4_include(`SQLCommon.m4') <ul> <li class="level1"><a href="#cols2vec_syntax">Syntax</a> <li class="level1"><a href="#cols2vec_usage">Usage</a> -<li class="level1"><a href="#cols2vec_example">Example</a> +<li class="level1"><a href="#cols2vec_example">Examples</a> </ul> </div> @@ -66,7 +65,7 @@ cols2vec( \b Arguments <dl class="arglist"> <dt>source_table</dt> -<dd>TEXT. Name of the table containing the source data.</dd>. +<dd>TEXT. Name of the table containing the source data.</dd> <dt>output_table</dt> <dd>TEXT. Name of the generated table containing the output.</dd> @@ -74,107 +73,319 @@ cols2vec( <dt>list_of_features</dt> <dd>TEXT. Comma-separated string of column names or expressions to put into feature array. -Can also be a '*' implying all columns are to be put into feature array (except -for the ones included in the next argument that lists exclusions). Array columns -in the source table are not supported in the 'list_of_features' parameter. </dd> +Can also be '*' implying all columns are to be put into feature array (except +for the ones included in the next argument that lists exclusions). Type casting +will be done as per the regular type casting rules of the underlying database. +Array columns in the source table are not supported in the 'list_of_features' parameter. Also, +all of the features to be included must be of the same type and must not have null +values.</dd> <dt>list_of_features_to_exclude (optional)</dt> <dd>TEXT. Default NULL. -Comma-separated string of column names to exclude from the feature array. Use -only when 'list_of_features' is '*'. </dd> +Comma-separated string of column names to exclude from the feature array. +Typically used when 'list_of_features' is set to '*'.</dd> <dt>cols_to_output (optional)</dt> <dd>TEXT. Default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature array. To keep all columns from the source table, use '*' for this parameter. </dd> - </dl> +<b>Output table</b> +<br> + The output table produced by the cols2vec function contains the following columns: + <table class="output"> + <tr> + <th><...></th> + <td>Columns from source table, depending on which ones are kept (if any). + </td> + </tr> + <tr> + <th>feature_vector</th> + <td>Column that contains the feature array. + </tr> + </table> + +<b>Output summary table</b> +<br> + A summary table named <em><output_table>_summary</em> is also created + that contains: + <table class="output"> + <tr> + <th>source_table</th> + <td>Name of the table containing the source data.</td> + </tr> + <tr> + <th>list_of_features</th> + <td>List of features to put in vector.</td> + </tr> + <tr> + <th>list_of_features_to_exclude</th> + <td>Features specified by the user to exclude from 'list_of_features'.</td> + </tr> + <tr> + <th>feature_names</th> + <td>Names of the features that were nested (converted to a vector) in the output table.</td> + </tr> + </table> + @anchor cols2vec_example @par Examples -# Load sample data: <pre class="example"> -DROP TABLE IF EXISTS cols2vec; -CREATE TABLE cols2vec ( - id bigint, - label int, - feat1 int, - feat2 int, - feat3 float, - other_col float +DROP TABLE IF EXISTS golf CASCADE; +CREATE TABLE golf ( + id integer NOT NULL, + "OUTLOOK" text, + temperature double precision, + humidity double precision, + "Temp_Humidity" double precision[], + clouds_airquality text[], + windy boolean, + class text, + observation_weight double precision +); +INSERT INTO golf VALUES +(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), +(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), +(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), +(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), +(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), +(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); +</pre> + +-# Run cols2vec to combine the temperature and humidity columns into a single array feature. +<pre class="example"> +DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; +SELECT madlib.cols2vec( + 'golf', + 'cols2vec_result', + 'temperature, humidity' ); -INSERT INTO cols2vec VALUES -(1, 0, 1, 1, 0.5, 0.9), -(2, 1, 0, 1, 0.3, 0.3), -(3, 0, 0, 0, 0.1, 1.1), -(4, 1, 1, 0, 0.9, 0.4); +SELECT * FROM cols2vec_result; +</pre> +<pre class="result"> + feature_vector +----------------+ + {85,85} + {80,90} + {83,78} + {70,96} + {68,80} + {65,70} + {64,65} + {72,95} + {69,70} + {75,80} + {75,70} + {72,90} + {81,75} + {71,80} +(14 rows) +</pre> +View the summary table: +<pre class="example"> +\\x on +SELECT * FROM cols2vec_result_summary; +\\x off +</pre> +<pre class="result"> +-[ RECORD 1 ]---------------+---------------------------------------------------------------- +source_table | golf +list_of_features | temperature, humidity +list_of_features_to_exclude | None +feature_names | {temperature,humidity} </pre> --# Create feature array: +-# Combine the temperature and humidity columns +and keep 2 other columns from source_table. <pre class="example"> DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; SELECT madlib.cols2vec( - 'cols2vec', -- input table - 'cols2vec_result', -- output table - 'feat1,feat2,feat3', -- list of features - NULL, -- features to exclude - 'id, label ' -- columns from input table to be included in output + 'golf', + 'cols2vec_result', + 'temperature, humidity', + NULL, + 'id, "OUTLOOK"' ); SELECT * FROM cols2vec_result ORDER BY id; </pre> <pre class="result"> - id | label | feature_vector -----+-------+---------------- - 1 | 0 | {1,1,0.5} - 2 | 1 | {0,1,0.3} - 3 | 0 | {0,0,0.1} - 4 | 1 | {1,0,0.9} -(4 rows) + id | OUTLOOK | feature_vector +----+----------+---------------- + 1 | sunny | {85,85} + 2 | sunny | {80,90} + 3 | overcast | {83,78} + 4 | rain | {70,96} + 5 | rain | {68,80} + 6 | rain | {65,70} + 7 | overcast | {64,65} + 8 | sunny | {72,95} + 9 | sunny | {69,70} + 10 | rain | {75,80} + 11 | sunny | {75,70} + 12 | overcast | {72,90} + 13 | overcast | {81,75} + 14 | rain | {71,80} +(14 rows) </pre> -View summary table: +View the summary table: <pre class="example"> +\\x on SELECT * FROM cols2vec_result_summary; +\\x off </pre> <pre class="result"> - source_table | list_of_features | list_of_features_to_exclude | feature_names ---------------+-------------------+-----------------------------+--------------------- - cols2vec | feat1,feat2,feat3 | None | {feat1,feat2,feat3} +-[ RECORD 1 ]---------------+---------------------------------------------------------------- +source_table | golf +list_of_features | temperature, humidity +list_of_features_to_exclude | None +feature_names | {temperature,humidity} </pre> --# The above result could be obtained in a similar way using the 'features_to_exclude' parameter: +-# Combine all columns, excluding all columns that are not of type double precision. <pre class="example"> DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; SELECT madlib.cols2vec( - 'cols2vec', -- input table - 'cols2vec_result', -- output table - '*', -- list of features - 'id, label, other_col', -- features to exclude - 'id, label' -- columns from input table to be included in output + 'golf', + 'cols2vec_result', + '*', + '"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id', + 'id, "OUTLOOK"' ); SELECT * FROM cols2vec_result ORDER BY id; </pre> <pre class="result"> - id | label | feature_vector -----+-------+---------------- - 1 | 0 | {1,1,0.5} - 2 | 1 | {0,1,0.3} - 3 | 0 | {0,0,0.1} - 4 | 1 | {1,0,0.9} -(4 rows) + id | OUTLOOK | feature_vector +----+----------+---------------- + 1 | sunny | {85,85,5} + 2 | sunny | {80,90,5} + 3 | overcast | {83,78,1.5} + 4 | rain | {70,96,1} + 5 | rain | {68,80,1} + 6 | rain | {65,70,1} + 7 | overcast | {64,65,1.5} + 8 | sunny | {72,95,5} + 9 | sunny | {69,70,5} + 10 | rain | {75,80,1} + 11 | sunny | {75,70,5} + 12 | overcast | {72,90,1.5} + 13 | overcast | {81,75,1.5} + 14 | rain | {71,80,1} +(14 rows) </pre> View summary table: <pre class="example"> +\\x on +SELECT * FROM cols2vec_result_summary; +\\x off +</pre> +<pre class="result"> +-[ RECORD 1 ]---------------+---------------------------------------------------------------- +source_table | golf +list_of_features | * +list_of_features_to_exclude | "OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id +feature_names | {temperature,humidity,observation_weight} +</pre> + +-# Combine the temperature and humidity columns, exclude windy, and keep all of the +columns from the source table. +<pre class="example"> +DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; +SELECT madlib.cols2vec( + 'golf', + 'cols2vec_result', + 'windy, temperature, humidity', + 'windy', + '*' +); +SELECT * FROM cols2vec_result ORDER BY id; +</pre> +<pre class="result"> + id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | feature_vector +----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+---------------- + 1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | {85,85} + 2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | {80,90} + 3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | {83,78} + 4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | {70,96} + 5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | {68,80} + 6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | {65,70} + 7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | {64,65} + 8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | {72,95} + 9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | {69,70} + 10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | {75,80} + 11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | {75,70} + 12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | {72,90} + 13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | {81,75} + 14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | {71,80} +(14 rows) +</pre> +View the summary table: +<pre class="example"> +\\x on SELECT * FROM cols2vec_result_summary; +\\x off +</pre> +<pre class="result"> +-[ RECORD 1 ]---------------+----------------------------- +source_table | golf +list_of_features | windy, temperature, humidity +list_of_features_to_exclude | windy +feature_names | {temperature,humidity} +</pre> +This also shows that you can exclude features in 'list_of_features_to_exclude' +that are in the list of 'list_of_features'. This can be useful if the 'list_of_features' +is generated from an expression or subquery. + +-# Type casting works as per regular rules of the underlying database. +E.g, combining integer and double precisions columns will create a double precision feature vector. +For Boolean, do an explicit cast to the target type: +<pre class="example"> +DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; +SELECT madlib.cols2vec( + 'golf', + 'cols2vec_result', + 'windy::TEXT, class', + NULL, + 'id' +); +SELECT * FROM cols2vec_result ORDER BY id; </pre> <pre class="result"> - source_table | list_of_features | list_of_features_to_exclude | feature_names ---------------+------------------+-----------------------------+--------------------- - cols2vec | * | id, label, other_col | {feat1,feat2,feat3} + id | feature_vector +----+---------------------- + 1 | {false,"Don't Play"} + 2 | {true,"Don't Play"} + 3 | {false,Play} + 4 | {false,Play} + 5 | {false,Play} + 6 | {true,"Don't Play"} + 7 | {true,Play} + 8 | {false,"Don't Play"} + 9 | {false,Play} + 10 | {false,Play} + 11 | {true,Play} + 12 | {true,Play} + 13 | {false,Play} + 14 | {true,"Don't Play"} +(14 rows) </pre> + */ +------------------------------------------------------------------------- +-- cols2vec +------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( source_table VARCHAR, @@ -183,10 +394,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( list_of_features_to_exclude VARCHAR, cols_to_output VARCHAR ) RETURNS void AS $$ - PythonFunction(utilities, cols2vec, cols2vec) -$$ LANGUAGE plpythonu VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); - + PythonFunctionBodyOnly(utilities, transform_vec_cols) + from utilities.control import MinWarning + with MinWarning('warning'): + cols2vec_obj = transform_vec_cols.cols2vec() + return cols2vec_obj.cols2vec(**globals()) + $$ LANGUAGE plpythonu VOLATILE + m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( source_table VARCHAR, @@ -206,3 +420,16 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL) $$ LANGUAGE SQL m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(message TEXT) +RETURNS text AS $$ + PythonFunctionBodyOnly(utilities, transform_vec_cols) + return transform_vec_cols.cols2vec().cols2vec_help_message(schema_madlib, message) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec() +RETURNS text AS $$ + SELECT MADLIB_SCHEMA.cols2vec(''); +$$ language SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/cols2vec.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in b/src/ports/postgres/modules/utilities/test/cols2vec.sql_in deleted file mode 100644 index fd22238..0000000 --- a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in +++ /dev/null @@ -1,91 +0,0 @@ -/* ----------------------------------------------------------------------- */ -/** - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * - */ -/* ----------------------------------------------------------------------- */ - -DROP TABLE IF EXISTS source_table; -CREATE TABLE source_table ( - id bigint, - label int, - feat1 int, - feat2 int, - feat3 float, - other_col float -); -INSERT INTO source_table VALUES -(1, 0, 1, 1, 0.5, 0.9), -(2, 1, 0, 1, 0.3, 0.3), -(3, 0, 0, 0, 0.1, 1.1), -(4, 1, 1, 0, 0.9, 0.4); - - -DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary; -SELECT cols2vec( 'source_table', 'cols2vec_out', 'feat1,feat2,feat3', 'id', 'id,label' ); -SELECT assert(feature_vector = '{1,1,0.5}', - 'Incorrect results for cols2vec') -FROM cols2vec_out -WHERE id = 1; - -DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary; -SELECT cols2vec( 'source_table', 'cols2vec_out', '*', 'id,other_col', 'id,label' ); - -SELECT * FROM cols2vec_out; -SELECT assert(feature_vector = '{0,1,1,0.5}','Incorrect results for cols2vec') -FROM cols2vec_out -WHERE id = 1; - - -DROP TABLE IF EXISTS cols2vec_spcl; -CREATE TABLE cols2vec_spcl ( - "i,Ð!#'d" bigint, - "lab$$''%*Ð!#''()el" int, - "fe''at1" int, - feat2 int, - "fe'%*()at3" float, - other_col float, - "se''x" TEXT -); - -INSERT INTO cols2vec_spcl VALUES -(1, 0, 1, 1, 0.5, 0.9,'M''M'), -(2, 1, 0, 1, 0.3, 0.3,'M$M'), -(3, 0, 0, 0, 0.1, 1.1,'M,M'), -(4, 1, 1, 0, 0.9, 0.4,'M@[}(:*;M'), -(5, 1, 0, 1, 0.85, 0.34,'M@[}(:*;M'), -(6, 1, 0, 1, 0.63, 0.12,'M"M'), -(7, 0, 1, 0, 0.7, 1.4,'MÐM'); - -DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary; -SELECT cols2vec( 'cols2vec_spcl', 'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"', - NULL, '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"' ); - -select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec') -FROM cols2vec_out -WHERE "i,Ð!#'d" = 1; - -DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary; - -SELECT cols2vec('cols2vec_spcl', 'cols2vec_out','*', '"se''''x"', - '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"'); -SELECT assert(feature_vector = '{1,0,1,1,0.5,0.9}', - 'Incorrect results for cols2vec') -FROM cols2vec_out -WHERE "i,Ð!#'d" = 1; http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in b/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in new file mode 100644 index 0000000..e965b22 --- /dev/null +++ b/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in @@ -0,0 +1,68 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +-- Create testing dataset + +DROP TABLE IF EXISTS dt_golf CASCADE; +CREATE TABLE dt_golf ( + id integer NOT NULL, + "OUTLOOK" text, + temperature double precision, + humidity double precision, + "Temp_Humidity" double precision[], + clouds_airquality text[], + windy boolean, + class text, + observation_weight double precision +); +INSERT INTO dt_golf VALUES +(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), +(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), +(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), +(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), +(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), +(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'dt_golf', + 'out_table', + 'clouds_airquality', + ARRAY['clouds', 'air_quality'], + '"OUTLOOK", id' +); + +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + 'windy, temperature, humidity', + 'windy', + '"OUTLOOK", id' +); http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in new file mode 100644 index 0000000..47ab299 --- /dev/null +++ b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in @@ -0,0 +1,470 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +----------------------------------------------------------------------- +-- vec2cols +----------------------------------------------------------------------- + +-- Create testing dataset + +DROP TABLE IF EXISTS dt_golf CASCADE; +CREATE TABLE dt_golf ( + id integer NOT NULL, + "OUTLOOK" text, + temperature double precision, + humidity double precision, + "Temp_Humidity" double precision[], + clouds_airquality text[], + windy boolean, + class text, + observation_weight double precision +); +INSERT INTO dt_golf VALUES +(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), +(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), +(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), +(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), +(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), +(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); + +-- Call the vec2cols function with different parameters on the 'clouds_airquality' column, to split it up + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'dt_golf', + 'out_table', + 'clouds_airquality' +); + +SELECT f1 from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 2, 'Number of split columns does not match'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'dt_golf', + 'out_table', + 'clouds_airquality', + NULL, + 'id' +); + +SELECT f1 from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 3, 'Number of split columns does not match'); +SELECT assert ((SELECT clouds_airquality[1] FROM dt_golf WHERE id = 1) = (SELECT f1 FROM out_table WHERE id = 1), 'Split values do not match up'); +SELECT assert ((SELECT clouds_airquality[2] FROM dt_golf WHERE id = 1) = (SELECT f2 FROM out_table WHERE id = 1), 'Split values do not match up'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'dt_golf', + 'out_table', + 'clouds_airquality', + ARRAY['clouds', 'air_quality'] +); + +SELECT clouds from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 2, 'Number of split columns does not match'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'dt_golf', + 'out_table', + 'clouds_airquality', + ARRAY['clouds', 'air_quality'], + '"OUTLOOK", id' +); + +SELECT clouds from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 4, 'Number of split columns does not match'); +SELECT assert ((SELECT clouds_airquality[1] FROM dt_golf WHERE id = 1) = (SELECT clouds FROM out_table WHERE id = 1), 'Split values do not match up'); +SELECT assert ((SELECT clouds_airquality[2] FROM dt_golf WHERE id = 1) = (SELECT air_quality FROM out_table WHERE id = 1), 'Split values do not match up'); + +-- Special character tests + +DROP TABLE IF EXISTS special_char_check; +CREATE TABLE special_char_check( + "id" INTEGER, + "se$$,''x" TEXT, + "len$,,$'%*()gth" TEXT[], + "rin,$$Ð!#'gs" INTEGER, + ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./'," TEXT, + "$$hi//\\*$$" INTEGER, + "**" INTEGER, + "'~d())" TEXT); + +INSERT INTO special_char_check VALUES +(1,'M''M',ARRAY['sd''f', 'ab,,c'],6,'sd''f',1,2,'ab,,c'), +(2,'''M''M''',ARRAY['sdf$$sdfk(),', '$$sdlhf$$'],6,'sdf$$sdfk(),',1,2,'$$sdlhf$$'), +(3,'M|$$M',ARRAY['%~(())Ð"', 'sdf'],6,'%~(())Ð"',1,2,'sdf'), +(4,'M,M',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'), +(5,'M@[}(:*;M',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'), +(6,'M"M',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'), +(7,'MÐM',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'special_char_check', + 'out_table', + '"len$,,$''%*()gth"', + ARRAY['",cl''oÐu,ds,"', '"air_qu""ality"'], + '*' +); + +SELECT ",cl'oÐu,ds,", "air_qu""ality", ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./',", "$$hi//\\*$$", "**", "'~d())" from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 10, 'Number of split columns does not match'); +SELECT assert ((SELECT "len$,,$'%*()gth"[1] FROM special_char_check WHERE id = 1) = (SELECT ",cl'oÐu,ds," FROM out_table WHERE id = 1), 'Split values do not match up'); +SELECT assert ((SELECT "se$$,''x" FROM special_char_check WHERE id = 6) = 'M"M', 'Incorrect entries in out_table'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'special_char_check', + 'out_table', + '"len$,,$''%*()gth"', + ARRAY['",cl''oÐu,ds,"', '"air_qu""ality"'], + $__madlib__$id, "rin,$$Ð!#'gs", ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./',", "$$hi//\\*$$", "**"$__madlib__$ +); + +SELECT ",cl'oÐu,ds,", "air_qu""ality", id, "rin,$$Ð!#'gs", ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./',", "$$hi//\\*$$", "**" from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 7, 'Number of split columns does not match'); +SELECT assert ((SELECT "len$,,$'%*()gth"[1] FROM special_char_check WHERE id = 1) = (SELECT ",cl'oÐu,ds," FROM out_table WHERE id = 1), 'Split values do not match up'); +SELECT assert ((SELECT ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./'," FROM special_char_check WHERE id = 1) = 'sd''f', 'Incorrect entries in out_table'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'special_char_check', + 'out_table', + '"len$,,$''%*()gth"', + ARRAY['",cl''oÐu,ds,"', '"air_qu""ality"'], + '"**"' +); + +SELECT "**", ",cl'oÐu,ds,", "air_qu""ality" from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 3, 'Number of split columns does not match'); + +----------------------------------------------------------------------- +-- cols2vec +----------------------------------------------------------------------- + +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + 'temperature, humidity' +); + +SELECT feature_vector from out_table; +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 1, 'Number of output table columns does not match'); + +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + 'temperature, humidity', + NULL, + 'id' +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 2, 'Number of output table columns does not match'); +SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 85}', 'Output table values do not match up'); + +-- All numeric types +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + 'id, temperature, humidity, observation_weight', + NULL, + '*' +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 10, 'Number of output table columns does not match'); +SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{1.0, 85.0, 85.0, 5.0}', 'Output table values do not match up'); + +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + 'windy, temperature, humidity', + 'windy', + '*' +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 10, 'Number of output table columns does not match'); +SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 85}', 'Output table values do not match up'); + +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + '*', + '"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id', + '*' +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 10, 'Number of output table columns does not match'); +SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 85, 5}', 'Output table values do not match up'); + +DROP TABLE IF EXISTS out_table, out_table_summary; +SELECT cols2vec( + 'dt_golf', + 'out_table', + 'windy, temperature, humidity', + 'windy, class', + '*' +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 10, 'Number of output table columns does not match'); +SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 85}', 'Output table values do not match up'); + +-- Special character tests + +DROP TABLE IF EXISTS cols2vec_spcl; +CREATE TABLE cols2vec_spcl ( + "i,Ð!#'d" bigint, + "lab$$''%*Ð!#''()el" int, + "fe''at1" float, + feat2 float, + "fe'%*()at3" float, + other_col float, + "se''x" TEXT +); + +INSERT INTO cols2vec_spcl VALUES +(1, 0, 1, 1, 0.5, 0.9,'M''M'), +(2, 1, 0, 1, 0.3, 0.3,'M$M'), +(3, 0, 0, 0, 0.1, 1.1,'M,M'), +(4, 1, 1, 0, 0.9, 0.4,'M@[}(:*;M'), +(5, 1, 0, 1, 0.85, 0.34,'M@[}(:*;M'), +(6, 1, 0, 1, 0.63, 0.12,'M"M'), +(7, 0, 1, 0, 0.7, 1.4,'MÐM'); + +DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary; +SELECT cols2vec( 'cols2vec_spcl', 'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"', + NULL, '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"' ); + +SELECT assert(feature_vector = '{1,1,0.5}', 'Incorrect results for cols2vec') +FROM cols2vec_out +WHERE "i,Ð!#'d" = 1; + +DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary; + +SELECT cols2vec('cols2vec_spcl', 'cols2vec_out','*', '"se''''x","i,Ð!#''d","lab$$''''%*Ð!#''''()el"', + '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"'); +SELECT assert(feature_vector = '{1,1,0.5,0.9}', + 'Incorrect results for cols2vec') +FROM cols2vec_out +WHERE "i,Ð!#'d" = 1; + +-- Type casting tests + +DROP TABLE IF EXISTS cols2vec_casting_test; +CREATE TABLE cols2vec_casting_test ( + a smallint, + b int, + c bigint, + d decimal, + e numeric, + f real, + g float, + h double precision, + i serial, + j bigserial, + k boolean, + l boolean, + m text, + n varchar, + o character varying, + p char, + q character, + r smallint, + s boolean, + t boolean, + u text[], + v text[], + w integer[], + x text +); + +INSERT INTO cols2vec_casting_test VALUES +(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 'true', 'true', 'hello', 'hello', 'hello', 'h', 'h', 1, 'false', 'false', ARRAY['1','sdfsd','serw'], ARRAY['2','sdfsd','serw'], ARRAY[1, 2, 3], '1'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'a, b' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'integer', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'a, b, c' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'bigint', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'a, b, c, d, e, f, g, h, i, j' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'double precision', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'k, l' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'boolean', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'm, n, o, p, q' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'text', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'a, r' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'smallint', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 's, t' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'boolean', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 's' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'boolean', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 's::int, a, b' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'integer', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'm, b::text' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'text', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'x::integer, b' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'integer', 'Incorrect type casting of features for cols2vec'); + +DROP TABLE IF EXISTS out_table, out_table_summary, out_table2; +SELECT cols2vec( + 'cols2vec_casting_test', + 'out_table', + 'g::smallint, h::smallint' +); +SELECT vec2cols( + 'out_table', + 'out_table2', + 'feature_vector' +); +SELECT assert((select data_type from information_schema.columns where table_name = 'out_table2' limit 1) = 'smallint', 'Incorrect type casting of features for cols2vec'); http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in new file mode 100644 index 0000000..6475f9b --- /dev/null +++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in @@ -0,0 +1,226 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +from os import path +from itertools import repeat + +# Add modules module to the pythonpath. +sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))) + +import unittest +from mock import * +import plpy_mock as plpy + +m4_changequote(`<!', `!>') + +class VecColsHelperTestSuite(unittest.TestCase): + def setUp(self): + patches = { + 'plpy': plpy + } + self.plpy_mock_execute = MagicMock() + plpy.execute = self.plpy_mock_execute + + self.module_patcher = patch.dict('sys.modules', patches) + self.module_patcher.start() + + import utilities.transform_vec_cols + self.module = utilities.transform_vec_cols + self.subject = self.module.vec_cols_helper() + + self.default_source_table = "source" + self.does_not_matter = 'does_not_matter' + self.default_output_table = "output" + + def tearDown(self): + self.module_patcher.stop() + + def test_get_cols_as_list_null_cols_to_output(self): + cols_to_keep = self.subject.get_cols_as_list(None, self.does_not_matter, 'c') + self.assertEqual([], cols_to_keep) + + def test_get_cols_as_list_star_cols_to_output_without_others(self): + self.module.get_cols = Mock(return_value = ['a', 'b', 'c']) + cols_to_keep = self.subject.get_cols_as_list('*') + self.assertEqual([], cols_to_keep) + cols_to_keep = self.subject.get_cols_as_list('*', None, 'foo') + self.assertEqual([], cols_to_keep) + + def test_get_cols_as_list_star_cols_to_output_with_others(self): + self.module.get_cols = Mock(return_value = ['a', 'b', 'c']) + cols_to_keep = self.subject.get_cols_as_list('*', self.does_not_matter, 'c') + self.assertEqual(['a', 'b'], cols_to_keep) + + def test_get_cols_as_list_exclude_single_col(self): + cols_to_keep = self.subject.get_cols_as_list('a', self.does_not_matter, 'c') + self.assertEqual(['a'], cols_to_keep) + cols_to_keep = self.subject.get_cols_as_list('a, b', self.does_not_matter, 'c') + self.assertEqual(['a', 'b'], cols_to_keep) + cols_to_keep = self.subject.get_cols_as_list('a, b, c', self.does_not_matter, 'c') + self.assertEqual(['a', 'b'], cols_to_keep) + + def test_get_cols_as_list_exclude_comma_sep_col_list(self): + self.module.get_cols = Mock(return_value=['a', 'b', 'c', 'd']) + self.assertEqual(['c', 'd'], + self.subject.get_cols_as_list("*", self.does_not_matter, 'a, b')) + self.assertEqual(['a'], + self.subject.get_cols_as_list("a, b, c", self.does_not_matter, 'b, c')) + self.assertEqual(['a', 'b'], + self.subject.get_cols_as_list("a, b, c", self.does_not_matter, 'c, e')) + + def test_get_cols_as_list_all_cols_to_output(self): + self.module.get_cols = Mock(return_value = ['a', 'b', 'c']) + cols_to_keep = self.subject.get_cols_as_list('*', self.does_not_matter) + self.assertEqual(['a', 'b', 'c'], cols_to_keep) + + def test_get_cols_as_list_exclude_none_with_star(self): + self.module.get_cols = Mock(return_value=['a', 'b', 'c']) + self.assertEqual(['a', 'b', 'c'], + self.subject.get_cols_as_list("*", self.does_not_matter, None)) + self.assertEqual(['a', 'b', 'c'], + self.subject.get_cols_as_list("*", self.does_not_matter, "")) + +class Vec2ColsTestSuite(unittest.TestCase): + def setUp(self): + patches = { + 'plpy': plpy + } + self.plpy_mock_execute = MagicMock() + plpy.execute = self.plpy_mock_execute + + self.module_patcher = patch.dict('sys.modules', patches) + self.module_patcher.start() + + import utilities.transform_vec_cols + self.module = utilities.transform_vec_cols + self.subject = self.module.vec2cols() + + self.default_source_table = "source" + self.does_not_matter = 'does_not_matter' + self.default_output_table = "output" + + self.subject.input_tbl_valid = Mock() + self.subject.output_tbl_valid = Mock() + self.subject.cols_in_tbl_valid = Mock() + + def tearDown(self): + self.module_patcher.stop() + + def test_get_names_for_split_output_cols_feature_names_none(self): + self.plpy_mock_execute.return_value = [{"n_x": 3}] + new_cols = self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', None) + self.assertEqual(['f1', 'f2', 'f3'], new_cols) + + def test_get_names_for_split_output_cols_feature_names_not_none(self): + self.plpy_mock_execute.return_value = [{"n_x": 3}] + new_cols = self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', ['a', 'b', 'c']) + self.assertEqual(['a', 'b', 'c'], new_cols) + + def test_get_names_for_split_output_cols_array_upper_returns_none(self): + self.plpy_mock_execute.return_value = [{"n_x": None}] + with self.assertRaises(plpy.PLPYException): + self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', None) + + def test_get_names_for_split_output_cols_feature_names_size_matches_vector_col(self): + self.plpy_mock_execute.return_value = [{"n_x": 3}] + with self.assertRaises(plpy.PLPYException): + self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', ['a', 'b']) + + def test_validate_output_cols_max_cols_exception(self): + with self.assertRaises(plpy.PLPYException): + self.subject.validate_output_cols(range(0, 1550), range(1550, 1650)) + + def test_validate_output_cols_duplicate_cols_exception(self): + with self.assertRaises(plpy.PLPYException): + self.subject.validate_output_cols(['a'], ['a']) + with self.assertRaises(plpy.PLPYException): + self.subject.validate_output_cols(['a', 'b', 'c'], ['a', 'b']) + + def test_validate_output_cols_passes(self): + self.subject.validate_output_cols(range(0, 1450), range(1450, 1500)) + self.subject.validate_output_cols(['a', 'b', 'c'], ['d', 'e']) + +class Cols2VecTestSuite(unittest.TestCase): + def setUp(self): + patches = { + 'plpy': plpy + } + self.plpy_mock_execute = MagicMock() + plpy.execute = self.plpy_mock_execute + + self.module_patcher = patch.dict('sys.modules', patches) + self.module_patcher.start() + + import utilities.transform_vec_cols + self.module = utilities.transform_vec_cols + self.subject = self.module.cols2vec() + + self.default_source_table = "source" + self.does_not_matter = 'does_not_matter' + self.default_output_table = "output" + + self.module.input_tbl_valid = Mock() + self.module.output_tbl_valid = Mock() + self.module.cols_in_tbl_valid = Mock() + + def tearDown(self): + self.module_patcher.stop() + + def test_validate_args_null_list_of_features_exception(self): + with self.assertRaises(plpy.PLPYException): + self.subject.validate_args(self.does_not_matter, + self.does_not_matter, + None, + self.does_not_matter, + self.does_not_matter) + with self.assertRaises(plpy.PLPYException): + self.subject.validate_args(self.does_not_matter, + self.does_not_matter, + " ", + self.does_not_matter, + self.does_not_matter) + + def test_validate_args_star_features_to_exclude_exception(self): + with self.assertRaises(plpy.PLPYException): + self.subject.validate_args(self.does_not_matter, + self.does_not_matter, + self.does_not_matter, + ' * ', + self.does_not_matter) + + def test_get_and_validate_feature_types_with_array_list_of_features_exception(self): + self.module.get_cols_and_types = Mock(return_value=list(zip(['a', 'b'], ['s1', 's1[]']))) + self.subject.features_to_nest = ['a','b'] + with self.assertRaises(plpy.PLPYException): + self.subject.get_and_validate_feature_types(self.does_not_matter) + self.plpy_mock_execute.return_value = Mock(return_value=list(zip(['a', 'b'], ['s1[]', 's1[]']))) + with self.assertRaises(plpy.PLPYException): + self.subject.get_and_validate_feature_types(self.does_not_matter) + self.plpy_mock_execute.return_value = Mock(return_value=list(zip(['a', 'b'], ['s1[]', 's1']))) + with self.assertRaises(plpy.PLPYException): + self.subject.get_and_validate_feature_types(self.does_not_matter) + + def test_get_and_validate_feature_types_nonstar_with_scalar_list_of_features(self): + self.module.get_cols_and_types = Mock(return_value=list(zip(['a', 'b'], ['s1', 's1']))) + self.subject.features_to_nest = ['a','b'] + self.subject.get_and_validate_feature_types(self.does_not_matter) + +if __name__ == '__main__': + unittest.main() http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/transform_vec_cols.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/transform_vec_cols.py_in b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in new file mode 100644 index 0000000..bf9229e --- /dev/null +++ b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in @@ -0,0 +1,496 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import plpy +from control import MinWarning +from internal.db_utils import is_col_1d_array +from internal.db_utils import quote_literal +from utilities import _assert +from utilities import add_postfix +from utilities import ANY_ARRAY +from utilities import is_valid_psql_type +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from validate_args import is_var_valid +from validate_args import explicit_bool_to_text +from validate_args import get_cols +from validate_args import get_cols_and_types +from validate_args import get_expr_type +from validate_args import input_tbl_valid +from validate_args import output_tbl_valid +from validate_args import table_exists + +class vec_cols_helper: + def __init__(self): + self.all_cols = None + + def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None): + """ + Get a list of columns based on the value of cols_to_process + Args: + @param cols_to_process: str, Either a * or a comma-separated list of col names + @param source_table: str, optional. Source table name + @param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude + from the source table, only used if cols_to_process is * + Returns: + A list of column names (or an empty list) + """ + # If cols_to_process is empty/None, return empty list + if not cols_to_process: + return [] + if cols_to_process.strip() != "*": + # If cols_to_process is a comma separated list of names, return list + # of column names in cols_to_process. + return [col for col in split_quoted_delimited_str(cols_to_process) + if col not in split_quoted_delimited_str(exclude_cols)] + if source_table: + if not self.all_cols: + self.all_cols = get_cols(source_table) + return [col for col in self.all_cols + if col not in split_quoted_delimited_str(exclude_cols)] + return [] + +class vec2cols: + def __init__(self): + self.get_cols_helper = vec_cols_helper() + self.module_name = self.__class__.__name__ + + def validate_args(self, source_table, output_table, vector_col, feature_names, + cols_to_output): + """ + Validate args for vec2cols + """ + input_tbl_valid(source_table, self.module_name) + output_tbl_valid(output_table, self.module_name) + is_var_valid(source_table, cols_to_output) + is_var_valid(source_table, vector_col) + _assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY), + "{0}: vector_col should refer to an array.".format(self.module_name)) + _assert(is_col_1d_array(source_table, vector_col), + "{0}: vector_col must be a 1-dimensional array.".format(self.module_name)) + + def get_names_for_split_output_cols(self, source_table, vector_col, feature_names): + """ + Get list of names for the newly-split columns to include in the + output table. + Args: + @param: source_table, str. Source table + @param: vector_col, str. Column name containing the array input + @param: feature_names, list. Python list of the feature names to + use for the split elements in the vector_col array + """ + query = """ + SELECT array_upper({0}, 1) AS n_x + FROM {1} + LIMIT 1 + """.format(vector_col, source_table) + result = plpy.execute(query)[0]["n_x"] + if not result: + plpy.error('{0}: Column to split ({1}) must not be an empty array' + .format(self.module_name, vector_col)) + if not feature_names: + # Create custom col names for output columns, with prefix "f". + feature_names = ["f{0}".format(i+1) for i in range(result)] + else: + # Check if the array dimension is equal to the number of col names + # specified in feature_names. + _assert(result == len(feature_names), + "{0}: Mismatch between size of vector_col and number of " + "cols in feature_names.".format(self.module_name)) + return feature_names + + def validate_output_cols(self, features_to_unnest, cols_to_keep): + # If there are more than 1600 columns for the output table, we give a + # warning as it might give an error + MAX_OUTPUT_COLUMN_COUNT = 1600 + _assert(len(features_to_unnest)+len(cols_to_keep) < MAX_OUTPUT_COLUMN_COUNT, + "{0}: The output exceeds the max number of columns that " + + "can be created ({1})".format(self.module_name, MAX_OUTPUT_COLUMN_COUNT)) + # Check if newly created col names have the same name as existing cols + duplicate_col_names = set(features_to_unnest).intersection(set(cols_to_keep)) + _assert(len(duplicate_col_names) == 0, + "{0}: Conflicting column names. Column names in source " + "table cannot be {1}".format(self.module_name, + list(duplicate_col_names))) + + def vec2cols(self, schema_madlib, source_table, output_table, + vector_col, feature_names, cols_to_output, **kwargs): + """ + Split up a column of array entries into multiple columns, each column + corresponding to one array position + Args: + @param: schema_madlib, str. The schema with madlib installed + @param: source_table, str. The source table + @param: output_table, str. The output table + @param: vector_col, str. The column with array entries to split up + @param: feature_names, list. Python list of the feature names to use + for the split elements in the vector_col array + @param: cols_to_output, str. Comma-separated list of the columns in + the source_table to include in the output_table + """ + self.validate_args(source_table, output_table, vector_col, feature_names, + cols_to_output) + + # Get names of columns to use for the split vector_col + features_to_unnest = self.get_names_for_split_output_cols(source_table, + vector_col, feature_names) + cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output, + source_table) + + self.validate_output_cols(features_to_unnest, cols_to_keep) + + # Construct the output query and populate the output table with all the + # correct parameters + select_new_cols = ', '.join(["{0}[{1}] AS {2}".format(vector_col, + i+1, features_to_unnest[i]) for i in range(len(features_to_unnest))]) + cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep else '' + query = """ + CREATE TABLE {output_table} AS + SELECT {cols_from_src_table} {select_new_cols} + FROM {source_table} + """.format(**locals()) + plpy.execute(query) + + def vec2cols_help_message(self, schema_madlib, message, **kwargs): + """ + Help message for vec2cols function + """ + summary_string = """ +----------------------------------------------------------------------------------- + SUMMARY +----------------------------------------------------------------------------------- +Functionality: Vector to Columns + +The MADlib vec2cols function enables the user to split up a single column into +multiple columns, given that the input column contains array entries. For example, +if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table +will contain 3 different columns, one for each element of the array. + +For more details on function usage: + SELECT {schema_madlib}.vec2cols('usage'); + +For a small example on using the function: + SELECT {schema_madlib}.vec2cols('example'); + """.format(schema_madlib=schema_madlib) + + usage_string = """ +----------------------------------------------------------------------------------- + USAGE +----------------------------------------------------------------------------------- +SELECT {schema_madlib}.vec2cols( + 'source_table', -- str, Name of the source table that contains the data + 'output_table', -- str, Name of the output view or table + 'vector_col', -- str, Name of the array entry column to be split + 'feature_names', -- array, Optional parameter to provide a text array of + -- the feature names for the newly split columns (if not + -- provided, default names f0, f1, ... will be used) + 'cols_to_output' -- str, Optional parameter to specify any other columns + -- in the source_table to include in the output_table + -- (default none of them, also supports '*' as input) + """.format(schema_madlib=schema_madlib) + + example_string = """ +----------------------------------------------------------------------------------- + EXAMPLE +----------------------------------------------------------------------------------- +-- Create an input data set: + +DROP TABLE IF EXISTS golf CASCADE; +CREATE TABLE golf ( + id integer NOT NULL, + "OUTLOOK" text, + temperature double precision, + humidity double precision, + "Temp_Humidity" double precision[], + clouds_airquality text[], + windy boolean, + class text, + observation_weight double precision +); +INSERT INTO golf VALUES +(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), +(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), +(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), +(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), +(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), +(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); + +-- Call the vec2cols function on the 'clouds_airquality' column, to split it up + +DROP TABLE IF EXISTS output_table; +SELECT {schema_madlib}.vec2cols( + 'golf', -- source table + 'output_table', -- output table + 'clouds_airquality', -- column with array entries to split + ARRAY['a', 'b'], -- feature_names array (will use 'a' to name the first new column, and 'b' for the second) + '"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list) +); + +SELECT * FROM output_table ORDER BY id; + OUTLOOK | id | a | b +----------+----+--------+----------- + sunny | 1 | none | unhealthy + sunny | 2 | none | moderate + overcast | 3 | low | moderate + rain | 4 | low | moderate + rain | 5 | medium | good + rain | 6 | low | unhealthy + overcast | 7 | medium | moderate + sunny | 8 | high | unhealthy + sunny | 9 | high | good + rain | 10 | medium | good + sunny | 11 | none | good + overcast | 12 | medium | moderate + overcast | 13 | medium | moderate + rain | 14 | low | unhealthy +(14 rows) +""".format(schema_madlib=schema_madlib) + + if not message: + return summary_string + elif message.lower() in ('usage', 'help', '?'): + return usage_string + elif message.lower() in ('example', 'examples'): + return example_string + else: + return """ +No such option. Use "SELECT {schema_madlib}.vec2cols()" for help. + """.format(schema_madlib=schema_madlib) + +class cols2vec: + def __init__(self): + self.get_cols_helper = vec_cols_helper() + self.module_name = self.__class__.__name__ + + def validate_args(self, source_table, output_table, + list_of_features, list_of_features_to_exclude, cols_to_output): + """ + Function to validate input parameters + """ + input_tbl_valid(source_table, self.module_name) + output_tbl_valid(output_table, self.module_name) + + _assert(list_of_features and list_of_features.strip(), "{0}: List of " + "features cannot be empty".format(self.module_name)) + if list_of_features.strip() != '*': + is_var_valid(source_table, list_of_features) + + if list_of_features_to_exclude: + if list_of_features_to_exclude.strip() == "*": + plpy.error("{0}: Cannot exclude all columns from being " + "features".format(self.module_name)) + elif list_of_features.strip() != '*': + plpy.info("{0} NOTICE: will exclude given column(s) even though " + "list of features was not *".format(self.module_name)) + + is_var_valid(source_table, list_of_features_to_exclude) + is_var_valid(source_table, cols_to_output) + + def get_and_validate_feature_types(self, source_table): + """ + This function validates the types of all of the features_to_nest, + disallowing any features of type array. Lets underlying platform + decide whether to allow 2 elements of different types be put into + an array together (ex. integer and text features, if put together, + will yield error by the underlying platform) + """ + all_cols_and_types = get_cols_and_types(source_table) + distinct_types = set([col_type[1] for col_type in all_cols_and_types + if col_type[0] in self.features_to_nest]) + _assert(not any(is_valid_psql_type(expr_type, ANY_ARRAY) + for expr_type in distinct_types), + "{0}: Feature columns to nest cannot be of type array" + .format(self.module_name)) + + if len(distinct_types) != 1 and 'boolean' in distinct_types: + self.features_to_nest = explicit_bool_to_text(source_table, self.features_to_nest, self.schema_madlib) + + def cols2vec(self, schema_madlib, source_table, output_table, list_of_features, + list_of_features_to_exclude=None, cols_to_output=None, **kwargs): + """ + Args: + @param schema_madlib: Name of MADlib schema + @param source_table: Name of table containing input data + @param output_table: Name of table to output the results + @param list_of_features: Comma-separated string of column names or + expressions to put into feature array. + Can also be a '*' implying all columns + are to be put into feature array + @param list_of_features_to_exclude: Comma-separated string of column names + to exclude from the feature array + @param cols_to_output: Comma-separated string of column names + from the source table to keep in the output table, + in addition to the feature array + """ + self.schema_madlib = schema_madlib + self.validate_args(source_table, output_table, list_of_features, + list_of_features_to_exclude, cols_to_output) + + self.features_to_nest = self.get_cols_helper.get_cols_as_list(list_of_features, + source_table, list_of_features_to_exclude) + self.get_and_validate_feature_types(source_table) + + cols_to_keep = ', '.join(self.get_cols_helper.get_cols_as_list(cols_to_output, + source_table))+", " if cols_to_output else '' + + feature_list_str = "ARRAY[ {val} ]".format(val=','.join(map(str, self.features_to_nest))) + plpy.execute(""" + CREATE TABLE {output_table} AS + SELECT {cols_to_keep} + {feature_list_str} AS feature_vector + FROM {source_table} + """.format(**locals())) + + feature_cols = py_list_to_sql_string( + [quote_literal(f) for f in self.features_to_nest], "TEXT", True) + + output_table_summary = add_postfix(output_table, "_summary") + # Dollar-quote the text to allow single-quotes without escaping + dq = "$__MADLIB_OUTER__$" + plpy.execute(""" + CREATE TABLE {output_table_summary} AS + SELECT + {dq}{source_table}{dq}::TEXT AS source_table, + {dq}{list_of_features}{dq}::TEXT AS list_of_features, + {dq}{list_of_features_to_exclude}{dq}::TEXT AS list_of_features_to_exclude, + {feature_cols} AS feature_names + """.format(**locals())) + + def cols2vec_help_message(self, schema_madlib, message, **kwargs): + """ + Help message for cols2vec function + """ + summary_string = """ +----------------------------------------------------------------------------------- + SUMMARY +----------------------------------------------------------------------------------- +Functionality: Columns to Vector + +The MADlib cols2vec function, given a table with a number of feature columns, will +create an output table that contains the feature columns in an array. A summary table +will also be created that contains the names of the features combined into array, +so that this process can be reversed using the function vec2cols. + +For more details on function usage: + SELECT {schema_madlib}.cols2vec('usage'); + +For a small example on using the function: + SELECT {schema_madlib}.cols2vec('example'); + """.format(schema_madlib=schema_madlib) + + usage_string = """ +----------------------------------------------------------------------------------- + USAGE +----------------------------------------------------------------------------------- +SELECT {schema_madlib}.cols2vec( + 'source_table', -- str, Name of the source table that contains the data + 'output_table', -- str, Name of the output view or table + 'list_of_features', -- Comma-separated string of column names or + expressions to put into feature array. + Can also be a '*' implying all columns + are to be put into feature array + 'list_of_features_to_exclude', -- Comma-separated string of column names + to exclude from the feature array + 'cols_to_output', -- Comma-separated string of column names + from the source table to keep in the output table, + in addition to the feature array + """.format(schema_madlib=schema_madlib) + + example_string = """ +----------------------------------------------------------------------------------- + EXAMPLE +----------------------------------------------------------------------------------- +-- Create an input data set: + +DROP TABLE IF EXISTS golf CASCADE; +CREATE TABLE golf ( + id integer NOT NULL, + "OUTLOOK" text, + temperature double precision, + humidity double precision, + "Temp_Humidity" double precision[], + clouds_airquality text[], + windy boolean, + class text, + observation_weight double precision +); +INSERT INTO golf VALUES +(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), +(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), +(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), +(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), +(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), +(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); + +-- Call the cols2vec function on the temperature and humidity columns, to nest them. +-- Also include "windy" in features list, but exclude this feature using the exclude field + +DROP TABLE IF EXISTS output_table; +SELECT {schema_madlib}.cols2vec( + 'golf', -- source table + 'output_table', -- output table + 'windy, temperature, humidity', -- feature list to nest + 'windy', -- features to exclude, from the above feature list + '"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list) +); + +SELECT * FROM output_table ORDER BY id; + OUTLOOK | id | feature_vector +----------+----+---------------- + sunny | 1 | {{85,85}} + sunny | 2 | {{80,90}} + overcast | 3 | {{83,78}} + rain | 4 | {{70,96}} + rain | 5 | {{68,80}} + rain | 6 | {{65,70}} + overcast | 7 | {{64,65}} + sunny | 8 | {{72,95}} + sunny | 9 | {{69,70}} + rain | 10 | {{75,80}} + sunny | 11 | {{75,70}} + overcast | 12 | {{72,90}} + overcast | 13 | {{81,75}} + rain | 14 | {{71,80}} +(14 rows) +""".format(schema_madlib=schema_madlib) + + if not message: + return summary_string + elif message.lower() in ('usage', 'help', '?'): + return usage_string + elif message.lower() in ('example', 'examples'): + return example_string + else: + return """ +No such option. Use "SELECT {schema_madlib}.cols2vec()" for help. + """.format(schema_madlib=schema_madlib) http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/utilities.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 87445a7..3bd3aaf 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -221,11 +221,9 @@ def is_psql_numeric_type(arg, exclude=None): Returns: Boolean. Returns if 'arg' is one of the numeric types """ - numeric_types = set(['smallint', 'integer', 'bigint', 'decimal', 'numeric', - 'real', 'double precision', 'serial', 'bigserial']) if exclude is None: exclude = [] - to_check_types = numeric_types - set(exclude) + to_check_types = NUMERIC - set(exclude) return (arg in to_check_types) # ------------------------------------------------------------------------- @@ -240,11 +238,10 @@ def is_psql_int_type(arg, exclude=None): Returns: Boolean. Returns if 'arg' is one of the numeric types """ - int_types = set(['smallint', 'integer', 'bigint']) if exclude is None: - to_check_types = int_types + to_check_types = INTEGER else: - to_check_types = int_types - set(exclude) + to_check_types = INTEGER - set(exclude) return (arg in to_check_types) # ------------------------------------------------------------------------- @@ -260,10 +257,7 @@ def is_psql_char_type(arg, exclude_list=[]): """ if not isinstance(exclude_list, list): exclude_list = [exclude_list] - - text_types = set(['text', 'varchar', 'character varying', - 'char', 'character']) - return arg in text_types - set(exclude_list) + return arg in TEXT - set(exclude_list) def is_psql_boolean_type(arg):