[2/2] madlib git commit: Utilities: Add module transform_vec_cols for column-vector conversion

okislal Wed, 01 Aug 2018 11:26:24 -0700

Utilities: Add module transform_vec_cols for column-vector conversion

JIRA: MADLIB-1240


This commit adds a new SQL function called vec2cols and refactors the
current function cols2vec, providing greater integration between the two
modules. We now have a single Python file with separate classes for each
feature. We also have unified unit-tests and dev-check/install-check
tests.

The vec2cols function enables users to split up a single column into
multiple columns, given that the input column contains array entries.
For example, if the input column contained ARRAY[1, 2, 3] in one of its
rows, the output table will contain 3 different columns, one for each
element of the array.

Co-authored-by: Nandish Jayaram <njaya...@apache.org>
Co-authored-by: Rahul Iyer <ri...@apache.org>
Co-authored-by: Nikhil Kak <n...@pivotal.io>
Co-authored-by: Orhan Kislal <okis...@pivotal.io>
Co-authored-by: Frank McQuillan <fmcquil...@pivotal.io>

Closes #291


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/20f95b33
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/20f95b33
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/20f95b33

Branch: refs/heads/master
Commit: 20f95b33bcbd05b154a566c81958091c66258858
Parents: a0cfcf8
Author: Arvind Sridhar <asrid...@pivotal.io>
Authored: Wed Aug 1 11:22:27 2018 -0700
Committer: Orhan Kislal <okis...@pivotal.io>
Committed: Wed Aug 1 11:22:27 2018 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             |   1 +
 .../postgres/modules/internal/db_utils.py_in    |   9 +
 .../postgres/modules/utilities/cols2vec.py_in   | 128 -----
 .../postgres/modules/utilities/cols2vec.sql_in  | 345 ++++++++++---
 .../modules/utilities/test/cols2vec.sql_in      |  91 ----
 .../utilities/test/transform_vec_cols.ic.sql_in |  68 +++
 .../utilities/test/transform_vec_cols.sql_in    | 470 ++++++++++++++++++
 .../unit_tests/test_transform_vec_cols.py_in    | 226 +++++++++
 .../modules/utilities/transform_vec_cols.py_in  | 496 +++++++++++++++++++
 .../postgres/modules/utilities/utilities.py_in  |  14 +-
 .../postgres/modules/utilities/vec2cols.sql_in  | 348 +++++++++++++
 11 files changed, 1908 insertions(+), 288 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 8f97491..d174ab7 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -276,6 +276,7 @@ complete matrix stored as a distributed table.
     @defgroup grp_minibatch_preprocessing Mini-Batch Preprocessor
     @defgroup grp_pmml PMML Export
     @defgroup grp_text_utilities Term Frequency
+    @defgroup grp_vec2cols Vector to Columns
 @}
 
 @defgroup grp_early_stage Early Stage Development

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/internal/db_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/internal/db_utils.py_in 
b/src/ports/postgres/modules/internal/db_utils.py_in
index c75babf..45477ef 100644
--- a/src/ports/postgres/modules/internal/db_utils.py_in
+++ b/src/ports/postgres/modules/internal/db_utils.py_in
@@ -79,3 +79,12 @@ def quote_literal(input_str):
         return "{qd}{input_str}{qd}".format(qd=QUOTE_DELIMITER,
                                             input_str=input_str)
 # 
------------------------------------------------------------------------------
+
+def is_col_1d_array(source_table, col_name):
+    query = """
+        SELECT array_upper({0}, 2) IS NULL AS n_y
+        FROM {1}
+        LIMIT 1
+    """.format(col_name, source_table)
+    result = plpy.execute(query)
+    return result[0]["n_y"]

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/cols2vec.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in 
b/src/ports/postgres/modules/utilities/cols2vec.py_in
deleted file mode 100644
index 4f2b1c9..0000000
--- a/src/ports/postgres/modules/utilities/cols2vec.py_in
+++ /dev/null
@@ -1,128 +0,0 @@
-# coding=utf-8
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-@file cols2vec.py_in
-
-@brief Utility to convert Columns to array
-
-"""
-
-import plpy
-from control import MinWarning
-from internal.db_utils import quote_literal
-from utilities import split_quoted_delimited_str
-from utilities import _string_to_array
-from utilities import _assert
-from utilities import add_postfix
-from validate_args import columns_exist_in_table
-from validate_args import is_var_valid
-from validate_args import get_cols
-from validate_args import quote_ident
-from utilities import py_list_to_sql_string
-
-
-m4_changequote(`<!', `!>')
-
-
-def validate_cols2vec_args(source_table, output_table,
-                           list_of_features, list_of_features_to_exclude, 
cols_to_output, **kwargs):
-    """
-        Function to validate input parameters
-    """
-    if list_of_features.strip() != '*':
-        if not (list_of_features and list_of_features.strip()):
-            plpy.error("Features to include is empty")
-        _assert(
-            columns_exist_in_table(
-                source_table, split_quoted_delimited_str(list_of_features)),
-            "Invalid columns in list_of_features {0}".format(list_of_features))
-
-    if cols_to_output and cols_to_output.strip() != '*':
-        _assert(
-            columns_exist_in_table(
-                source_table, split_quoted_delimited_str(cols_to_output)),
-            "Invalid columns to output list {0}".format(cols_to_output))
-
-
-def cols2vec(schema_madlib, source_table, output_table, list_of_features,
-             list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
-    """
-    Args:
-        @param schema_madlib: Name of MADlib schema
-        @param model: Name of table containing the tree model
-        @param source_table: Name of table containing prediction data
-        @param output_table: Name of table to output the results
-        @param list_of_features: Comma-separated string of column names or
-                                 expressions to put into feature array.
-                                 Can also be a '*' implying all columns
-                                 are to be put into feature array.
-        @param list_of_features_to_exclude: Comma-separated string of column 
names
-                                            to exclude from the feature array
-        @param cols_to_output: Comma-separated string of column names
-                               from the source table to keep in the output 
table,
-                               in addition to the feature array.
-
-    Returns:
-        None
-
-    """
-    with MinWarning('warning'):
-        validate_cols2vec_args(source_table, output_table, list_of_features,
-                               list_of_features_to_exclude,
-                               cols_to_output, **kwargs)
-
-        all_cols = get_cols(source_table, schema_madlib)
-        if list_of_features.strip() == '*':
-            exclude_set = 
set(split_quoted_delimited_str(list_of_features_to_exclude))
-            feature_list = [c for c in all_cols if c not in exclude_set]
-        else:
-            feature_list = split_quoted_delimited_str(list_of_features)
-
-        if cols_to_output:
-            additional_cols = (all_cols if cols_to_output == '*' else
-                               split_quoted_delimited_str(cols_to_output))
-            additional_cols_str = ', '.join(additional_cols) + ","
-        else:
-            additional_cols_str = ''
-
-        feature_list_str = py_list_to_sql_string(feature_list, "TEXT[]", True)
-        plpy.execute("""
-            CREATE TABLE {output_table} AS
-                SELECT {additional_cols_str}
-                       {feature_list_str} AS feature_vector
-                FROM {source_table}
-            """.format(**locals()))
-
-        feature_cols = py_list_to_sql_string(
-            [quote_literal(f) for f in feature_list], "TEXT", True)
-
-        output_table_summary = add_postfix(output_table, "_summary")
-        # Dollar-quote the text to allow single-quotes without escaping
-        dq = "$__MADLIB_OUTER__$"
-        feature_exclude_str = ("NULL" if not list_of_features_to_exclude else
-                               list_of_features_to_exclude)
-        plpy.execute("""
-            CREATE TABLE {output_table_summary} AS
-            SELECT
-                {dq}{source_table}{dq}::TEXT AS source_table,
-                {dq}{list_of_features}{dq}::TEXT AS list_of_features,
-                {dq}{feature_exclude_str}{dq}::TEXT AS 
list_of_features_to_exclude,
-                {feature_cols} AS feature_names
-            """.format(**locals()))

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in 
b/src/ports/postgres/modules/utilities/cols2vec.sql_in
index a70a953..82a1f94 100644
--- a/src/ports/postgres/modules/utilities/cols2vec.sql_in
+++ b/src/ports/postgres/modules/utilities/cols2vec.sql_in
@@ -28,7 +28,6 @@
 
 m4_include(`SQLCommon.m4')
 
-
 /**
 @addtogroup grp_cols2vec
 
@@ -38,7 +37,7 @@ m4_include(`SQLCommon.m4')
 <ul>
 <li class="level1"><a href="#cols2vec_syntax">Syntax</a>
 <li class="level1"><a href="#cols2vec_usage">Usage</a>
-<li class="level1"><a href="#cols2vec_example">Example</a>
+<li class="level1"><a href="#cols2vec_example">Examples</a>
 </ul>
 </div>
 
@@ -66,7 +65,7 @@ cols2vec(
 \b Arguments
 <dl class="arglist">
 <dt>source_table</dt>
-<dd>TEXT. Name of the table containing the source data.</dd>.
+<dd>TEXT. Name of the table containing the source data.</dd>
 
 <dt>output_table</dt>
 <dd>TEXT. Name of the generated table containing the output.</dd>
@@ -74,107 +73,319 @@ cols2vec(
 <dt>list_of_features</dt>
 <dd>TEXT.
 Comma-separated string of column names or expressions to put into feature 
array.
-Can also be a '*' implying all columns are to be put into feature array (except
-for the ones included in the next argument that lists exclusions). Array 
columns
-in the source table are not supported in the 'list_of_features' parameter. 
</dd>
+Can also be '*' implying all columns are to be put into feature array (except
+for the ones included in the next argument that lists exclusions).  Type 
casting
+will be done as per the regular type casting rules of the underlying database.
+Array columns in the source table are not supported in the 'list_of_features' 
parameter. Also,
+all of the features to be included must be of the same type and must not have 
null
+values.</dd>
 
 <dt>list_of_features_to_exclude (optional)</dt>
 <dd>TEXT. Default NULL.
-Comma-separated string of column names to exclude from the feature array.  Use
-only when 'list_of_features' is '*'. </dd>
+Comma-separated string of column names to exclude from the feature array.  
+Typically used when 'list_of_features' is set to '*'.</dd>
 
 <dt>cols_to_output (optional)</dt>
 <dd>TEXT. Default NULL.
 Comma-separated string of column names from the source table to keep in the
 output table, in addition to the feature array.  To keep all columns from the
 source table, use '*' for this parameter. </dd>
-
 </dl>
 
+<b>Output table</b>
+<br>
+    The output table produced by the cols2vec function contains the following 
columns:
+    <table class="output">
+      <tr>
+        <th><...></th>
+        <td>Columns from source table, depending on which ones are kept (if 
any).
+        </td>
+      </tr>
+      <tr>
+        <th>feature_vector</th>
+        <td>Column that contains the feature array.
+      </tr>
+    </table>
+
+<b>Output summary table</b>
+<br>
+    A summary table named <em><output_table>_summary</em> is also created
+    that contains:
+    <table class="output">
+      <tr>
+        <th>source_table</th>
+        <td>Name of the table containing the source data.</td>
+      </tr>
+      <tr>
+        <th>list_of_features</th>
+        <td>List of features to put in vector.</td>
+      </tr>
+      <tr>
+        <th>list_of_features_to_exclude</th>
+        <td>Features specified by the user to exclude from 
'list_of_features'.</td>
+      </tr>
+      <tr>
+        <th>feature_names</th>
+        <td>Names of the features that were nested (converted to a vector) in 
the output table.</td>
+      </tr>
+    </table>
+
 @anchor cols2vec_example
 @par Examples
 
 -# Load sample data:
 <pre class="example">
-DROP TABLE IF EXISTS cols2vec;
-CREATE TABLE cols2vec (
-    id              bigint,
-    label           int,
-    feat1           int,
-    feat2           int,
-    feat3           float,
-    other_col       float
+DROP TABLE IF EXISTS golf CASCADE;
+CREATE TABLE golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text,
+    observation_weight double precision
+);
+INSERT INTO golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 
5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 
5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
+</pre>
+
+-# Run cols2vec to combine the temperature and humidity columns into a single 
array feature.
+<pre class="example">
+DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
+SELECT madlib.cols2vec(
+    'golf',
+    'cols2vec_result',
+    'temperature, humidity'
 );
-INSERT INTO cols2vec  VALUES
-(1, 0, 1, 1, 0.5, 0.9),
-(2, 1, 0, 1, 0.3, 0.3),
-(3, 0, 0, 0, 0.1, 1.1),
-(4, 1, 1, 0, 0.9, 0.4);
+SELECT * FROM cols2vec_result;
+</pre>
+<pre class="result">
+ feature_vector
+----------------+
+ {85,85}
+ {80,90}
+ {83,78}
+ {70,96}
+ {68,80}
+ {65,70}
+ {64,65}
+ {72,95}
+ {69,70}
+ {75,80}
+ {75,70}
+ {72,90}
+ {81,75}
+ {71,80}
+(14 rows)
+</pre>
+View the summary table:
+<pre class="example">
+\\x on
+SELECT * FROM cols2vec_result_summary;
+\\x off
+</pre>
+<pre class="result">
+-[ RECORD 1 
]---------------+----------------------------------------------------------------
+source_table                | golf
+list_of_features            | temperature, humidity
+list_of_features_to_exclude | None
+feature_names               | {temperature,humidity}
 </pre>
 
--# Create feature array:
+-# Combine the temperature and humidity columns 
+and keep 2 other columns from source_table.
 <pre class="example">
 DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
 SELECT madlib.cols2vec(
-      'cols2vec',               -- input table
-      'cols2vec_result',        -- output table
-      'feat1,feat2,feat3',      -- list of features
-       NULL,                    -- features to exclude
-      'id, label           '    -- columns from input table to be included in 
output
+    'golf',
+    'cols2vec_result',
+    'temperature, humidity',
+    NULL,
+    'id, "OUTLOOK"'
 );
 SELECT * FROM cols2vec_result ORDER BY id;
 </pre>
 <pre class="result">
- id | label | feature_vector 
-----+-------+----------------
-  1 |     0 | {1,1,0.5}
-  2 |     1 | {0,1,0.3}
-  3 |     0 | {0,0,0.1}
-  4 |     1 | {1,0,0.9}
-(4 rows)
+ id | OUTLOOK  | feature_vector
+----+----------+----------------
+  1 | sunny    | {85,85}
+  2 | sunny    | {80,90}
+  3 | overcast | {83,78}
+  4 | rain     | {70,96}
+  5 | rain     | {68,80}
+  6 | rain     | {65,70}
+  7 | overcast | {64,65}
+  8 | sunny    | {72,95}
+  9 | sunny    | {69,70}
+ 10 | rain     | {75,80}
+ 11 | sunny    | {75,70}
+ 12 | overcast | {72,90}
+ 13 | overcast | {81,75}
+ 14 | rain     | {71,80}
+(14 rows)
 </pre>
-View summary table:
+View the summary table:
 <pre class="example">
+\\x on
 SELECT * FROM cols2vec_result_summary;
+\\x off
 </pre>
 <pre class="result">
- source_table | list_of_features  | list_of_features_to_exclude |    
feature_names    
---------------+-------------------+-----------------------------+---------------------
- cols2vec     | feat1,feat2,feat3 | None                        | 
{feat1,feat2,feat3}
+-[ RECORD 1 
]---------------+----------------------------------------------------------------
+source_table                | golf
+list_of_features            | temperature, humidity
+list_of_features_to_exclude | None
+feature_names               | {temperature,humidity}
 </pre>
 
--# The above result could be obtained in a similar way using the 
'features_to_exclude' parameter:
+-# Combine all columns, excluding all columns that are not of type double 
precision.
 <pre class="example">
 DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
 SELECT madlib.cols2vec(
-      'cols2vec',               -- input table
-      'cols2vec_result',        -- output table
-      '*',                      -- list of features
-      'id, label, other_col',   -- features to exclude
-      'id, label'               -- columns from input table to be included in 
output
+    'golf',
+    'cols2vec_result',
+    '*',
+    '"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id',
+    'id, "OUTLOOK"'
 );
 SELECT * FROM cols2vec_result ORDER BY id;
 </pre>
 <pre class="result">
- id | label | feature_vector 
-----+-------+----------------
-  1 |     0 | {1,1,0.5}
-  2 |     1 | {0,1,0.3}
-  3 |     0 | {0,0,0.1}
-  4 |     1 | {1,0,0.9}
-(4 rows)
+ id | OUTLOOK  | feature_vector
+----+----------+----------------
+  1 | sunny    | {85,85,5}
+  2 | sunny    | {80,90,5}
+  3 | overcast | {83,78,1.5}
+  4 | rain     | {70,96,1}
+  5 | rain     | {68,80,1}
+  6 | rain     | {65,70,1}
+  7 | overcast | {64,65,1.5}
+  8 | sunny    | {72,95,5}
+  9 | sunny    | {69,70,5}
+ 10 | rain     | {75,80,1}
+ 11 | sunny    | {75,70,5}
+ 12 | overcast | {72,90,1.5}
+ 13 | overcast | {81,75,1.5}
+ 14 | rain     | {71,80,1}
+(14 rows)
 </pre>
 View summary table:
 <pre class="example">
+\\x on
+SELECT * FROM cols2vec_result_summary;
+\\x off
+</pre>
+<pre class="result">
+-[ RECORD 1 
]---------------+----------------------------------------------------------------
+source_table                | golf
+list_of_features            | *
+list_of_features_to_exclude | "OUTLOOK", "Temp_Humidity", clouds_airquality, 
windy, class, id
+feature_names               | {temperature,humidity,observation_weight}
+</pre>
+
+-# Combine the temperature and humidity columns, exclude windy, and keep all 
of the 
+columns from the source table.
+<pre class="example">
+DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
+SELECT madlib.cols2vec(
+    'golf',
+    'cols2vec_result',
+    'windy, temperature, humidity',
+    'windy',
+    '*'
+);
+SELECT * FROM cols2vec_result ORDER BY id;
+</pre>
+<pre class="result">
+ id | OUTLOOK  | temperature | humidity | Temp_Humidity | clouds_airquality | 
windy |   class    | observation_weight | feature_vector
+----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+----------------
+  1 | sunny    |          85 |       85 | {85,85}       | {none,unhealthy}  | 
f     | Don't Play |                  5 | {85,85}
+  2 | sunny    |          80 |       90 | {80,90}       | {none,moderate}   | 
t     | Don't Play |                  5 | {80,90}
+  3 | overcast |          83 |       78 | {83,78}       | {low,moderate}    | 
f     | Play       |                1.5 | {83,78}
+  4 | rain     |          70 |       96 | {70,96}       | {low,moderate}    | 
f     | Play       |                  1 | {70,96}
+  5 | rain     |          68 |       80 | {68,80}       | {medium,good}     | 
f     | Play       |                  1 | {68,80}
+  6 | rain     |          65 |       70 | {65,70}       | {low,unhealthy}   | 
t     | Don't Play |                  1 | {65,70}
+  7 | overcast |          64 |       65 | {64,65}       | {medium,moderate} | 
t     | Play       |                1.5 | {64,65}
+  8 | sunny    |          72 |       95 | {72,95}       | {high,unhealthy}  | 
f     | Don't Play |                  5 | {72,95}
+  9 | sunny    |          69 |       70 | {69,70}       | {high,good}       | 
f     | Play       |                  5 | {69,70}
+ 10 | rain     |          75 |       80 | {75,80}       | {medium,good}     | 
f     | Play       |                  1 | {75,80}
+ 11 | sunny    |          75 |       70 | {75,70}       | {none,good}       | 
t     | Play       |                  5 | {75,70}
+ 12 | overcast |          72 |       90 | {72,90}       | {medium,moderate} | 
t     | Play       |                1.5 | {72,90}
+ 13 | overcast |          81 |       75 | {81,75}       | {medium,moderate} | 
f     | Play       |                1.5 | {81,75}
+ 14 | rain     |          71 |       80 | {71,80}       | {low,unhealthy}   | 
t     | Don't Play |                  1 | {71,80}
+(14 rows)
+</pre>
+View the summary table:
+<pre class="example">
+\\x on
 SELECT * FROM cols2vec_result_summary;
+\\x off
+</pre>
+<pre class="result">
+-[ RECORD 1 ]---------------+-----------------------------
+source_table                | golf
+list_of_features            | windy, temperature, humidity
+list_of_features_to_exclude | windy
+feature_names               | {temperature,humidity}
+</pre>
+This also shows that you can exclude features in 'list_of_features_to_exclude'
+that are in the list of 'list_of_features'.  This can be useful if the 
'list_of_features'
+is generated from an expression or subquery.
+
+-# Type casting works as per regular rules of the underlying database.  
+E.g, combining integer and double precisions columns will create a double 
precision feature vector.  
+For Boolean, do an explicit cast to the target type:
+<pre class="example">
+DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
+SELECT madlib.cols2vec(
+    'golf',
+    'cols2vec_result',
+    'windy::TEXT, class',
+    NULL,
+    'id'
+);
+SELECT * FROM cols2vec_result ORDER BY id;
 </pre>
 <pre class="result">
- source_table | list_of_features | list_of_features_to_exclude |    
feature_names    
---------------+------------------+-----------------------------+---------------------
- cols2vec     | *                | id, label, other_col        | 
{feat1,feat2,feat3}
+ id |    feature_vector    
+----+----------------------
+  1 | {false,"Don't Play"}
+  2 | {true,"Don't Play"}
+  3 | {false,Play}
+  4 | {false,Play}
+  5 | {false,Play}
+  6 | {true,"Don't Play"}
+  7 | {true,Play}
+  8 | {false,"Don't Play"}
+  9 | {false,Play}
+ 10 | {false,Play}
+ 11 | {true,Play}
+ 12 | {true,Play}
+ 13 | {false,Play}
+ 14 | {true,"Don't Play"}
+(14 rows)
 </pre>
+
 */
 
+-------------------------------------------------------------------------
+-- cols2vec
+-------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     source_table VARCHAR,
@@ -183,10 +394,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     list_of_features_to_exclude VARCHAR,
     cols_to_output VARCHAR
 ) RETURNS void AS $$
-    PythonFunction(utilities, cols2vec, cols2vec)
-$$ LANGUAGE plpythonu VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
+    PythonFunctionBodyOnly(utilities, transform_vec_cols)
+    from utilities.control import MinWarning
+    with MinWarning('warning'):
+        cols2vec_obj = transform_vec_cols.cols2vec()
+        return cols2vec_obj.cols2vec(**globals())
+    $$ LANGUAGE plpythonu VOLATILE
+    m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     source_table VARCHAR,
@@ -206,3 +420,16 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(message TEXT)
+RETURNS text AS $$
+    PythonFunctionBodyOnly(utilities, transform_vec_cols)
+    return transform_vec_cols.cols2vec().cols2vec_help_message(schema_madlib, 
message)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec()
+RETURNS text AS $$
+    SELECT MADLIB_SCHEMA.cols2vec('');
+$$ language SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in 
b/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
deleted file mode 100644
index fd22238..0000000
--- a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
+++ /dev/null
@@ -1,91 +0,0 @@
-/* ----------------------------------------------------------------------- */
-/**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-/* ----------------------------------------------------------------------- */
-
-DROP TABLE IF EXISTS source_table;
-CREATE TABLE source_table (
-    id              bigint,
-    label           int,
-    feat1           int,
-    feat2           int,
-    feat3           float,
-    other_col       float
-);
-INSERT INTO source_table  VALUES
-(1, 0, 1, 1, 0.5, 0.9),
-(2, 1, 0, 1, 0.3, 0.3),
-(3, 0, 0, 0, 0.1, 1.1),
-(4, 1, 1, 0, 0.9, 0.4);
-
-
-DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
-SELECT cols2vec( 'source_table',  'cols2vec_out', 'feat1,feat2,feat3', 'id', 
'id,label' );
-SELECT assert(feature_vector = '{1,1,0.5}',
-              'Incorrect results for cols2vec')
-FROM cols2vec_out
-WHERE id = 1;
-
-DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
-SELECT cols2vec( 'source_table',  'cols2vec_out', '*', 'id,other_col', 
'id,label' );
-
-SELECT * FROM cols2vec_out;
-SELECT assert(feature_vector = '{0,1,1,0.5}','Incorrect results for cols2vec')
-FROM cols2vec_out
-WHERE id = 1;
-
-
-DROP TABLE IF EXISTS cols2vec_spcl;
-CREATE TABLE cols2vec_spcl (
-    "i,Ð!#'d"                 bigint,
-    "lab$$''%*Ð!#''()el"    int,
-    "fe''at1"                  int,
-    feat2                      int,
-    "fe'%*()at3"               float,
-    other_col                  float,
-    "se''x"                            TEXT
-);
-
-INSERT INTO cols2vec_spcl  VALUES
-(1, 0, 1, 1, 0.5, 0.9,'M''M'),
-(2, 1, 0, 1, 0.3, 0.3,'M$M'),
-(3, 0, 0, 0, 0.1, 1.1,'M,M'),
-(4, 1, 1, 0, 0.9, 0.4,'M@[}(:*;M'),
-(5, 1, 0, 1, 0.85, 0.34,'M@[}(:*;M'),
-(6, 1, 0, 1, 0.63, 0.12,'M"M'),
-(7, 0, 1, 0, 0.7, 1.4,'MÐM');
-
-DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
-SELECT cols2vec( 'cols2vec_spcl',  
'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"',
-                NULL, '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"' );
-
-select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec')
-FROM cols2vec_out
-WHERE "i,Ð!#'d" = 1;
-
-DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
-
-SELECT cols2vec('cols2vec_spcl',  'cols2vec_out','*', '"se''''x"',
-                '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"');
-SELECT assert(feature_vector = '{1,0,1,1,0.5,0.9}',
-              'Incorrect results for cols2vec')
-FROM cols2vec_out
-WHERE "i,Ð!#'d" = 1;

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in 
b/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in
new file mode 100644
index 0000000..e965b22
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/transform_vec_cols.ic.sql_in
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+
+-- Create testing dataset
+
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text,
+    observation_weight double precision
+);
+INSERT INTO dt_golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 
5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 
5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'dt_golf',
+    'out_table',
+    'clouds_airquality',
+    ARRAY['clouds', 'air_quality'],
+    '"OUTLOOK", id'
+);
+
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    'windy, temperature, humidity',
+    'windy',
+    '"OUTLOOK", id'
+);

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in 
b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
new file mode 100644
index 0000000..47ab299
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
@@ -0,0 +1,470 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+
+-----------------------------------------------------------------------
+-- vec2cols
+-----------------------------------------------------------------------
+
+-- Create testing dataset
+
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text,
+    observation_weight double precision
+);
+INSERT INTO dt_golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 
5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 
5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
+
+-- Call the vec2cols function with different parameters on the 
'clouds_airquality' column, to split it up
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'dt_golf',
+    'out_table',
+    'clouds_airquality'
+);
+
+SELECT f1 from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 2, 'Number of split columns does not match');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'dt_golf',
+    'out_table',
+    'clouds_airquality',
+    NULL,
+    'id'
+);
+
+SELECT f1 from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 3, 'Number of split columns does not match');
+SELECT assert ((SELECT clouds_airquality[1] FROM dt_golf WHERE id = 1) = 
(SELECT f1 FROM out_table WHERE id = 1), 'Split values do not match up');
+SELECT assert ((SELECT clouds_airquality[2] FROM dt_golf WHERE id = 1) = 
(SELECT f2 FROM out_table WHERE id = 1), 'Split values do not match up');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'dt_golf',
+    'out_table',
+    'clouds_airquality',
+    ARRAY['clouds', 'air_quality']
+);
+
+SELECT clouds from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 2, 'Number of split columns does not match');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'dt_golf',
+    'out_table',
+    'clouds_airquality',
+    ARRAY['clouds', 'air_quality'],
+    '"OUTLOOK", id'
+);
+
+SELECT clouds from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 4, 'Number of split columns does not match');
+SELECT assert ((SELECT clouds_airquality[1] FROM dt_golf WHERE id = 1) = 
(SELECT clouds FROM out_table WHERE id = 1), 'Split values do not match up');
+SELECT assert ((SELECT clouds_airquality[2] FROM dt_golf WHERE id = 1) = 
(SELECT air_quality FROM out_table WHERE id = 1), 'Split values do not match 
up');
+
+-- Special character tests
+
+DROP TABLE IF EXISTS special_char_check;
+CREATE TABLE special_char_check(
+    "id" INTEGER,
+    "se$$,''x" TEXT,
+    "len$,,$'%*()gth" TEXT[],
+    "rin,$$Ð!#'gs" INTEGER,
+    ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./'," TEXT,
+    "$$hi//\\*$$" INTEGER,
+    "**" INTEGER,
+    "'~d())" TEXT);
+
+INSERT INTO special_char_check VALUES
+(1,'M''M',ARRAY['sd''f', 'ab,,c'],6,'sd''f',1,2,'ab,,c'),
+(2,'''M''M''',ARRAY['sdf$$sdfk(),', 
'$$sdlhf$$'],6,'sdf$$sdfk(),',1,2,'$$sdlhf$$'),
+(3,'M|$$M',ARRAY['%~(())Ð"', 'sdf'],6,'%~(())Ð"',1,2,'sdf'),
+(4,'M,M',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'),
+(5,'M@[}(:*;M',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'),
+(6,'M"M',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf'),
+(7,'MÐM',ARRAY['sdf', 'sdf'],6,'sdf',1,2,'sdf');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'special_char_check',
+    'out_table',
+    '"len$,,$''%*()gth"',
+    ARRAY['",cl''oÐu,ds,"', '"air_qu""ality"'],
+    '*'
+);
+
+SELECT ",cl'oÐu,ds,", "air_qu""ality", 
",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./',", "$$hi//\\*$$", "**", "'~d())" from 
out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 10, 'Number of split columns does not match');
+SELECT assert ((SELECT "len$,,$'%*()gth"[1] FROM special_char_check WHERE id = 
1) = (SELECT ",cl'oÐu,ds," FROM out_table WHERE id = 1), 'Split values do not 
match up');
+SELECT assert ((SELECT "se$$,''x" FROM special_char_check WHERE id = 6) = 
'M"M', 'Incorrect entries in out_table');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'special_char_check',
+    'out_table',
+    '"len$,,$''%*()gth"',
+    ARRAY['",cl''oÐu,ds,"', '"air_qu""ality"'],
+    $__madlib__$id, "rin,$$Ð!#'gs", 
",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./',", "$$hi//\\*$$", "**"$__madlib__$
+);
+
+SELECT ",cl'oÐu,ds,", "air_qu""ality", id, "rin,$$Ð!#'gs", 
",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./',", "$$hi//\\*$$", "**" from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 7, 'Number of split columns does not match');
+SELECT assert ((SELECT "len$,,$'%*()gth"[1] FROM special_char_check WHERE id = 
1) = (SELECT ",cl'oÐu,ds," FROM out_table WHERE id = 1), 'Split values do not 
match up');
+SELECT assert ((SELECT ",hel~!@#$%^&*()_+{}|:""<>?`-=[]\;',./'," FROM 
special_char_check WHERE id = 1) = 'sd''f', 'Incorrect entries in out_table');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'special_char_check',
+    'out_table',
+    '"len$,,$''%*()gth"',
+    ARRAY['",cl''oÐu,ds,"', '"air_qu""ality"'],
+    '"**"'
+);
+
+SELECT "**", ",cl'oÐu,ds,", "air_qu""ality" from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 3, 'Number of split columns does not match');
+
+-----------------------------------------------------------------------
+-- cols2vec
+-----------------------------------------------------------------------
+
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    'temperature, humidity'
+);
+
+SELECT feature_vector from out_table;
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 1, 'Number of output table columns does not match');
+
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    'temperature, humidity',
+    NULL,
+    'id'
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 2, 'Number of output table columns does not match');
+SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 
85}', 'Output table values do not match up');
+
+-- All numeric types
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    'id, temperature, humidity, observation_weight',
+    NULL,
+    '*'
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 10, 'Number of output table columns does not match');
+SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{1.0, 
85.0, 85.0, 5.0}', 'Output table values do not match up');
+
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    'windy, temperature, humidity',
+    'windy',
+    '*'
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 10, 'Number of output table columns does not match');
+SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 
85}', 'Output table values do not match up');
+
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    '*',
+    '"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id',
+    '*'
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 10, 'Number of output table columns does not match');
+SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 85, 
5}', 'Output table values do not match up');
+
+DROP TABLE IF EXISTS out_table, out_table_summary;
+SELECT cols2vec(
+    'dt_golf',
+    'out_table',
+    'windy, temperature, humidity',
+    'windy, class',
+    '*'
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 10, 'Number of output table columns does not match');
+SELECT assert ((SELECT feature_vector from out_table where id = 1) = '{85, 
85}', 'Output table values do not match up');
+
+-- Special character tests
+
+DROP TABLE IF EXISTS cols2vec_spcl;
+CREATE TABLE cols2vec_spcl (
+    "i,Ð!#'d"               bigint,
+    "lab$$''%*Ð!#''()el"    int,
+    "fe''at1"               float,
+    feat2                   float,
+    "fe'%*()at3"            float,
+    other_col               float,
+    "se''x"                 TEXT
+);
+
+INSERT INTO cols2vec_spcl VALUES
+(1, 0, 1, 1, 0.5, 0.9,'M''M'),
+(2, 1, 0, 1, 0.3, 0.3,'M$M'),
+(3, 0, 0, 0, 0.1, 1.1,'M,M'),
+(4, 1, 1, 0, 0.9, 0.4,'M@[}(:*;M'),
+(5, 1, 0, 1, 0.85, 0.34,'M@[}(:*;M'),
+(6, 1, 0, 1, 0.63, 0.12,'M"M'),
+(7, 0, 1, 0, 0.7, 1.4,'MÐM');
+
+DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+SELECT cols2vec( 'cols2vec_spcl',  
'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"',
+                NULL, '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"' );
+
+SELECT assert(feature_vector = '{1,1,0.5}', 'Incorrect results for cols2vec')
+FROM cols2vec_out
+WHERE "i,Ð!#'d" = 1;
+
+DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+
+SELECT cols2vec('cols2vec_spcl',  'cols2vec_out','*', 
'"se''''x","i,Ð!#''d","lab$$''''%*Ð!#''''()el"',
+                '"i,Ð!#''d","lab$$''''%*Ð!#''''()el"');
+SELECT assert(feature_vector = '{1,1,0.5,0.9}',
+              'Incorrect results for cols2vec')
+FROM cols2vec_out
+WHERE "i,Ð!#'d" = 1;
+
+-- Type casting tests
+
+DROP TABLE IF EXISTS cols2vec_casting_test;
+CREATE TABLE cols2vec_casting_test (
+    a smallint,
+    b int,
+    c bigint,
+    d decimal,
+    e numeric,
+    f real,
+    g float,
+    h double precision,
+    i serial,
+    j bigserial,
+    k boolean,
+    l boolean,
+    m text,
+    n varchar,
+    o character varying,
+    p char,
+    q character,
+    r smallint,
+    s boolean,
+    t boolean,
+    u text[],
+    v text[],
+    w integer[],
+    x text
+);
+
+INSERT INTO cols2vec_casting_test VALUES
+(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 'true', 'true', 'hello', 'hello', 'hello', 'h', 
'h', 1, 'false', 'false', ARRAY['1','sdfsd','serw'], ARRAY['2','sdfsd','serw'], 
ARRAY[1, 2, 3], '1');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'a, b'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'integer', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'a, b, c'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'bigint', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'a, b, c, d, e, f, g, h, i, j'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'double precision', 'Incorrect type 
casting of features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'k, l'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'boolean', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'm, n, o, p, q'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'text', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'a, r'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'smallint', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    's, t'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'boolean', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    's'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'boolean', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    's::int, a, b'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'integer', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'm, b::text'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'text', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'x::integer, b'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'integer', 'Incorrect type casting of 
features for cols2vec');
+
+DROP TABLE IF EXISTS out_table, out_table_summary, out_table2;
+SELECT cols2vec(
+    'cols2vec_casting_test',
+    'out_table',
+    'g::smallint, h::smallint'
+);
+SELECT vec2cols(
+    'out_table',
+    'out_table2',
+    'feature_vector'
+);
+SELECT assert((select data_type from information_schema.columns where 
table_name = 'out_table2' limit 1) = 'smallint', 'Incorrect type casting of 
features for cols2vec');

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
 
b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
new file mode 100644
index 0000000..6475f9b
--- /dev/null
+++ 
b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
@@ -0,0 +1,226 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+from os import path
+from itertools import repeat
+
+# Add modules module to the pythonpath.
+sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
+
+import unittest
+from mock import *
+import plpy_mock as plpy
+
+m4_changequote(`<!', `!>')
+
+class VecColsHelperTestSuite(unittest.TestCase):
+    def setUp(self):
+        patches = {
+            'plpy': plpy
+        }
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        self.module_patcher = patch.dict('sys.modules', patches)
+        self.module_patcher.start()
+
+        import utilities.transform_vec_cols
+        self.module = utilities.transform_vec_cols
+        self.subject = self.module.vec_cols_helper()
+
+        self.default_source_table = "source"
+        self.does_not_matter = 'does_not_matter'
+        self.default_output_table = "output"
+
+    def tearDown(self):
+        self.module_patcher.stop()
+
+    def test_get_cols_as_list_null_cols_to_output(self):
+        cols_to_keep = self.subject.get_cols_as_list(None, 
self.does_not_matter, 'c')
+        self.assertEqual([], cols_to_keep)
+
+    def test_get_cols_as_list_star_cols_to_output_without_others(self):
+        self.module.get_cols = Mock(return_value = ['a', 'b', 'c'])
+        cols_to_keep = self.subject.get_cols_as_list('*')
+        self.assertEqual([], cols_to_keep)
+        cols_to_keep = self.subject.get_cols_as_list('*', None, 'foo')
+        self.assertEqual([], cols_to_keep)
+
+    def test_get_cols_as_list_star_cols_to_output_with_others(self):
+        self.module.get_cols = Mock(return_value = ['a', 'b', 'c'])
+        cols_to_keep = self.subject.get_cols_as_list('*', 
self.does_not_matter, 'c')
+        self.assertEqual(['a', 'b'], cols_to_keep)
+
+    def test_get_cols_as_list_exclude_single_col(self):
+        cols_to_keep = self.subject.get_cols_as_list('a', 
self.does_not_matter, 'c')
+        self.assertEqual(['a'], cols_to_keep)
+        cols_to_keep = self.subject.get_cols_as_list('a, b', 
self.does_not_matter, 'c')
+        self.assertEqual(['a', 'b'], cols_to_keep)
+        cols_to_keep = self.subject.get_cols_as_list('a, b, c', 
self.does_not_matter, 'c')
+        self.assertEqual(['a', 'b'], cols_to_keep)
+
+    def test_get_cols_as_list_exclude_comma_sep_col_list(self):
+        self.module.get_cols = Mock(return_value=['a', 'b', 'c', 'd'])
+        self.assertEqual(['c', 'd'],
+            self.subject.get_cols_as_list("*", self.does_not_matter, 'a, b'))
+        self.assertEqual(['a'],
+            self.subject.get_cols_as_list("a, b, c", self.does_not_matter, 'b, 
c'))
+        self.assertEqual(['a', 'b'],
+            self.subject.get_cols_as_list("a, b, c", self.does_not_matter, 'c, 
e'))
+
+    def test_get_cols_as_list_all_cols_to_output(self):
+        self.module.get_cols = Mock(return_value = ['a', 'b', 'c'])
+        cols_to_keep = self.subject.get_cols_as_list('*', self.does_not_matter)
+        self.assertEqual(['a', 'b', 'c'], cols_to_keep)
+
+    def test_get_cols_as_list_exclude_none_with_star(self):
+        self.module.get_cols = Mock(return_value=['a', 'b', 'c'])
+        self.assertEqual(['a', 'b', 'c'],
+            self.subject.get_cols_as_list("*", self.does_not_matter, None))
+        self.assertEqual(['a', 'b', 'c'],
+            self.subject.get_cols_as_list("*", self.does_not_matter, ""))
+
+class Vec2ColsTestSuite(unittest.TestCase):
+    def setUp(self):
+        patches = {
+            'plpy': plpy
+        }
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        self.module_patcher = patch.dict('sys.modules', patches)
+        self.module_patcher.start()
+
+        import utilities.transform_vec_cols
+        self.module = utilities.transform_vec_cols
+        self.subject = self.module.vec2cols()
+
+        self.default_source_table = "source"
+        self.does_not_matter = 'does_not_matter'
+        self.default_output_table = "output"
+
+        self.subject.input_tbl_valid = Mock()
+        self.subject.output_tbl_valid = Mock()
+        self.subject.cols_in_tbl_valid = Mock()
+
+    def tearDown(self):
+        self.module_patcher.stop()
+
+    def test_get_names_for_split_output_cols_feature_names_none(self):
+        self.plpy_mock_execute.return_value = [{"n_x": 3}]
+        new_cols = 
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', None)
+        self.assertEqual(['f1', 'f2', 'f3'], new_cols)
+
+    def test_get_names_for_split_output_cols_feature_names_not_none(self):
+        self.plpy_mock_execute.return_value = [{"n_x": 3}]
+        new_cols = 
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', ['a', 'b', 'c'])
+        self.assertEqual(['a', 'b', 'c'], new_cols)
+
+    def test_get_names_for_split_output_cols_array_upper_returns_none(self):
+        self.plpy_mock_execute.return_value = [{"n_x": None}]
+        with self.assertRaises(plpy.PLPYException):
+            
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', None)
+
+    def 
test_get_names_for_split_output_cols_feature_names_size_matches_vector_col(self):
+        self.plpy_mock_execute.return_value = [{"n_x": 3}]
+        with self.assertRaises(plpy.PLPYException):
+            
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', ['a', 'b'])
+
+    def test_validate_output_cols_max_cols_exception(self):
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.validate_output_cols(range(0, 1550), range(1550, 
1650))
+
+    def test_validate_output_cols_duplicate_cols_exception(self):
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.validate_output_cols(['a'], ['a'])
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.validate_output_cols(['a', 'b', 'c'], ['a', 'b'])
+
+    def test_validate_output_cols_passes(self):
+        self.subject.validate_output_cols(range(0, 1450), range(1450, 1500))
+        self.subject.validate_output_cols(['a', 'b', 'c'], ['d', 'e'])
+
+class Cols2VecTestSuite(unittest.TestCase):
+    def setUp(self):
+        patches = {
+            'plpy': plpy
+        }
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        self.module_patcher = patch.dict('sys.modules', patches)
+        self.module_patcher.start()
+
+        import utilities.transform_vec_cols
+        self.module = utilities.transform_vec_cols
+        self.subject = self.module.cols2vec()
+
+        self.default_source_table = "source"
+        self.does_not_matter = 'does_not_matter'
+        self.default_output_table = "output"
+
+        self.module.input_tbl_valid = Mock()
+        self.module.output_tbl_valid = Mock()
+        self.module.cols_in_tbl_valid = Mock()
+
+    def tearDown(self):
+        self.module_patcher.stop()
+
+    def test_validate_args_null_list_of_features_exception(self):
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.validate_args(self.does_not_matter,
+                                                self.does_not_matter,
+                                                None,
+                                                self.does_not_matter,
+                                                self.does_not_matter)
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.validate_args(self.does_not_matter,
+                                                self.does_not_matter,
+                                                " ",
+                                                self.does_not_matter,
+                                                self.does_not_matter)
+
+    def test_validate_args_star_features_to_exclude_exception(self):
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.validate_args(self.does_not_matter,
+                                                self.does_not_matter,
+                                                self.does_not_matter,
+                                                ' * ',
+                                                self.does_not_matter)
+
+    def 
test_get_and_validate_feature_types_with_array_list_of_features_exception(self):
+        self.module.get_cols_and_types = Mock(return_value=list(zip(['a', 
'b'], ['s1', 's1[]'])))
+        self.subject.features_to_nest = ['a','b']
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.get_and_validate_feature_types(self.does_not_matter)
+        self.plpy_mock_execute.return_value = Mock(return_value=list(zip(['a', 
'b'], ['s1[]', 's1[]'])))
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.get_and_validate_feature_types(self.does_not_matter)
+        self.plpy_mock_execute.return_value = Mock(return_value=list(zip(['a', 
'b'], ['s1[]', 's1'])))
+        with self.assertRaises(plpy.PLPYException):
+            self.subject.get_and_validate_feature_types(self.does_not_matter)
+                                                
+    def 
test_get_and_validate_feature_types_nonstar_with_scalar_list_of_features(self):
+        self.module.get_cols_and_types = Mock(return_value=list(zip(['a', 
'b'], ['s1', 's1'])))
+        self.subject.features_to_nest = ['a','b']
+        self.subject.get_and_validate_feature_types(self.does_not_matter)
+
+if __name__ == '__main__':
+    unittest.main()

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/transform_vec_cols.py_in 
b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
new file mode 100644
index 0000000..bf9229e
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
@@ -0,0 +1,496 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from control import MinWarning
+from internal.db_utils import is_col_1d_array
+from internal.db_utils import quote_literal
+from utilities import _assert
+from utilities import add_postfix
+from utilities import ANY_ARRAY
+from utilities import is_valid_psql_type
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from validate_args import is_var_valid
+from validate_args import explicit_bool_to_text
+from validate_args import get_cols
+from validate_args import get_cols_and_types
+from validate_args import get_expr_type
+from validate_args import input_tbl_valid
+from validate_args import output_tbl_valid
+from validate_args import table_exists
+
+class vec_cols_helper:
+    def __init__(self):
+        self.all_cols = None
+
+    def get_cols_as_list(self, cols_to_process, source_table=None, 
exclude_cols=None):
+        """
+            Get a list of columns based on the value of cols_to_process
+            Args:
+            @param cols_to_process: str, Either a * or a comma-separated list 
of col names
+            @param source_table: str, optional. Source table name
+            @param exclude_cols: str, optional. Comma-separated list of the 
col(s) to exclude
+                                 from the source table, only used if 
cols_to_process is *
+            Returns:
+            A list of column names (or an empty list)
+        """
+        # If cols_to_process is empty/None, return empty list
+        if not cols_to_process:
+            return []
+        if cols_to_process.strip() != "*":
+            # If cols_to_process is a comma separated list of names, return 
list
+            # of column names in cols_to_process.
+            return [col for col in split_quoted_delimited_str(cols_to_process)
+                    if col not in split_quoted_delimited_str(exclude_cols)]
+        if source_table:
+            if not self.all_cols:
+                self.all_cols = get_cols(source_table)
+            return [col for col in self.all_cols
+                    if col not in split_quoted_delimited_str(exclude_cols)]
+        return []
+
+class vec2cols:
+    def __init__(self):
+        self.get_cols_helper = vec_cols_helper()
+        self.module_name = self.__class__.__name__
+
+    def validate_args(self, source_table, output_table, vector_col, 
feature_names,
+                      cols_to_output):
+        """
+            Validate args for vec2cols
+        """
+        input_tbl_valid(source_table, self.module_name)
+        output_tbl_valid(output_table, self.module_name)
+        is_var_valid(source_table, cols_to_output)
+        is_var_valid(source_table, vector_col)
+        _assert(is_valid_psql_type(get_expr_type(vector_col, source_table), 
ANY_ARRAY),
+            "{0}: vector_col should refer to an 
array.".format(self.module_name))
+        _assert(is_col_1d_array(source_table, vector_col),
+            "{0}: vector_col must be a 1-dimensional 
array.".format(self.module_name))
+
+    def get_names_for_split_output_cols(self, source_table, vector_col, 
feature_names):
+        """
+            Get list of names for the newly-split columns to include in the
+            output table.
+            Args:
+            @param: source_table, str. Source table
+            @param: vector_col, str. Column name containing the array input
+            @param: feature_names, list. Python list of the feature names to
+                    use for the split elements in the vector_col array
+        """
+        query = """
+            SELECT array_upper({0}, 1) AS n_x
+            FROM {1}
+            LIMIT 1
+        """.format(vector_col, source_table)
+        result = plpy.execute(query)[0]["n_x"]
+        if not result:
+            plpy.error('{0}: Column to split ({1}) must not be an empty array'
+                .format(self.module_name, vector_col))
+        if not feature_names:
+            # Create custom col names for output columns, with prefix "f".
+            feature_names = ["f{0}".format(i+1) for i in range(result)]
+        else:
+            # Check if the array dimension is equal to the number of col names
+            # specified in feature_names.
+            _assert(result == len(feature_names),
+                    "{0}: Mismatch between size of vector_col and number of "
+                    "cols in feature_names.".format(self.module_name))
+        return feature_names
+
+    def validate_output_cols(self, features_to_unnest, cols_to_keep):
+        # If there are more than 1600 columns for the output table, we give a
+        # warning as it might give an error
+        MAX_OUTPUT_COLUMN_COUNT = 1600
+        _assert(len(features_to_unnest)+len(cols_to_keep) < 
MAX_OUTPUT_COLUMN_COUNT,
+                "{0}: The output exceeds the max number of columns that " +
+                "can be created ({1})".format(self.module_name, 
MAX_OUTPUT_COLUMN_COUNT))
+        # Check if newly created col names have the same name as existing cols
+        duplicate_col_names = 
set(features_to_unnest).intersection(set(cols_to_keep))
+        _assert(len(duplicate_col_names) == 0,
+                "{0}: Conflicting column names. Column names in source "
+                "table cannot be {1}".format(self.module_name,
+                                            list(duplicate_col_names)))
+
+    def vec2cols(self, schema_madlib, source_table, output_table,
+                 vector_col, feature_names, cols_to_output, **kwargs):
+        """
+            Split up a column of array entries into multiple columns, each 
column
+            corresponding to one array position
+            Args:
+            @param: schema_madlib, str. The schema with madlib installed
+            @param: source_table, str. The source table
+            @param: output_table, str. The output table
+            @param: vector_col, str. The column with array entries to split up
+            @param: feature_names, list. Python list of the feature names to 
use
+                    for the split elements in the vector_col array
+            @param: cols_to_output, str. Comma-separated list of the columns in
+                    the source_table to include in the output_table
+        """
+        self.validate_args(source_table, output_table, vector_col, 
feature_names,
+                           cols_to_output)
+
+        # Get names of columns to use for the split vector_col
+        features_to_unnest = self.get_names_for_split_output_cols(source_table,
+                             vector_col, feature_names)
+        cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output,
+                       source_table)
+
+        self.validate_output_cols(features_to_unnest, cols_to_keep)
+
+        # Construct the output query and populate the output table with all the
+        # correct parameters
+        select_new_cols = ', '.join(["{0}[{1}] AS {2}".format(vector_col,
+            i+1, features_to_unnest[i]) for i in 
range(len(features_to_unnest))])
+        cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep 
else ''
+        query = """
+        CREATE TABLE {output_table} AS
+        SELECT {cols_from_src_table} {select_new_cols}
+        FROM {source_table}
+        """.format(**locals())
+        plpy.execute(query)
+
+    def vec2cols_help_message(self, schema_madlib, message, **kwargs):
+        """
+            Help message for vec2cols function
+        """
+        summary_string = """
+-----------------------------------------------------------------------------------
+                                    SUMMARY
+-----------------------------------------------------------------------------------
+Functionality: Vector to Columns
+
+The MADlib vec2cols function enables the user to split up a single column into
+multiple columns, given that the input column contains array entries. For 
example,
+if the input column contained ARRAY[1, 2, 3] in one of its rows, the output 
table
+will contain 3 different columns, one for each element of the array.
+
+For more details on function usage:
+    SELECT {schema_madlib}.vec2cols('usage');
+
+For a small example on using the function:
+    SELECT {schema_madlib}.vec2cols('example');
+    """.format(schema_madlib=schema_madlib)
+
+        usage_string = """
+-----------------------------------------------------------------------------------
+                                    USAGE
+-----------------------------------------------------------------------------------
+SELECT {schema_madlib}.vec2cols(
+    'source_table',     -- str, Name of the source table that contains the data
+    'output_table',     -- str, Name of the output view or table
+    'vector_col',       -- str, Name of the array entry column to be split
+    'feature_names',    -- array, Optional parameter to provide a text array of
+                        -- the feature names for the newly split columns (if 
not
+                        -- provided, default names f0, f1, ... will be used)
+    'cols_to_output'    -- str, Optional parameter to specify any other columns
+                        -- in the source_table to include in the output_table
+                        -- (default none of them, also supports '*' as input)
+    """.format(schema_madlib=schema_madlib)
+
+        example_string = """
+-----------------------------------------------------------------------------------
+                                    EXAMPLE
+-----------------------------------------------------------------------------------
+-- Create an input data set:
+
+DROP TABLE IF EXISTS golf CASCADE;
+CREATE TABLE golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text,
+    observation_weight double precision
+);
+INSERT INTO golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 
5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 
5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
+
+-- Call the vec2cols function on the 'clouds_airquality' column, to split it up
+
+DROP TABLE IF EXISTS output_table;
+SELECT {schema_madlib}.vec2cols(
+    'golf',               -- source table
+    'output_table',       -- output table
+    'clouds_airquality',  -- column with array entries to split
+    ARRAY['a', 'b'],      -- feature_names array (will use 'a' to name the 
first new column, and 'b' for the second)
+    '"OUTLOOK", id'       -- columns to keep from source table (as a 
comma-separated list)
+);
+
+SELECT * FROM output_table ORDER BY id;
+ OUTLOOK  | id |   a    |     b
+----------+----+--------+-----------
+ sunny    |  1 | none   | unhealthy
+ sunny    |  2 | none   | moderate
+ overcast |  3 | low    | moderate
+ rain     |  4 | low    | moderate
+ rain     |  5 | medium | good
+ rain     |  6 | low    | unhealthy
+ overcast |  7 | medium | moderate
+ sunny    |  8 | high   | unhealthy
+ sunny    |  9 | high   | good
+ rain     | 10 | medium | good
+ sunny    | 11 | none   | good
+ overcast | 12 | medium | moderate
+ overcast | 13 | medium | moderate
+ rain     | 14 | low    | unhealthy
+(14 rows)
+""".format(schema_madlib=schema_madlib)
+
+        if not message:
+            return summary_string
+        elif message.lower() in ('usage', 'help', '?'):
+            return usage_string
+        elif message.lower() in ('example', 'examples'):
+            return example_string
+        else:
+            return """
+No such option. Use "SELECT {schema_madlib}.vec2cols()" for help.
+        """.format(schema_madlib=schema_madlib)
+
+class cols2vec:
+    def __init__(self):
+        self.get_cols_helper = vec_cols_helper()
+        self.module_name = self.__class__.__name__
+
+    def validate_args(self, source_table, output_table,
+                      list_of_features, list_of_features_to_exclude, 
cols_to_output):
+        """
+            Function to validate input parameters
+        """
+        input_tbl_valid(source_table, self.module_name)
+        output_tbl_valid(output_table, self.module_name)
+
+        _assert(list_of_features and list_of_features.strip(), "{0}: List of "
+                    "features cannot be empty".format(self.module_name))
+        if list_of_features.strip() != '*':
+            is_var_valid(source_table, list_of_features)
+
+        if list_of_features_to_exclude:
+            if list_of_features_to_exclude.strip() == "*":
+                plpy.error("{0}: Cannot exclude all columns from being "
+                    "features".format(self.module_name))
+            elif list_of_features.strip() != '*':
+                plpy.info("{0} NOTICE: will exclude given column(s) even 
though "
+                    "list of features was not *".format(self.module_name))
+
+        is_var_valid(source_table, list_of_features_to_exclude)
+        is_var_valid(source_table, cols_to_output)
+
+    def get_and_validate_feature_types(self, source_table):
+        """
+            This function validates the types of all of the features_to_nest,
+            disallowing any features of type array. Lets underlying platform
+            decide whether to allow 2 elements of different types be put into
+            an array together (ex. integer and text features, if put together,
+            will yield error by the underlying platform)
+        """
+        all_cols_and_types = get_cols_and_types(source_table)
+        distinct_types = set([col_type[1] for col_type in all_cols_and_types
+            if col_type[0] in self.features_to_nest])
+        _assert(not any(is_valid_psql_type(expr_type, ANY_ARRAY)
+                    for expr_type in distinct_types),
+                "{0}: Feature columns to nest cannot be of type array"
+                .format(self.module_name))
+
+        if len(distinct_types) != 1 and 'boolean' in distinct_types:
+            self.features_to_nest = explicit_bool_to_text(source_table, 
self.features_to_nest, self.schema_madlib)
+
+    def cols2vec(self, schema_madlib, source_table, output_table, 
list_of_features,
+                 list_of_features_to_exclude=None, cols_to_output=None, 
**kwargs):
+        """
+        Args:
+            @param schema_madlib: Name of MADlib schema
+            @param source_table: Name of table containing input data
+            @param output_table: Name of table to output the results
+            @param list_of_features: Comma-separated string of column names or
+                                     expressions to put into feature array.
+                                     Can also be a '*' implying all columns
+                                     are to be put into feature array
+            @param list_of_features_to_exclude: Comma-separated string of 
column names
+                                                to exclude from the feature 
array
+            @param cols_to_output: Comma-separated string of column names
+                                   from the source table to keep in the output 
table,
+                                   in addition to the feature array
+        """
+        self.schema_madlib = schema_madlib
+        self.validate_args(source_table, output_table, list_of_features,
+                           list_of_features_to_exclude, cols_to_output)
+
+        self.features_to_nest = 
self.get_cols_helper.get_cols_as_list(list_of_features,
+                                source_table, list_of_features_to_exclude)
+        self.get_and_validate_feature_types(source_table)
+
+        cols_to_keep = ', 
'.join(self.get_cols_helper.get_cols_as_list(cols_to_output,
+                       source_table))+", " if cols_to_output else ''
+
+        feature_list_str = "ARRAY[ {val} ]".format(val=','.join(map(str, 
self.features_to_nest)))
+        plpy.execute("""
+                CREATE TABLE {output_table} AS
+                    SELECT {cols_to_keep}
+                           {feature_list_str} AS feature_vector
+                    FROM {source_table}
+                """.format(**locals()))
+
+        feature_cols = py_list_to_sql_string(
+                [quote_literal(f) for f in self.features_to_nest], "TEXT", 
True)
+
+        output_table_summary = add_postfix(output_table, "_summary")
+        # Dollar-quote the text to allow single-quotes without escaping
+        dq = "$__MADLIB_OUTER__$"
+        plpy.execute("""
+            CREATE TABLE {output_table_summary} AS
+            SELECT
+                {dq}{source_table}{dq}::TEXT AS source_table,
+                {dq}{list_of_features}{dq}::TEXT AS list_of_features,
+                {dq}{list_of_features_to_exclude}{dq}::TEXT AS 
list_of_features_to_exclude,
+                {feature_cols} AS feature_names
+            """.format(**locals()))
+
+    def cols2vec_help_message(self, schema_madlib, message, **kwargs):
+        """
+            Help message for cols2vec function
+        """
+        summary_string = """
+-----------------------------------------------------------------------------------
+                                    SUMMARY
+-----------------------------------------------------------------------------------
+Functionality: Columns to Vector
+
+The MADlib cols2vec function, given a table with a number of feature columns, 
will
+create an output table that contains the feature columns in an array. A 
summary table
+will also be created that contains the names of the features combined into 
array,
+so that this process can be reversed using the function vec2cols.
+
+For more details on function usage:
+    SELECT {schema_madlib}.cols2vec('usage');
+
+For a small example on using the function:
+    SELECT {schema_madlib}.cols2vec('example');
+    """.format(schema_madlib=schema_madlib)
+
+        usage_string = """
+-----------------------------------------------------------------------------------
+                                    USAGE
+-----------------------------------------------------------------------------------
+SELECT {schema_madlib}.cols2vec(
+    'source_table',     -- str, Name of the source table that contains the data
+    'output_table',     -- str, Name of the output view or table
+    'list_of_features', -- Comma-separated string of column names or
+                           expressions to put into feature array.
+                           Can also be a '*' implying all columns
+                           are to be put into feature array
+    'list_of_features_to_exclude', -- Comma-separated string of column names
+                                      to exclude from the feature array
+    'cols_to_output',   -- Comma-separated string of column names
+                           from the source table to keep in the output table,
+                           in addition to the feature array
+    """.format(schema_madlib=schema_madlib)
+
+        example_string = """
+-----------------------------------------------------------------------------------
+                                    EXAMPLE
+-----------------------------------------------------------------------------------
+-- Create an input data set:
+
+DROP TABLE IF EXISTS golf CASCADE;
+CREATE TABLE golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text,
+    observation_weight double precision
+);
+INSERT INTO golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 
5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 
5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
+
+-- Call the cols2vec function on the temperature and humidity columns, to nest 
them.
+-- Also include "windy" in features list, but exclude this feature using the 
exclude field
+
+DROP TABLE IF EXISTS output_table;
+SELECT {schema_madlib}.cols2vec(
+    'golf',               -- source table
+    'output_table',       -- output table
+    'windy, temperature, humidity',  -- feature list to nest
+    'windy',              -- features to exclude, from the above feature list
+    '"OUTLOOK", id'       -- columns to keep from source table (as a 
comma-separated list)
+);
+
+SELECT * FROM output_table ORDER BY id;
+ OUTLOOK  | id | feature_vector
+----------+----+----------------
+ sunny    |  1 | {{85,85}}
+ sunny    |  2 | {{80,90}}
+ overcast |  3 | {{83,78}}
+ rain     |  4 | {{70,96}}
+ rain     |  5 | {{68,80}}
+ rain     |  6 | {{65,70}}
+ overcast |  7 | {{64,65}}
+ sunny    |  8 | {{72,95}}
+ sunny    |  9 | {{69,70}}
+ rain     | 10 | {{75,80}}
+ sunny    | 11 | {{75,70}}
+ overcast | 12 | {{72,90}}
+ overcast | 13 | {{81,75}}
+ rain     | 14 | {{71,80}}
+(14 rows)
+""".format(schema_madlib=schema_madlib)
+
+        if not message:
+            return summary_string
+        elif message.lower() in ('usage', 'help', '?'):
+            return usage_string
+        elif message.lower() in ('example', 'examples'):
+            return example_string
+        else:
+            return """
+No such option. Use "SELECT {schema_madlib}.cols2vec()" for help.
+        """.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in 
b/src/ports/postgres/modules/utilities/utilities.py_in
index 87445a7..3bd3aaf 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -221,11 +221,9 @@ def is_psql_numeric_type(arg, exclude=None):
     Returns:
         Boolean. Returns if 'arg' is one of the numeric types
     """
-    numeric_types = set(['smallint', 'integer', 'bigint', 'decimal', 'numeric',
-                         'real', 'double precision', 'serial', 'bigserial'])
     if exclude is None:
         exclude = []
-    to_check_types = numeric_types - set(exclude)
+    to_check_types = NUMERIC - set(exclude)
     return (arg in to_check_types)
 # -------------------------------------------------------------------------
 
@@ -240,11 +238,10 @@ def is_psql_int_type(arg, exclude=None):
     Returns:
         Boolean. Returns if 'arg' is one of the numeric types
     """
-    int_types = set(['smallint', 'integer', 'bigint'])
     if exclude is None:
-        to_check_types = int_types
+        to_check_types = INTEGER
     else:
-        to_check_types = int_types - set(exclude)
+        to_check_types = INTEGER - set(exclude)
     return (arg in to_check_types)
 # -------------------------------------------------------------------------
 
@@ -260,10 +257,7 @@ def is_psql_char_type(arg, exclude_list=[]):
     """
     if not isinstance(exclude_list, list):
         exclude_list = [exclude_list]
-
-    text_types = set(['text', 'varchar', 'character varying',
-                      'char', 'character'])
-    return arg in text_types - set(exclude_list)
+    return arg in TEXT - set(exclude_list)
 
 
 def is_psql_boolean_type(arg):

[2/2] madlib git commit: Utilities: Add module transform_vec_cols for column-vector conversion

Reply via email to