update jupyter notebooks for 1dot15
Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/acd339f6 Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/acd339f6 Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/acd339f6 Branch: refs/heads/asf-site Commit: acd339f65ab5b6b9c2f95ca370cc1fb8460fd7c6 Parents: 5fa1ac0 Author: Frank McQuillan <fmcquil...@pivotal.io> Authored: Wed Aug 1 13:13:25 2018 -0700 Committer: Frank McQuillan <fmcquil...@pivotal.io> Committed: Wed Aug 1 13:13:25 2018 -0700 ---------------------------------------------------------------------- .../Column-vector-operations-v1.ipynb | 2553 ++++++++++ .../Covariance-and-correlation-v1.ipynb | 1318 +++++ community-artifacts/Decision-trees-v1.ipynb | 3051 ------------ community-artifacts/Decision-trees-v2.ipynb | 3208 ++++++++++++ community-artifacts/Elastic-net-v2.ipynb | 2078 -------- community-artifacts/Elastic-net-v3.ipynb | 2049 ++++++++ community-artifacts/KNN-v4.ipynb | 857 ++++ community-artifacts/MLP-mnist-v3.ipynb | 1329 +++++ community-artifacts/MLP-v4.ipynb | 4588 ++++++++++++++++++ .../Novelty-detection-demo-1.ipynb | 478 -- community-artifacts/Random-forest-v1.ipynb | 2899 ----------- community-artifacts/Random-forest-v2.ipynb | 3082 ++++++++++++ .../SVM-novelty-detection-v2.ipynb | 511 ++ community-artifacts/SVM-v1.ipynb | 2806 +++++++++++ .../Stratified-sampling-v2.ipynb | 672 +++ community-artifacts/kNN-v3.ipynb | 857 ---- community-artifacts/mlp-mnist-v2.ipynb | 1154 ----- community-artifacts/mlp-v3.ipynb | 4584 ----------------- .../stratified-sampling-v1.ipynb | 672 --- 19 files changed, 22973 insertions(+), 15773 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Column-vector-operations-v1.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/Column-vector-operations-v1.ipynb b/community-artifacts/Column-vector-operations-v1.ipynb new file mode 100644 index 0000000..147b328 --- /dev/null +++ b/community-artifacts/Column-vector-operations-v1.ipynb @@ -0,0 +1,2553 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Column and vector operations\n", + "\n", + "Column and vector operations were added in 1.15.\n", + "\n", + "* cols2vec\n", + "* vec2cols\n", + "* drop columns" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] + } + ], + "source": [ + "%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Greenplum Database 5.4.0 on GCP (demo machine)\n", + "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql select madlib.version();\n", + "#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cols2vec\n", + "\n", + "Convert feature columns in a table into an array in a single column.\n", + "\n", + "Given a table with a number of feature columns, this function will create an output table that contains the feature columns in an array. A summary table will also be created that contains the names of the features combined into array, so that this process can be reversed using the function vec2cols." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load sample data" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>temperature</th>\n", + " <th>humidity</th>\n", + " <th>Temp_Humidity</th>\n", + " <th>clouds_airquality</th>\n", + " <th>windy</th>\n", + " <th>class</th>\n", + " <th>observation_weight</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>85.0</td>\n", + " <td>85.0</td>\n", + " <td>[85.0, 85.0]</td>\n", + " <td>[u'none', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>80.0</td>\n", + " <td>90.0</td>\n", + " <td>[80.0, 90.0]</td>\n", + " <td>[u'none', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>83.0</td>\n", + " <td>78.0</td>\n", + " <td>[83.0, 78.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>70.0</td>\n", + " <td>96.0</td>\n", + " <td>[70.0, 96.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>68.0</td>\n", + " <td>80.0</td>\n", + " <td>[68.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>65.0</td>\n", + " <td>70.0</td>\n", + " <td>[65.0, 70.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>64.0</td>\n", + " <td>65.0</td>\n", + " <td>[64.0, 65.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>72.0</td>\n", + " <td>95.0</td>\n", + " <td>[72.0, 95.0]</td>\n", + " <td>[u'high', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>69.0</td>\n", + " <td>70.0</td>\n", + " <td>[69.0, 70.0]</td>\n", + " <td>[u'high', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>75.0</td>\n", + " <td>80.0</td>\n", + " <td>[75.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>75.0</td>\n", + " <td>70.0</td>\n", + " <td>[75.0, 70.0]</td>\n", + " <td>[u'none', u'good']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>72.0</td>\n", + " <td>90.0</td>\n", + " <td>[72.0, 90.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>81.0</td>\n", + " <td>75.0</td>\n", + " <td>[81.0, 75.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>71.0</td>\n", + " <td>80.0</td>\n", + " <td>[71.0, 80.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", + " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n", + " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n", + " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n", + " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", + " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n", + " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", + " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", + " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n", + " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", + " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n", + " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", + " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n", + " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS golf CASCADE;\n", + "\n", + "CREATE TABLE golf (\n", + " id integer NOT NULL,\n", + " \"OUTLOOK\" text,\n", + " temperature double precision,\n", + " humidity double precision,\n", + " \"Temp_Humidity\" double precision[],\n", + " clouds_airquality text[],\n", + " windy boolean,\n", + " class text,\n", + " observation_weight double precision\n", + ");\n", + "\n", + "INSERT INTO golf VALUES\n", + "(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),\n", + "(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),\n", + "(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),\n", + "(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),\n", + "(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", + "(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),\n", + "(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", + "(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),\n", + "(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),\n", + "(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", + "(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),\n", + "(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", + "(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),\n", + "(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);\n", + "\n", + "SELECT * FROM golf ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Create feature array\n", + "Combine the temperature and humidity columns into a single array feature." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>feature_vector</th>\n", + " </tr>\n", + " <tr>\n", + " <td>[85.0, 85.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[68.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[69.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[81.0, 75.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[80.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[65.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[75.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[71.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[83.0, 78.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[64.0, 65.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[75.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[70.0, 96.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[72.0, 95.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>[72.0, 90.0]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[([85.0, 85.0],),\n", + " ([68.0, 80.0],),\n", + " ([69.0, 70.0],),\n", + " ([81.0, 75.0],),\n", + " ([80.0, 90.0],),\n", + " ([65.0, 70.0],),\n", + " ([75.0, 80.0],),\n", + " ([71.0, 80.0],),\n", + " ([83.0, 78.0],),\n", + " ([64.0, 65.0],),\n", + " ([75.0, 70.0],),\n", + " ([70.0, 96.0],),\n", + " ([72.0, 95.0],),\n", + " ([72.0, 90.0],)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", + "\n", + "SELECT madlib.cols2vec(\n", + " 'golf',\n", + " 'cols2vec_result',\n", + " 'temperature, humidity'\n", + ");\n", + "\n", + "SELECT * FROM cols2vec_result;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View the summary table:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>source_table</th>\n", + " <th>list_of_features</th>\n", + " <th>list_of_features_to_exclude</th>\n", + " <th>feature_names</th>\n", + " </tr>\n", + " <tr>\n", + " <td>golf</td>\n", + " <td>temperature, humidity</td>\n", + " <td>None</td>\n", + " <td>[u'temperature', u'humidity']</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM cols2vec_result_summary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Create feature array and keep some source table columns\n", + "Combine the temperature and humidity columns and keep 2 other columns from source_table." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>feature_vector</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>[85.0, 85.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>[80.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>[83.0, 78.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>[70.0, 96.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>[68.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>[65.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>[64.0, 65.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>[72.0, 95.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>[69.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>[75.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>[75.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>[72.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>[81.0, 75.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>[71.0, 80.0]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', [85.0, 85.0]),\n", + " (2, u'sunny', [80.0, 90.0]),\n", + " (3, u'overcast', [83.0, 78.0]),\n", + " (4, u'rain', [70.0, 96.0]),\n", + " (5, u'rain', [68.0, 80.0]),\n", + " (6, u'rain', [65.0, 70.0]),\n", + " (7, u'overcast', [64.0, 65.0]),\n", + " (8, u'sunny', [72.0, 95.0]),\n", + " (9, u'sunny', [69.0, 70.0]),\n", + " (10, u'rain', [75.0, 80.0]),\n", + " (11, u'sunny', [75.0, 70.0]),\n", + " (12, u'overcast', [72.0, 90.0]),\n", + " (13, u'overcast', [81.0, 75.0]),\n", + " (14, u'rain', [71.0, 80.0])]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", + "\n", + "SELECT madlib.cols2vec(\n", + " 'golf',\n", + " 'cols2vec_result',\n", + " 'temperature, humidity',\n", + " NULL,\n", + " 'id, \"OUTLOOK\"'\n", + ");\n", + "\n", + "SELECT * FROM cols2vec_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View the summary table:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>source_table</th>\n", + " <th>list_of_features</th>\n", + " <th>list_of_features_to_exclude</th>\n", + " <th>feature_names</th>\n", + " </tr>\n", + " <tr>\n", + " <td>golf</td>\n", + " <td>temperature, humidity</td>\n", + " <td>None</td>\n", + " <td>[u'temperature', u'humidity']</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM cols2vec_result_summary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Exclude all columns that are not double precision\n", + "Combine all columns, excluding all columns that are not of type double precision" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>feature_vector</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>[85.0, 85.0, 5.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>[80.0, 90.0, 5.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>[83.0, 78.0, 1.5]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>[70.0, 96.0, 1.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>[68.0, 80.0, 1.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>[65.0, 70.0, 1.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>[64.0, 65.0, 1.5]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>[72.0, 95.0, 5.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>[69.0, 70.0, 5.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>[75.0, 80.0, 1.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>[75.0, 70.0, 5.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>[72.0, 90.0, 1.5]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>[81.0, 75.0, 1.5]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>[71.0, 80.0, 1.0]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', [85.0, 85.0, 5.0]),\n", + " (2, u'sunny', [80.0, 90.0, 5.0]),\n", + " (3, u'overcast', [83.0, 78.0, 1.5]),\n", + " (4, u'rain', [70.0, 96.0, 1.0]),\n", + " (5, u'rain', [68.0, 80.0, 1.0]),\n", + " (6, u'rain', [65.0, 70.0, 1.0]),\n", + " (7, u'overcast', [64.0, 65.0, 1.5]),\n", + " (8, u'sunny', [72.0, 95.0, 5.0]),\n", + " (9, u'sunny', [69.0, 70.0, 5.0]),\n", + " (10, u'rain', [75.0, 80.0, 1.0]),\n", + " (11, u'sunny', [75.0, 70.0, 5.0]),\n", + " (12, u'overcast', [72.0, 90.0, 1.5]),\n", + " (13, u'overcast', [81.0, 75.0, 1.5]),\n", + " (14, u'rain', [71.0, 80.0, 1.0])]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", + "\n", + "SELECT madlib.cols2vec(\n", + " 'golf',\n", + " 'cols2vec_result',\n", + " '*',\n", + " '\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id',\n", + " 'id, \"OUTLOOK\"'\n", + ");\n", + "\n", + "SELECT * FROM cols2vec_result ORDER BY id;" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>source_table</th>\n", + " <th>list_of_features</th>\n", + " <th>list_of_features_to_exclude</th>\n", + " <th>feature_names</th>\n", + " </tr>\n", + " <tr>\n", + " <td>golf</td>\n", + " <td>*</td>\n", + " <td>\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id</td>\n", + " <td>[u'temperature', u'humidity', u'observation_weight']</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'golf', u'*', u'\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id', [u'temperature', u'humidity', u'observation_weight'])]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM cols2vec_result_summary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Keep all columns from source table\n", + "Combine the temperature and humidity columns, exclude windy, and keep all of the columns from the source table." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>temperature</th>\n", + " <th>humidity</th>\n", + " <th>Temp_Humidity</th>\n", + " <th>clouds_airquality</th>\n", + " <th>windy</th>\n", + " <th>class</th>\n", + " <th>observation_weight</th>\n", + " <th>feature_vector</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>85.0</td>\n", + " <td>85.0</td>\n", + " <td>[85.0, 85.0]</td>\n", + " <td>[u'none', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " <td>[85.0, 85.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>80.0</td>\n", + " <td>90.0</td>\n", + " <td>[80.0, 90.0]</td>\n", + " <td>[u'none', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " <td>[80.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>83.0</td>\n", + " <td>78.0</td>\n", + " <td>[83.0, 78.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>[83.0, 78.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>70.0</td>\n", + " <td>96.0</td>\n", + " <td>[70.0, 96.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " <td>[70.0, 96.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>68.0</td>\n", + " <td>80.0</td>\n", + " <td>[68.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " <td>[68.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>65.0</td>\n", + " <td>70.0</td>\n", + " <td>[65.0, 70.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " <td>[65.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>64.0</td>\n", + " <td>65.0</td>\n", + " <td>[64.0, 65.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>[64.0, 65.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>72.0</td>\n", + " <td>95.0</td>\n", + " <td>[72.0, 95.0]</td>\n", + " <td>[u'high', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " <td>[72.0, 95.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>69.0</td>\n", + " <td>70.0</td>\n", + " <td>[69.0, 70.0]</td>\n", + " <td>[u'high', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " <td>[69.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>75.0</td>\n", + " <td>80.0</td>\n", + " <td>[75.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " <td>[75.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>75.0</td>\n", + " <td>70.0</td>\n", + " <td>[75.0, 70.0]</td>\n", + " <td>[u'none', u'good']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " <td>[75.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>72.0</td>\n", + " <td>90.0</td>\n", + " <td>[72.0, 90.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>[72.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>81.0</td>\n", + " <td>75.0</td>\n", + " <td>[81.0, 75.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>[81.0, 75.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>71.0</td>\n", + " <td>80.0</td>\n", + " <td>[71.0, 80.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " <td>[71.0, 80.0]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0, [85.0, 85.0]),\n", + " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0, [80.0, 90.0]),\n", + " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5, [83.0, 78.0]),\n", + " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0, [70.0, 96.0]),\n", + " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, [68.0, 80.0]),\n", + " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, [65.0, 70.0]),\n", + " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5, [64.0, 65.0]),\n", + " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0, [72.0, 95.0]),\n", + " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0, [69.0, 70.0]),\n", + " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, [75.0, 80.0]),\n", + " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0, [75.0, 70.0]),\n", + " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5, [72.0, 90.0]),\n", + " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5, [81.0, 75.0]),\n", + " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, [71.0, 80.0])]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", + "\n", + "SELECT madlib.cols2vec(\n", + " 'golf',\n", + " 'cols2vec_result',\n", + " 'windy, temperature, humidity',\n", + " 'windy',\n", + " '*'\n", + ");\n", + "\n", + "SELECT * FROM cols2vec_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View summary table:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>source_table</th>\n", + " <th>list_of_features</th>\n", + " <th>list_of_features_to_exclude</th>\n", + " <th>feature_names</th>\n", + " </tr>\n", + " <tr>\n", + " <td>golf</td>\n", + " <td>windy, temperature, humidity</td>\n", + " <td>windy</td>\n", + " <td>[u'temperature', u'humidity']</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'golf', u'windy, temperature, humidity', u'windy', [u'temperature', u'humidity'])]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM cols2vec_result_summary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This also shows that you can exclude features in 'list_of_features_to_exclude' that are in the list of 'list_of_features'. This can be useful if the 'list_of_features' is generated from an expression or subquery." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. Boolean casting\n", + "\n", + "Type casting works as per regular rules of the underlying database. E.g, combining integer and double precisions columns will create a double precision feature vector. For Boolean, do an explicit cast to the target type:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>feature_vector</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>[u'false', u\"Don't Play\"]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>[u'true', u\"Don't Play\"]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>[u'false', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>[u'false', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>[u'false', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>[u'true', u\"Don't Play\"]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>[u'true', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>[u'false', u\"Don't Play\"]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>[u'false', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>[u'false', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>[u'true', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>[u'true', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>[u'false', u'Play']</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>[u'true', u\"Don't Play\"]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, [u'false', u\"Don't Play\"]),\n", + " (2, [u'true', u\"Don't Play\"]),\n", + " (3, [u'false', u'Play']),\n", + " (4, [u'false', u'Play']),\n", + " (5, [u'false', u'Play']),\n", + " (6, [u'true', u\"Don't Play\"]),\n", + " (7, [u'true', u'Play']),\n", + " (8, [u'false', u\"Don't Play\"]),\n", + " (9, [u'false', u'Play']),\n", + " (10, [u'false', u'Play']),\n", + " (11, [u'true', u'Play']),\n", + " (12, [u'true', u'Play']),\n", + " (13, [u'false', u'Play']),\n", + " (14, [u'true', u\"Don't Play\"])]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", + "\n", + "SELECT madlib.cols2vec(\n", + " 'golf',\n", + " 'cols2vec_result',\n", + " 'windy::TEXT, class',\n", + " NULL,\n", + " 'id'\n", + ");\n", + "\n", + "SELECT * FROM cols2vec_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Vec2cols\n", + "\n", + "Converts a feature array in a single column into multiple columns. This process can be used to reverse the function cols2vec.\n", + "\n", + "Given a table with a column of type array, this function will create an output table that splits this array into multiple columns, one per array element. It includes the option to name the new feature columns, and to include columns from the original table in the output." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load sample data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>temperature</th>\n", + " <th>humidity</th>\n", + " <th>Temp_Humidity</th>\n", + " <th>clouds_airquality</th>\n", + " <th>windy</th>\n", + " <th>class</th>\n", + " <th>observation_weight</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>85.0</td>\n", + " <td>85.0</td>\n", + " <td>[85.0, 85.0]</td>\n", + " <td>[u'none', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>80.0</td>\n", + " <td>90.0</td>\n", + " <td>[80.0, 90.0]</td>\n", + " <td>[u'none', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>83.0</td>\n", + " <td>78.0</td>\n", + " <td>[83.0, 78.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>70.0</td>\n", + " <td>96.0</td>\n", + " <td>[70.0, 96.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>68.0</td>\n", + " <td>80.0</td>\n", + " <td>[68.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>65.0</td>\n", + " <td>70.0</td>\n", + " <td>[65.0, 70.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>64.0</td>\n", + " <td>65.0</td>\n", + " <td>[64.0, 65.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>72.0</td>\n", + " <td>95.0</td>\n", + " <td>[72.0, 95.0]</td>\n", + " <td>[u'high', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>69.0</td>\n", + " <td>70.0</td>\n", + " <td>[69.0, 70.0]</td>\n", + " <td>[u'high', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>75.0</td>\n", + " <td>80.0</td>\n", + " <td>[75.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>75.0</td>\n", + " <td>70.0</td>\n", + " <td>[75.0, 70.0]</td>\n", + " <td>[u'none', u'good']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>72.0</td>\n", + " <td>90.0</td>\n", + " <td>[72.0, 90.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>81.0</td>\n", + " <td>75.0</td>\n", + " <td>[81.0, 75.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>71.0</td>\n", + " <td>80.0</td>\n", + " <td>[71.0, 80.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", + " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n", + " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n", + " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n", + " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", + " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n", + " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", + " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", + " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n", + " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", + " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n", + " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", + " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n", + " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS golf CASCADE;\n", + "\n", + "CREATE TABLE golf (\n", + " id integer NOT NULL,\n", + " \"OUTLOOK\" text,\n", + " temperature double precision,\n", + " humidity double precision,\n", + " \"Temp_Humidity\" double precision[],\n", + " clouds_airquality text[],\n", + " windy boolean,\n", + " class text,\n", + " observation_weight double precision\n", + ");\n", + "\n", + "INSERT INTO golf VALUES\n", + "(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),\n", + "(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),\n", + "(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),\n", + "(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),\n", + "(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", + "(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),\n", + "(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", + "(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),\n", + "(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),\n", + "(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n", + "(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),\n", + "(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n", + "(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),\n", + "(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);\n", + "\n", + "SELECT * FROM golf ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Split array into columns\n", + "Split the column \"clouds_airquality\" into new columns called \"clouds\" and \"air_quality\". Also keep columns id and \"OUTLOOK\" from the source table." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>clouds</th>\n", + " <th>air_quality</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>none</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>none</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>low</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>low</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>medium</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>low</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>medium</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>high</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>high</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>medium</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>none</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>medium</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>medium</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>low</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', u'none', u'unhealthy'),\n", + " (2, u'sunny', u'none', u'moderate'),\n", + " (3, u'overcast', u'low', u'moderate'),\n", + " (4, u'rain', u'low', u'moderate'),\n", + " (5, u'rain', u'medium', u'good'),\n", + " (6, u'rain', u'low', u'unhealthy'),\n", + " (7, u'overcast', u'medium', u'moderate'),\n", + " (8, u'sunny', u'high', u'unhealthy'),\n", + " (9, u'sunny', u'high', u'good'),\n", + " (10, u'rain', u'medium', u'good'),\n", + " (11, u'sunny', u'none', u'good'),\n", + " (12, u'overcast', u'medium', u'moderate'),\n", + " (13, u'overcast', u'medium', u'moderate'),\n", + " (14, u'rain', u'low', u'unhealthy')]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS vec2cols_result;\n", + "\n", + "SELECT madlib.vec2cols(\n", + " 'golf', -- source table\n", + " 'vec2cols_result', -- output table\n", + " 'clouds_airquality', -- column with array entries to split\n", + " ARRAY['clouds', 'air_quality'], -- feature names\n", + " 'id, \"OUTLOOK\", ' -- columns to keep from source table\n", + ");\n", + "\n", + "SELECT * FROM vec2cols_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Auto-generate feature names\n", + "Similar to the previous example, except now we keep all columns from source table and do not specify the feature names, so that default names are created." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>OUTLOOK</th>\n", + " <th>temperature</th>\n", + " <th>humidity</th>\n", + " <th>Temp_Humidity</th>\n", + " <th>clouds_airquality</th>\n", + " <th>windy</th>\n", + " <th>class</th>\n", + " <th>observation_weight</th>\n", + " <th>f1</th>\n", + " <th>f2</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>sunny</td>\n", + " <td>85.0</td>\n", + " <td>85.0</td>\n", + " <td>[85.0, 85.0]</td>\n", + " <td>[u'none', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " <td>none</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>sunny</td>\n", + " <td>80.0</td>\n", + " <td>90.0</td>\n", + " <td>[80.0, 90.0]</td>\n", + " <td>[u'none', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " <td>none</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>overcast</td>\n", + " <td>83.0</td>\n", + " <td>78.0</td>\n", + " <td>[83.0, 78.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>low</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>rain</td>\n", + " <td>70.0</td>\n", + " <td>96.0</td>\n", + " <td>[70.0, 96.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " <td>low</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>rain</td>\n", + " <td>68.0</td>\n", + " <td>80.0</td>\n", + " <td>[68.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " <td>medium</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>rain</td>\n", + " <td>65.0</td>\n", + " <td>70.0</td>\n", + " <td>[65.0, 70.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " <td>low</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>overcast</td>\n", + " <td>64.0</td>\n", + " <td>65.0</td>\n", + " <td>[64.0, 65.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>medium</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>sunny</td>\n", + " <td>72.0</td>\n", + " <td>95.0</td>\n", + " <td>[72.0, 95.0]</td>\n", + " <td>[u'high', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " <td>high</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>sunny</td>\n", + " <td>69.0</td>\n", + " <td>70.0</td>\n", + " <td>[69.0, 70.0]</td>\n", + " <td>[u'high', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " <td>high</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>rain</td>\n", + " <td>75.0</td>\n", + " <td>80.0</td>\n", + " <td>[75.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " <td>medium</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>sunny</td>\n", + " <td>75.0</td>\n", + " <td>70.0</td>\n", + " <td>[75.0, 70.0]</td>\n", + " <td>[u'none', u'good']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " <td>none</td>\n", + " <td>good</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>overcast</td>\n", + " <td>72.0</td>\n", + " <td>90.0</td>\n", + " <td>[72.0, 90.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>medium</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>overcast</td>\n", + " <td>81.0</td>\n", + " <td>75.0</td>\n", + " <td>[81.0, 75.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " <td>medium</td>\n", + " <td>moderate</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>rain</td>\n", + " <td>71.0</td>\n", + " <td>80.0</td>\n", + " <td>[71.0, 80.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " <td>low</td>\n", + " <td>unhealthy</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0, u'none', u'unhealthy'),\n", + " (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0, u'none', u'moderate'),\n", + " (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5, u'low', u'moderate'),\n", + " (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0, u'low', u'moderate'),\n", + " (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, u'medium', u'good'),\n", + " (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, u'low', u'unhealthy'),\n", + " (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5, u'medium', u'moderate'),\n", + " (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0, u'high', u'unhealthy'),\n", + " (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0, u'high', u'good'),\n", + " (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, u'medium', u'good'),\n", + " (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0, u'none', u'good'),\n", + " (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5, u'medium', u'moderate'),\n", + " (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5, u'medium', u'moderate'),\n", + " (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, u'low', u'unhealthy')]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS vec2cols_result;\n", + "SELECT madlib.vec2cols(\n", + " 'golf', -- source table\n", + " 'vec2cols_result', -- output table\n", + " 'clouds_airquality', -- column with array entries to split\n", + " NULL, -- feature names\n", + " '*' -- columns to keep from source table\n", + ");\n", + "SELECT * FROM vec2cols_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Get feature names from cols2vec summary table\n", + "Now let's run cols2vec then reverse it using vec2cols. In this case we will get feature names from the cols2vec summary table.\n", + "\n", + "First run cols2vec:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>temperature</th>\n", + " <th>humidity</th>\n", + " <th>feature_vector</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>85.0</td>\n", + " <td>85.0</td>\n", + " <td>[85.0, 85.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>80.0</td>\n", + " <td>90.0</td>\n", + " <td>[80.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>83.0</td>\n", + " <td>78.0</td>\n", + " <td>[83.0, 78.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>70.0</td>\n", + " <td>96.0</td>\n", + " <td>[70.0, 96.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>68.0</td>\n", + " <td>80.0</td>\n", + " <td>[68.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>65.0</td>\n", + " <td>70.0</td>\n", + " <td>[65.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>64.0</td>\n", + " <td>65.0</td>\n", + " <td>[64.0, 65.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>72.0</td>\n", + " <td>95.0</td>\n", + " <td>[72.0, 95.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>69.0</td>\n", + " <td>70.0</td>\n", + " <td>[69.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>75.0</td>\n", + " <td>80.0</td>\n", + " <td>[75.0, 80.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>75.0</td>\n", + " <td>70.0</td>\n", + " <td>[75.0, 70.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>72.0</td>\n", + " <td>90.0</td>\n", + " <td>[72.0, 90.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>81.0</td>\n", + " <td>75.0</td>\n", + " <td>[81.0, 75.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>71.0</td>\n", + " <td>80.0</td>\n", + " <td>[71.0, 80.0]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 85.0, 85.0, [85.0, 85.0]),\n", + " (2, 80.0, 90.0, [80.0, 90.0]),\n", + " (3, 83.0, 78.0, [83.0, 78.0]),\n", + " (4, 70.0, 96.0, [70.0, 96.0]),\n", + " (5, 68.0, 80.0, [68.0, 80.0]),\n", + " (6, 65.0, 70.0, [65.0, 70.0]),\n", + " (7, 64.0, 65.0, [64.0, 65.0]),\n", + " (8, 72.0, 95.0, [72.0, 95.0]),\n", + " (9, 69.0, 70.0, [69.0, 70.0]),\n", + " (10, 75.0, 80.0, [75.0, 80.0]),\n", + " (11, 75.0, 70.0, [75.0, 70.0]),\n", + " (12, 72.0, 90.0, [72.0, 90.0]),\n", + " (13, 81.0, 75.0, [81.0, 75.0]),\n", + " (14, 71.0, 80.0, [71.0, 80.0])]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n", + "SELECT madlib.cols2vec(\n", + " 'golf',\n", + " 'cols2vec_result',\n", + " 'temperature, humidity',\n", + " NULL,\n", + " 'id, temperature, humidity'\n", + ");\n", + "SELECT * FROM cols2vec_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "View the summary table with the feature_names dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>source_table</th>\n", + " <th>list_of_features</th>\n", + " <th>list_of_features_to_exclude</th>\n", + " <th>feature_names</th>\n", + " </tr>\n", + " <tr>\n", + " <td>golf</td>\n", + " <td>temperature, humidity</td>\n", + " <td>None</td>\n", + " <td>[u'temperature', u'humidity']</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM cols2vec_result_summary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now use feature_names from the summary table above to name the columns of the split array:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>temperature</th>\n", + " <th>humidity</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>85.0</td>\n", + " <td>85.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>80.0</td>\n", + " <td>90.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>83.0</td>\n", + " <td>78.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>70.0</td>\n", + " <td>96.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>68.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>65.0</td>\n", + " <td>70.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>64.0</td>\n", + " <td>65.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>72.0</td>\n", + " <td>95.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>69.0</td>\n", + " <td>70.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>75.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>75.0</td>\n", + " <td>70.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>72.0</td>\n", + " <td>90.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>81.0</td>\n", + " <td>75.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>71.0</td>\n", + " <td>80.0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 85.0, 85.0),\n", + " (2, 80.0, 90.0),\n", + " (3, 83.0, 78.0),\n", + " (4, 70.0, 96.0),\n", + " (5, 68.0, 80.0),\n", + " (6, 65.0, 70.0),\n", + " (7, 64.0, 65.0),\n", + " (8, 72.0, 95.0),\n", + " (9, 69.0, 70.0),\n", + " (10, 75.0, 80.0),\n", + " (11, 75.0, 70.0),\n", + " (12, 72.0, 90.0),\n", + " (13, 81.0, 75.0),\n", + " (14, 71.0, 80.0)]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS vec2cols_result;\n", + "SELECT madlib.vec2cols(\n", + " 'cols2vec_result', -- source table containing the feature vector\n", + " 'vec2cols_result', -- output table\n", + " 'feature_vector', -- column with array entries to split\n", + " (SELECT feature_names from cols2vec_result_summary), -- feature_names from summary table of cols2vec\n", + " 'id' -- columns to keep from source table\n", + ");\n", + "SELECT * FROM vec2cols_result ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the same as the format of the original 'golf' dataset that we started with." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Drop columns\n", + "Drop some columns from the source table." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>Temp_Humidity</th>\n", + " <th>clouds_airquality</th>\n", + " <th>windy</th>\n", + " <th>class</th>\n", + " <th>observation_weight</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>[85.0, 85.0]</td>\n", + " <td>[u'none', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>[80.0, 90.0]</td>\n", + " <td>[u'none', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>[83.0, 78.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>[70.0, 96.0]</td>\n", + " <td>[u'low', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>[68.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>[65.0, 70.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>[64.0, 65.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>[72.0, 95.0]</td>\n", + " <td>[u'high', u'unhealthy']</td>\n", + " <td>False</td>\n", + " <td>Don't Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>[69.0, 70.0]</td>\n", + " <td>[u'high', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>[75.0, 80.0]</td>\n", + " <td>[u'medium', u'good']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>[75.0, 70.0]</td>\n", + " <td>[u'none', u'good']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>5.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>[72.0, 90.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>True</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>[81.0, 75.0]</td>\n", + " <td>[u'medium', u'moderate']</td>\n", + " <td>False</td>\n", + " <td>Play</td>\n", + " <td>1.5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>[71.0, 80.0]</td>\n", + " <td>[u'low', u'unhealthy']</td>\n", + " <td>True</td>\n", + " <td>Don't Play</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", + " (2, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n", + " (3, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n", + " (4, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n", + " (5, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", + " (6, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n", + " (7, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", + " (8, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n", + " (9, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n", + " (10, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n", + " (11, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n", + " (12, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n", + " (13, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n", + " (14, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS dropcols_out;\n", + "\n", + "SELECT madlib.dropcols(\n", + " 'golf',\n", + " 'dropcols_out',\n", + " '\"OUTLOOK\", temperature, humidity'\n", + ");\n", + "\n", + "SELECT * FROM dropcols_out ORDER BY id;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}