This is an automated email from the ASF dual-hosted git repository. jingyimei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push: new 8de32ed Minibatch Preprocessor for Deep learning 8de32ed is described below commit 8de32ede33c48d2f4a440f0f639c94a277a359c1 Author: Nandish Jayaram <njaya...@apache.org> AuthorDate: Mon Dec 17 17:54:42 2018 -0800 Minibatch Preprocessor for Deep learning The minibatch preprocessor we currently have in MADlib is bloated for DL tasks. This feature adds a simplified version of creating buffers. It can divide each element of the independent array by a normalizing constant for standardization (which is 255.0 by default), and optionally shift the dependent variables by an offset if they are a numeric type. Closes #342 Co-authored-by: Arvind Sridhar <asrid...@pivotal.io> Co-authored-by: Domino Valdano <dvald...@pivotal.io> --- doc/mainpage.dox.in | 6 + .../utilities/minibatch_preprocessing.py_in | 192 +++++++++- .../utilities/minibatch_preprocessing_dl.sql_in | 396 +++++++++++++++++++++ .../test/minibatch_preprocessing_dl.sql_in | 125 +++++++ .../postgres/modules/utilities/utilities.py_in | 4 +- 5 files changed, 719 insertions(+), 4 deletions(-) diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index aab37af..5568da6 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -285,6 +285,12 @@ complete matrix stored as a distributed table. Interface and implementation are subject to change. @{ @defgroup grp_cg Conjugate Gradient + @defgroup grp_dl Deep Learning + @brief A collection of deep learning interfaces. + @details A collection of deep learning interfaces. + @{ + @defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for Deep Learning + @} @defgroup grp_knn k-Nearest Neighbors @defgroup grp_bayes Naive Bayes Classification @defgroup grp_sample Random Sampling diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 88433c9..be568a9 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -39,7 +39,7 @@ from utilities import py_list_to_sql_string from utilities import split_quoted_delimited_str from utilities import unique_string from utilities import validate_module_input_params -from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY +from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY, ONLY_ARRAY from mean_std_dev_calculator import MeanStdDevCalculator from validate_args import get_expr_type @@ -51,6 +51,111 @@ m4_changequote(`<!', `!>') MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" +class MiniBatchPreProcessorDL: + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, dependent_offset, **kwargs): + self.schema_madlib = schema_madlib + self.source_table = source_table + self.output_table = output_table + self.dependent_varname = dependent_varname + self.independent_varname = independent_varname + self.buffer_size = buffer_size + self.normalizing_const = normalizing_const + self.dependent_offset = dependent_offset + self.module_name = "minibatch_preprocessor_DL" + self.output_summary_table = add_postfix(self.output_table, "_summary") + self._validate_args() + self.num_of_buffers = self._get_num_buffers() + + def minibatch_preprocessor_dl(self): + norm_tbl = unique_string(desp='normalized') + # Create a temp table that has independent var normalized. + + dependent_varname_with_offset = self.dependent_varname + if self.dependent_offset: + dependent_varname_with_offset = '{0} + {1}'.format(self.dependent_varname, self.dependent_offset) + + scalar_mult_sql = """ + CREATE TEMP TABLE {norm_tbl} AS + SELECT {self.schema_madlib}.array_scalar_mult( + {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, + {dependent_varname_with_offset} AS y, + row_number() over() AS row_id + FROM {self.source_table} + """.format(**locals()) + plpy.execute(scalar_mult_sql) + # Create the mini-batched output table + if is_platform_pg(): + distributed_by_clause = '' + else: + distributed_by_clause= ' DISTRIBUTED BY (buffer_id) ' + sql = """ + CREATE TABLE {self.output_table} AS + SELECT * FROM + ( + SELECT {self.schema_madlib}.agg_array_concat( + ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x}, + array_agg({norm_tbl}.y) AS {y}, + ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id + FROM {norm_tbl} + GROUP BY buffer_id + ) b + {distributed_by_clause} + """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME, + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, + **locals()) + plpy.execute(sql) + plpy.execute("DROP TABLE {0}".format(norm_tbl)) + # Create summary table + self._create_output_summary_table() + + def _create_output_summary_table(self): + query = """ + CREATE TABLE {self.output_summary_table} AS + SELECT + $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table, + $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table, + $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS dependent_varname, + $__madlib__${self.independent_varname}$__madlib__$::TEXT AS independent_varname, + $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS dependent_vartype, + {self.buffer_size} AS buffer_size + """.format(self=self) + plpy.execute(query) + + def _validate_args(self): + validate_module_input_params( + self.source_table, self.output_table, self.independent_varname, + self.dependent_varname, self.module_name, None, + [self.output_summary_table]) + self.independent_vartype = get_expr_type( + self.independent_varname, self.source_table) + _assert(is_valid_psql_type(self.independent_vartype, + NUMERIC | ONLY_ARRAY), + "Invalid independent variable type, should be an array of " \ + "one of {0}".format(','.join(NUMERIC))) + self.dependent_vartype = get_expr_type( + self.dependent_varname, self.source_table) + dep_valid_types = NUMERIC | TEXT | BOOLEAN + _assert(is_valid_psql_type(self.dependent_vartype, dep_valid_types), + "Invalid dependent variable type, should be one of {0}". + format(','.join(dep_valid_types))) + if self.buffer_size is not None: + _assert(self.buffer_size > 0, + "minibatch_preprocessor_dl: The buffer size has to be a " \ + "positive integer or NULL.") + + def _get_num_buffers(self): + num_rows_in_tbl = plpy.execute(""" + SELECT count(*) AS cnt FROM {0} + """.format(self.source_table))[0]['cnt'] + buffer_size_calculator = MiniBatchBufferSizeCalculator() + indepdent_var_dim = _tbl_dimension_rownum( + self.schema_madlib, self.source_table, self.independent_varname, + skip_row_count=True) + self.buffer_size = buffer_size_calculator.calculate_default_buffer_size( + self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0]) + return ceil((1.0*num_rows_in_tbl)/self.buffer_size) class MiniBatchPreProcessor: """ @@ -509,7 +614,7 @@ class MiniBatchDocumentation: SELECT {schema_madlib}.{method}( source_table, -- TEXT. Name of the table containing input data. Can also be a view - output_table , -- TEXT. Name of the output table for + output_table, -- TEXT. Name of the output table for mini-batching dependent_varname, -- TEXT. Name of the dependent variable column independent_varname, -- TEXT. Name of the independent variable @@ -580,3 +685,86 @@ class MiniBatchDocumentation: for help. """.format(**locals()) # --------------------------------------------------------------------- + @staticmethod + def minibatch_preprocessor_dl_help(schema_madlib, message): + method = "minibatch_preprocessor_dl" + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + For Deep Learning based techniques such as Convolutional Neural Nets, + the input data is mostly images. These images can be represented as an + array of numbers where all elements are between 0 and 255 in value. + It is standard practice to divide each of these numbers by 255.0 to + normalize the image data. minibatch_preprocessor() is for general + use-cases, but for deep learning based use-cases we provide + minibatch_preprocessor_dl() that is light-weight and is + specific to image datasets. + + The normalizing constant is parameterized, and can be specified based + on the kind of image data used. + + For more details on function usage: + SELECT {schema_madlib}.{method}('usage') + """.format(**locals()) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.{method}( + source_table, -- TEXT. Name of the table containing input + data. Can also be a view + output_table, -- TEXT. Name of the output table for + mini-batching + dependent_varname, -- TEXT. Name of the dependent variable column + independent_varname, -- TEXT. Name of the independent variable + column + buffer_size -- INTEGER. Default computed automatically. + Number of source input rows to pack into a buffer + normalizing_const -- DOUBLE PRECISON. Default 255.0. The + normalizing constant to use for + standardizing arrays in independent_varname. + dependent_offset -- INTEGER. If specified, shifts all dependent + variable values by this number (should + only be used for numeric types). + ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The output table produced by MiniBatch Preprocessor contains the + following columns: + + buffer_id -- INTEGER. Unique id for packed table. + dependent_varname -- ANYARRAY[]. Packed array of dependent variables. + independent_varname -- REAL[]. Packed array of independent + variables. + + --------------------------------------------------------------------------- + The algorithm also creates a summary table named <output_table>_summary + that has the following columns: + + source_table -- Source table name. + output_table -- Output table name from preprocessor. + dependent_varname -- Dependent variable values from the original table + (shifted by dependent_offset, if specified). + independent_varname -- Independent variable values from the original + table. + dependent_vartype -- Type of the dependent variable from the + original table. + buffer_size -- Buffer size used in preprocessing step. + + --------------------------------------------------------------------------- + """.format(**locals()) + + + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + return """ + No such option. Use "SELECT {schema_madlib}.minibatch_preprocessor_dl()" + for help. + """.format(**locals()) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in new file mode 100644 index 0000000..994e458 --- /dev/null +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -0,0 +1,396 @@ +/* ----------------------------------------------------------------------- */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * @file minibatch_preprocessing_dl.sql_in + * @brief TODO + * @date December 2018 + * + */ +/* ----------------------------------------------------------------------- */ + +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_minibatch_preprocessing_dl + +<div class="toc"><b>Contents</b><ul> +<li class="level1"><a href="#minibatch_preprocessor_dl">Mini-Batch Preprocessor for Deep Learning</a></li> +<li class="level1"><a href="#example">Examples</a></li> +</ul></div> + +For Deep Learning based techniques such as Convolutional Neural Nets, the input +data is mostly images. These images can be represented as an array of numbers +where all elements are between 0 and 255 in value. It is standard practice +to divide each of these numbers by 255.0 to normalize the image data. +minibatch_preprocessor() is for general use-cases, but for deep learning based +use-cases we provide minibatch_preprocessor_dl() that is light-weight and is +specific to image datasets. The normalizing constant is parameterized, and can +be specified based on the kind of image data used. +<pre class="syntax"> +minibatch_preprocessor_dl(source_table, + output_table, + dependent_varname, + independent_varname, + buffer_size, + normalizing_const, + dependent_offset + ) +</pre> + +\b Arguments +<dl class="arglist"> + <dt>source_table</dt> + <dd>TEXT. Name of the table containing input data. Can also be a view. + </dd> + + <dt>output_table</dt> + <dd>TEXT. Name of the output table from the preprocessor which + will be used as input to algorithms that support mini-batching. + Note that the arrays packed into the output table are randomized + and normalized (by dividing each element in the independent variable array + by the normalizing_const), so they will not match up in an obvious way with + the rows in the source table. + </dd> + + <dt>dependent_varname</dt> + <dd>TEXT. Name of the dependent variable column. + </dd> + + <dt>independent_varname</dt> + <dd>TEXT. Name of the independent variable column. The column must be of + a numeric array type. + </dd> + + <dt>buffer_size (optional)</dt> + <dd>INTEGER, default: computed. Buffer size is the + number of rows from the + source table that are packed into one row of the preprocessor + output table. The default value is computed considering size of + the source table, number of independent variables, number of groups, + and number of segments in the database cluster. For larger data sets, + the computed buffer size will typically be a value in the millions. + </dd> + + <dt>normalizing_const (optional)</dt> + <dd>DOUBLE PRECISION, default: 255.0. The normalizing constant to divide + each value in the independent_varname array by. + </dd> + + <dt>dependent_offset (optional)</dt> + <dd>INTEGER, default: NULL. If specified, shifts all dependent + variable values by this number (should only be used for numeric types). + </dd> + +</dl> + +<b>Output tables</b> +<br> + The output table produced by the mini-batch preprocessor contains the following columns: + <table class="output"> + <tr> + <th>buffer_id</th> + <td>INTEGER. Unique id for packed table. + </td> + </tr> + <tr> + <th>dependent_varname</th> + <td>ANYARRAY[]. Packed array of dependent variables. The type + of the array is the same as the type of the dependent variable from + the source table. + </td> + </tr> + <tr> + <th>independent_varname</th> + <td>REAL[]. Packed array of independent variables. + </td> + </tr> + </table> + +A summary table named \<output_table\>_summary is also created, which has the following columns: + <table class="output"> + <tr> + <th>source_table</th> + <td>Name of the source table.</td> + </tr> + <tr> + <th>output_table</th> + <td>Name of output table generated by preprocessor.</td> + </tr> + <tr> + <th>dependent_varname</th> + <td>Dependent variable from the source table.</td> + </tr> + <tr> + <th>independent_varname</th> + <td>Independent variable from the source table.</td> + </tr> + <tr> + <th>dependent_vartype</th> + <td>Type of the dependent varialbe from the source table.</td> + </tr> + <tr> + <th>buffer_size</th> + <td>Buffer size used in preprocessing step.</td> + </tr> + </table> + +@anchor example +@par Examples +-# Create an input data set based on the well known iris data set: +<pre class="example"> +DROP TABLE IF EXISTS iris_data; +CREATE TABLE iris_data( + id serial, + attributes numeric[], + class_text varchar, + class integer, + state varchar +); +INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES +(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'), +(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'), +(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'), +(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'), +(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'), +(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'), +(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'), +(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'), +(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'), +(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'), +(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'), +(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'), +(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'), +(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'), +(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'), +(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'), +(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'), +(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'), +(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'), +(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'), +(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'), +(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'), +(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'), +(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'), +(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'), +(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'), +(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'), +(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'), +(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'), +(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'), +(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'), +(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'), +(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'), +(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'), +(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'), +(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'), +(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'), +(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'), +(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'), +(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'), +(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'), +(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'), +(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'), +(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'), +(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'), +(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'), +(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'), +(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'), +(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'), +(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'), +(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'), +(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee'); +</pre> + +-# Run the preprocessor: +<pre class="example"> +DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary; +SELECT madlib.minibatch_preprocessor_dl('iris_data', -- Source table + 'iris_data_packed', -- Output table + 'class_text', -- Dependent variable + 'attributes', -- Independent variables + NULL, -- buffer size + 2 -- normalizing constant + ); +</pre> +For small datasets like in this example, buffer size is mainly +determined by the number of segments in the database. +This example is run on a Greenplum database with 3 segments, +so there are 3 rows with a buffer size of 18. +For PostgresSQL, there would be only one row with a buffer +size of 52 since it is a single node database. +For larger data sets, other factors go into +computing buffers size besides number of segments. +Also, note that the dependent variable has +been one-hot encoded since it is categorical. +Here is a sample of the packed output table: +<pre class="example"> +\\x on +SELECT * FROM iris_data_packed; +</pre> +<pre class="result"> +-[ RECORD 1 ]-------+------------------------------------- +independent_varname | {{2.55,1.7,0.75,0.1},{2.55,1.9,0.95,0.2},{2.9,1.35,1.95,0.6},{3.35,1.55,2.2,0.7},{2.85,1.3,1.75,0.5},{3.1,1.1,2.25,0.75},...}} +dependent_varname | {Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...} +buffer_id | 0 +-[ RECORD 2 ]-------+------------------------------------- +independent_varname | {{2.4,1.5,0.7,0.15},{2.2,1.6,0.65,0.1},{2.8,1.45,1.8,0.65},{2.9,1.3,2,0.6},{2.2,1.45,0.7,0.1},{2.85,1.4,2.25,0.65},{2.8,1.25,1.95,0.55},...}} +dependent_varname | {Iris_setosa,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_versicolor,...} +buffer_id | 1 +-[ RECORD 2 ]-------+------------------------------------- +independent_varname | {{2.4,1.7,0.8,0.1},{2.9,1.35,2.05,0.5},{3.15,1.65,2.35,0.8},{2.7,1.95,0.65,0.2},{2.75,1.3,2.2,0.6},{3,1.1,2,0.5},{2.55,1.9,0.8,0.1},...}} +dependent_varname | {Iris_versicolor,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...} +buffer_id | 2 +</pre> +Review the output summary table: +<pre class="example"> +\\x on +SELECT * FROM iris_data_packed_summary; +</pre> +<pre class="result"> +-[ RECORD 1 ]-------+------------------ +source_table | iris_data +output_table | iris_data_packed +dependent_varname | class_text +independent_varname | attributes +dependent_vartype | character varying +buffer_size | 18 +</pre> + +-# Generally the default buffer size will work well, +but if you have occasion to change it: +<pre class="example"> +DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary; +SELECT madlib.minibatch_preprocessor_dl('iris_data', -- Source table + 'iris_data_packed', -- Output table + 'class_text', -- Dependent variable + 'attributes', -- Independent variables + 10 -- Buffer size + ); +</pre> +Review the number of buffers in the output table: +<pre class="example"> +SELECT COUNT(*) FROM iris_data_packed; +</pre> +<pre class="result"> +-[ RECORD 1 ] +count | 6 +</pre> +Review the output summary table: +<pre class="example"> +\\x on +SELECT * FROM iris_data_packed_summary; +</pre> +<pre class="result"> +-[ RECORD 1 ]-------+------------------ +source_table | iris_data +output_table | iris_data_packed +dependent_varname | class_text +independent_varname | attributes +dependent_vartype | character varying +buffer_size | 10 +</pre> + +*/ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER, + normalizing_const DOUBLE PRECISION, + dependent_offset INTEGER +) RETURNS VOID AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + from utilities.control import MinWarning + with AOControl(False): + with MinWarning('error'): + minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessorDL(**globals()) + minibatch_preprocessor_obj.minibatch_preprocessor_dl() +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER, + normalizing_const DOUBLE PRECISION +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, message) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl() +RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, '') +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray, anyarray) + RETURNS anyarray + AS 'select $1 || $2' + LANGUAGE SQL + IMMUTABLE; + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray, anyarray) + RETURNS anyarray + AS 'select $1 || $2' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray); +CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) ( + SFUNC = MADLIB_SCHEMA.agg_array_concat_transition, + STYPE = anyarray, + PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge + ); diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in new file mode 100644 index 0000000..dd9b0d5 --- /dev/null +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -0,0 +1,125 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; +CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], label TEXT); +INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES +(ARRAY[1,2,3,4,5,6], 'a'), +(ARRAY[11,2,3,4,5,6], 'a'), +(ARRAY[11,22,33,4,5,6], 'a'), +(ARRAY[11,22,33,44,5,6], 'a'), +(ARRAY[11,22,33,44,65,6], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,144,65,56], 'a'), +(ARRAY[11,22,233,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'b'), +(ARRAY[11,22,33,44,65,56], 'b'), +(ARRAY[11,22,33,44,65,56], 'b'); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'id', + 'x', + 5); + +SELECT assert(count(*)=4, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch.') +FROM minibatch_preprocessor_dl_batch; + +SELECT assert(array_upper(independent_varname, 1)=5, 'Incorrect buffer size.') +FROM minibatch_preprocessor_dl_batch WHERE buffer_id=1; + +SELECT assert(array_upper(independent_varname, 1)=2, 'Incorrect buffer size.') +FROM minibatch_preprocessor_dl_batch WHERE buffer_id=4; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'label', + 'x'); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; +CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], y INTEGER); +INSERT INTO minibatch_preprocessor_dl_input(x, y) VALUES +(ARRAY[1,2,3,4,5,6], 4), +(ARRAY[11,2,3,4,5,6], 3), +(ARRAY[11,22,33,4,5,6], 8), +(ARRAY[11,22,33,44,5,6], 2), +(ARRAY[11,22,33,44,65,6], 5), +(ARRAY[11,22,33,44,65,56], 6), +(ARRAY[11,22,33,44,65,56], 2), +(ARRAY[11,22,33,44,65,56], 10), +(ARRAY[11,22,33,44,65,56], 3), +(ARRAY[11,22,33,44,65,56], 7), +(ARRAY[11,22,33,44,65,56], 6), +(ARRAY[11,22,33,44,65,56], -6), +(ARRAY[11,22,33,144,65,56], 9), +(ARRAY[11,22,233,44,65,56], 0), +(ARRAY[11,22,33,44,65,56], 12), +(ARRAY[11,22,33,44,65,56], -3), +(ARRAY[11,22,33,44,65,56], -1); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5); + +-- Test that indepdendent vars get divided by 5, by verifying min value goes from 1 to 0.2, and max value from 233 to 46.6 +SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a; +SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5, + 6); + +-- Test that dependent vars gets shifted by +6, by verifying minimum value goes from -6 to 0 +SELECT assert(abs(MIN(y))<0.00001, 'Dependent var not shifted properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5, + -6); + +-- Test that dependent vars gets shifted by -6, by verifying minimum value goes from -6 to -12 +SELECT assert(relative_error(MIN(y), -12)<0.00001, 'Dependent var not shifted properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a; \ No newline at end of file diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 50c426b..89907b3 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -875,8 +875,8 @@ def collate_plpy_result(plpy_result_rows): def validate_module_input_params(source_table, output_table, independent_varname, - dependent_varname, module_name, grouping_cols, - other_output_tables=None): + dependent_varname, module_name, + grouping_cols=None, other_output_tables=None): """ This function is supposed to be used for validating params for supervised learning like algos, e.g. linear regression, mlp, etc. since all