This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push: new df03bc2 DL: Add function for predict byom df03bc2 is described below commit df03bc2029fffa081685c9ffd9471386666bc9a6 Author: Nikhil Kak <n...@pivotal.io> AuthorDate: Fri Jul 19 17:39:27 2019 -0700 DL: Add function for predict byom JIRA: MADLIB-1371 , MADLIB-1359 Previously a user would have to train a deep learning model in madlib and only then they could use that model to predict. This commit adds a new function called `madlib_keras_predict_byom` which allows the user to run prediction on their own model which doesn't have to be trained on madlib. * Refactored the code to reuse the logic between predict and predict_byom * user doc changes Co-authored-by: Nandish Jayaram <njaya...@apache.org> Co-authored-by: Orhan Kislal <okis...@apache.org> --- .../deep_learning/input_data_preprocessor.py_in | 2 +- .../modules/deep_learning/madlib_keras.py_in | 71 +++-- .../modules/deep_learning/madlib_keras.sql_in | 337 +++++++++++++++++++- .../deep_learning/madlib_keras_helper.py_in | 2 + .../deep_learning/madlib_keras_predict.py_in | 312 +++++++++++++++---- .../deep_learning/madlib_keras_validator.py_in | 346 ++++++++++----------- .../model_arch_info.py_in | 36 +++ .../modules/deep_learning/test/madlib_keras.sql_in | 196 +++++++++--- .../test/unit_tests/test_madlib_keras.py_in | 203 +++++++++--- 9 files changed, 1140 insertions(+), 365 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in index 82bec97..6a03eca 100644 --- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in +++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in @@ -58,7 +58,7 @@ class InputDataPreprocessorDL(object): self.dependent_varname = dependent_varname self.independent_varname = independent_varname self.buffer_size = buffer_size - self.normalizing_const = normalizing_const if 
normalizing_const is not None else 1.0 + self.normalizing_const = normalizing_const if normalizing_const is not None else DEFAULT_NORMALIZING_CONST self.num_classes = num_classes self.module_name = module_name self.output_summary_table = None diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index 47ce306..fa55093 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -29,25 +29,19 @@ import time import keras from keras import backend as K -from keras import utils as keras_utils from keras.layers import * from keras.models import * from keras.optimizers import * from keras.regularizers import * -import madlib_keras_serializer from madlib_keras_helper import * from madlib_keras_validator import * from madlib_keras_wrapper import * -from keras_model_arch_table import ModelArchSchema +from model_arch_info import * -from utilities.control import MinWarning -from utilities.model_arch_info import get_input_shape -from utilities.model_arch_info import get_num_classes from utilities.utilities import _assert from utilities.utilities import is_platform_pg from utilities.utilities import get_segments_per_host from utilities.utilities import madlib_version -from utilities.validate_args import get_col_value_and_type from utilities.validate_args import get_expr_type from utilities.validate_args import quote_ident from utilities.control import MinWarning @@ -68,7 +62,7 @@ def fit(schema_madlib, source_table, model, model_arch_table, fit_validator = FitInputValidator( source_table, validation_table, model, model_arch_table, - mb_dep_var_col, mb_indep_var_col, + model_arch_id, mb_dep_var_col, mb_indep_var_col, num_iterations, metrics_compute_frequency, warm_start) if metrics_compute_frequency is None: metrics_compute_frequency = num_iterations @@ -88,23 +82,13 @@ def fit(schema_madlib, source_table, model, 
model_arch_table, # Get the serialized master model start_deserialization = time.time() - model_arch_query = "SELECT {0}, {1} FROM {2} WHERE {3} = {4}".format( - ModelArchSchema.MODEL_ARCH, ModelArchSchema.MODEL_WEIGHTS, - model_arch_table, ModelArchSchema.MODEL_ID, - model_arch_id) - model_arch_result = plpy.execute(model_arch_query) - if not model_arch_result: - plpy.error("no model arch found in table {0} with id {1}".format( - model_arch_table, model_arch_id)) - model_arch_result = model_arch_result[0] - model_arch = model_arch_result[ModelArchSchema.MODEL_ARCH] - input_shape = get_input_shape(model_arch) + model_arch, model_weights = get_model_arch_weights(model_arch_table, model_arch_id) num_classes = get_num_classes(model_arch) + input_shape = get_input_shape(model_arch) fit_validator.validate_input_shapes(input_shape) - gp_segment_id_col = '0' if is_platform_pg() else 'gp_segment_id' - serialized_weights = get_initial_weights(model, model_arch_result, + serialized_weights = get_initial_weights(model, model_arch, model_weights, warm_start, gpus_per_host) # Compute total images on each segment seg_ids_train, images_per_seg_train = get_image_count_per_seg_for_minibatched_data_from_db(source_table) @@ -289,7 +273,7 @@ def fit(schema_madlib, source_table, model, model_arch_table, #TODO add a unit test for this in a future PR reset_cuda_env(original_cuda_env) -def get_initial_weights(model_table, model_arch_result, warm_start, gpus_per_host): +def get_initial_weights(model_table, model_arch, serialized_weights, warm_start, gpus_per_host): """ If warm_start is True, return back initial weights from model table. 
If warm_start is False, first try to get the weights from model_arch @@ -317,10 +301,8 @@ def get_initial_weights(model_table, model_arch_result, warm_start, gpus_per_hos SELECT model_data FROM {0} """.format(model_table))[0]['model_data'] else: - serialized_weights = model_arch_result[ModelArchSchema.MODEL_WEIGHTS] if not serialized_weights: - model = model_from_json( - model_arch_result[ModelArchSchema.MODEL_ARCH]) + model = model_from_json(model_arch) serialized_weights = madlib_keras_serializer.serialize_nd_weights( model.get_weights()) return serialized_weights @@ -518,10 +500,12 @@ def get_segments_and_gpus(gpus_per_host): def evaluate(schema_madlib, model_table, test_table, output_table, gpus_per_host, **kwargs): module_name = 'madlib_keras_evaluate' - input_validator = EvaluateInputValidator(test_table, model_table, output_table, module_name) - - model_summary_table = input_validator.model_summary_table - test_summary_table = input_validator.test_summary_table + if test_table: + test_summary_table = add_postfix(test_table, "_summary") + model_summary_table = None + if model_table: + model_summary_table = add_postfix(model_table, "_summary") + validate_evaluate(module_name, model_table, model_summary_table, test_table, test_summary_table, output_table) segments_per_host, gpus_per_host = get_segments_and_gpus(gpus_per_host) @@ -531,7 +515,8 @@ def evaluate(schema_madlib, model_table, test_table, output_table, gpus_per_host model_arch = res['model_arch'] input_shape = get_input_shape(model_arch) - input_validator.validate_input_shape(input_shape) + InputValidator.validate_input_shape( + test_table, MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL, input_shape, 2) compile_params_query = "SELECT compile_params, metrics_type FROM {0}".format(model_summary_table) res = plpy.execute(compile_params_query)[0] @@ -540,10 +525,11 @@ def evaluate(schema_madlib, model_table, test_table, output_table, gpus_per_host seg_ids, images_per_seg = 
get_image_count_per_seg_for_minibatched_data_from_db(test_table) - loss, metric =\ - get_loss_metric_from_keras_eval(schema_madlib, test_table, compile_params, model_arch, - model_data, gpus_per_host, segments_per_host, - seg_ids, images_per_seg) + loss, metric = \ + get_loss_metric_from_keras_eval( + schema_madlib, test_table, compile_params, model_arch, + model_data, gpus_per_host, segments_per_host, + seg_ids, images_per_seg) if not metrics_type: metrics_type = None @@ -555,6 +541,23 @@ def evaluate(schema_madlib, model_table, test_table, output_table, gpus_per_host SELECT $1 as loss, $2 as metric, $3 as metrics_type""".format(output_table), ["FLOAT", "FLOAT", "TEXT[]"]) plpy.execute(create_output_table, [loss, metric, metrics_type]) +def validate_evaluate(module_name, model_table, model_summary_table, test_table, test_summary_table, output_table): + def _validate_test_summary_tbl(): + input_tbl_valid(test_summary_table, module_name, + error_suffix_str="Please ensure that the test table ({0}) " + "has been preprocessed by " + "the image preprocessor.".format(test_table)) + cols_in_tbl_valid(test_summary_table, [CLASS_VALUES_COLNAME, + NORMALIZING_CONST_COLNAME, DEPENDENT_VARTYPE_COLNAME, + DEPENDENT_VARNAME_COLNAME, INDEPENDENT_VARNAME_COLNAME], module_name) + + InputValidator.validate_predict_evaluate_tables( + module_name, model_table, model_summary_table, + test_table, output_table, MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL) + _validate_test_summary_tbl() + validate_dependent_var_for_minibatch(test_table, + MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL) + def get_loss_metric_from_keras_eval(schema_madlib, table, compile_params, model_arch, serialized_weights, gpus_per_host, segments_per_host, seg_ids, images_per_seg): diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in index c5d8d35..c69f158 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in +++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in @@ -35,6 +35,7 @@ m4_include(`SQLCommon.m4') <li class="level1"><a href="#keras_fit">Fit</a></li> <li class="level1"><a href="#keras_evaluate">Evaluate</a></li> <li class="level1"><a href="#keras_predict">Predict</a></li> +<li class="level1"><a href="#keras_predict_byom">Predict BYOM</a></li> <li class="level1"><a href="#example">Examples</a></li> <li class="level1"><a href="#notes">Notes</a></li> <li class="level1"><a href="#background">Technical Background</a></li> @@ -616,6 +617,127 @@ madlib_keras_predict( </DD> </DL> + + +@anchor keras_predict_byom +@par Predict BYOM (Bring your own model) +The predict byom function has the following format: +<pre class="syntax"> +madlib_keras_predict_byom( + model_arch_table, + model_arch_id, + test_table, + id_col, + independent_varname, + output_table, + pred_type, + gpus_per_host, + class_values, + normalizing_const + ) +</pre> + + +\b Arguments +<dl class="arglist"> + +<DT>model_arch_table</DT> + <DD>TEXT. Name of the architecture table containing the model + to use for prediction. The model weights and architecture can be loaded to + this table by using the + <a href="group__grp__keras__model__arch.html">load_keras_model</a> function + </DD> + + <DT>model_arch_id</DT> + <DD>INTEGER. This is the id in 'model_arch_table'containing the model + architecture and model weights to use for prediction. + </DD> + + <DT>test_table</DT> + <DD>TEXT. Name of the table containing the dataset to + predict on. Note that test data is not preprocessed (unlike + fit and evaluate) so put one test image per row for prediction. + Also see the comment below for the 'independent_varname' parameter + regarding normalization. + + </DD> + + <DT>id_col</DT> + <DD>TEXT. Name of the id column in the test data table. + </DD> + + <DT>independent_varname</DT> + <DD>TEXT. Column with independent variables in the test table. 
+ If a 'normalizing_const' is specified when preprocessing the + training dataset, this same normalization will be applied to + the independent variables used in predict. + </DD> + + <DT>output_table</DT> + <DD>TEXT. Name of the table that prediction output will be + written to. Table contains:</DD> + <table class="output"> + <tr> + <th>id</th> + <td>Gives the 'id' for each prediction, corresponding to each row from the test_table.</td> + </tr> + <tr> + <th>estimated_dependent_var</th> + <td> + (For pred_type='response') The estimated class for classification. If + class_values is passed in as NULL, then we assume that the class + labels are [0,1,2...,n] where n in the num of classes in the model + architecture. + </td> + </tr> + <tr> + <th>prob_CLASS</th> + <td> + (For pred_type='prob' for classification) + The probability of a given class. + If class_values is passed in as NULL, we create just one column called + 'prob' which is an array of probabilities of all the classes. + Otherwise if class_values is not NULL, then there will be one + column for each class in the training data. + </td> + </tr> + + <DT>pred_type (optional)</DT> + <DD>TEXT, default: 'response'. The type of output desired, where 'response' + gives the actual prediction and 'prob' gives the probability value for each class. + </DD> + + <DT>gpus_per_host (optional)</DT> + <DD>INTEGER, default: 0 (i.e., CPU). + Number of GPUs per segment host to be used for training the neural network. + For example, if you specify 4 for this parameter and your database cluster + is set up to have 4 segments per segment host, it means that each segment + will have a dedicated GPU. A value of 0 means that CPUs, not GPUs, will + be used for training. + + @note + We have seen some memory related issues when segments share GPU resources. 
+ For example, if you specify 1 for this parameter and your database cluster + is set up to have 4 segments per segment host, it means that all 4 segments + on a segment host will share the same GPU. The current recommended + configuration is 1 GPU per segment. + </DD> + + <DT>class_values (optional)</DT> + <DD>TEXT[], default: NULL. + List of class labels that were used while training the model. See the + output_table column for more details. + </DD> + + <DT>normalizing_const (optional)</DT> + <DD>DOUBLE PRECISION, default: 1.0. + The normalizing constant to divide each value in the 'independent_varname' + array by. For example, you would use 255 for this value if the image data is + in the form 0-255. + </DD> +</DL> + + @anchor example @par Examples @@ -814,7 +936,6 @@ SELECT COUNT(*) FROM iris_train; -# Call the preprocessor for deep learning. For the training dataset: <pre class="example"> -DROP TABLE IF EXISTS mlp_prediction; \\x off DROP TABLE IF EXISTS iris_train_packed, iris_train_packed_summary; SELECT madlib.training_preprocessor_dl('iris_train', -- Source table @@ -1049,6 +1170,142 @@ WHERE q.actual=q.estimated; (1 row) </pre> + + + + + + + + + + + + + + + + + + +-# Predict BYOM. +We will use the validation dataset for prediction +as well, which is not usual but serves to show the +syntax. See <a href="group__grp__keras__model__arch.html">load_keras_model</a> +for details on how to load the model architecture and weights. 
+ + +The prediction is in the 'estimated_dependent_var' +column: +<pre class="example"> +UPDATE model_arch_library set model_weights = (select model_data from iris_model) WHERE model_id = 1; + +DROP TABLE IF EXISTS iris_predict_byom; +SELECT madlib.madlib_keras_predict_byom('model_arch_library', -- model arch table + 1, -- model arch id + 'iris_test', -- test_table + 'id', -- id column + 'attributes', -- independent var + 'iris_predict_byom', -- output table + 'response', -- pred_type + 0, -- gpus_per_host + ARRAY['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica'], -- class_values + 1.0 -- normalizing_const + ); +SELECT * FROM iris_predict_byom ORDER BY id; +</pre> +<pre class="result"> + id | estimated_dependent_var +-----+------------------------- + 1 | Iris-setosa + 4 | Iris-setosa + 9 | Iris-setosa + 27 | Iris-setosa + 32 | Iris-setosa + 35 | Iris-setosa + 40 | Iris-setosa + 41 | Iris-setosa + 44 | Iris-setosa + 46 | Iris-setosa + 55 | Iris-versicolor + 56 | Iris-versicolor + 66 | Iris-versicolor + 69 | Iris-versicolor + 75 | Iris-versicolor + 76 | Iris-versicolor + 102 | Iris-virginica + 105 | Iris-virginica + 108 | Iris-virginica + 113 | Iris-virginica + 115 | Iris-virginica + 116 | Iris-virginica + 118 | Iris-virginica + 119 | Iris-virginica + 122 | Iris-virginica + 125 | Iris-virginica + 133 | Iris-virginica + 134 | Iris-virginica + 135 | Iris-virginica + 138 | Iris-virginica + </pre> +Count missclassifications: +<pre class="example"> +SELECT COUNT(*) FROM iris_predict_byom JOIN iris_test USING (id) +WHERE iris_predict_byom.estimated_dependent_var != iris_test.class_text; +</pre> +<pre class="result"> + count +-------+ + 6 +(1 row) +</pre> +Percent missclassifications: +<pre class="example"> +SELECT round(count(*)*100/(150*0.2),2) as test_accuracy_percent from + (select iris_test.class_text as actual, iris_predict_byom.estimated_dependent_var as estimated + from iris_predict_byom inner join iris_test + on iris_test.id=iris_predict_byom.id) q +WHERE 
q.actual=q.estimated; +</pre> +<pre class="result"> + test_accuracy_percent +-----------------------+ + 80.00 +(1 row) +</pre> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <h4>Classification with Other Parameters</h4> -# Validation dataset. Now use a validation dataset @@ -1571,7 +1828,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict( ) RETURNS VOID AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict') with AOControl(False): - madlib_keras_predict.predict(schema_madlib, + madlib_keras_predict.Predict(schema_madlib, model_table, test_table, id_col, @@ -1622,6 +1879,82 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_predict( $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom( + model_arch_table VARCHAR, + model_arch_id INTEGER, + test_table VARCHAR, + id_col VARCHAR, + independent_varname VARCHAR, + output_table VARCHAR, + pred_type VARCHAR, + gpus_per_host INTEGER, + class_values TEXT[], + normalizing_const DOUBLE PRECISION +) RETURNS VOID AS $$ + PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict') + with AOControl(False): + madlib_keras_predict.PredictBYOM(**globals()) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom( + model_arch_table VARCHAR, + model_arch_id INTEGER, + test_table VARCHAR, + id_col VARCHAR, + independent_varname VARCHAR, + output_table VARCHAR, + pred_type VARCHAR, + gpus_per_host INTEGER, + class_values TEXT[] +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict_byom( + model_arch_table VARCHAR, + model_arch_id INTEGER, + test_table VARCHAR, + id_col VARCHAR, + independent_varname VARCHAR, + output_table VARCHAR, + pred_type VARCHAR, + gpus_per_host INTEGER +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom( + model_arch_table VARCHAR, + model_arch_id INTEGER, + test_table VARCHAR, + id_col VARCHAR, + independent_varname VARCHAR, + output_table VARCHAR, + pred_type VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom( + model_arch_table VARCHAR, + model_arch_id INTEGER, + test_table VARCHAR, + id_col VARCHAR, + independent_varname VARCHAR, + output_table VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_evaluate( model_table VARCHAR, test_table VARCHAR, diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in index 17bdda4..e8218a6 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in @@ -45,6 +45,8 @@ MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var" FLOAT32_SQL_TYPE = 'REAL' SMALLINT_SQL_TYPE = 'SMALLINT' 
+DEFAULT_NORMALIZING_CONST = 1.0 + ##################################################################### # Prepend a dimension to np arrays using expand_dims. diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index ca7a9ad..819ff98 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -18,7 +18,6 @@ # under the License. import plpy -import os import keras from keras import backend as K @@ -26,82 +25,190 @@ from keras.layers import * from keras.models import * from keras.optimizers import * +from model_arch_info import * from madlib_keras_helper import * -from madlib_keras_validator import PredictInputValidator +from madlib_keras_validator import * from predict_input_params import PredictParamsProcessor from utilities.control import MinWarning -from utilities.model_arch_info import get_input_shape +from utilities.utilities import _assert from utilities.utilities import add_postfix from utilities.utilities import create_cols_from_array_sql_string from utilities.utilities import get_segments_per_host -from utilities.utilities import is_platform_pg from utilities.utilities import unique_string +from utilities.validate_args import input_tbl_valid +from utilities.validate_args import quote_ident from madlib_keras_wrapper import * -MODULE_NAME = 'madlib_keras_predict' +class BasePredict(): + def __init__(self, schema_madlib, table_to_validate, test_table, id_col, + independent_varname, output_table, pred_type, gpus_per_host): + self.schema_madlib = schema_madlib + self.table_to_validate = table_to_validate + self.test_table = test_table + self.id_col = id_col + self.independent_varname = independent_varname + self.output_table = output_table + self.pred_type = pred_type + self.gpus_per_host = gpus_per_host + self._set_default_gpus_pred_type() + + def 
_set_default_gpus_pred_type(self): + self.pred_type = 'response' if not self.pred_type else self.pred_type + self.is_response = True if self.pred_type == 'response' else False + self.gpus_per_host = 0 if self.gpus_per_host is None else self.gpus_per_host + + + def call_internal_keras(self): + if self.is_response: + pred_col_name = add_postfix("estimated_", self.dependent_varname) + pred_col_type = self.dependent_vartype + else: + pred_col_name = "prob" + pred_col_type = 'double precision' + + intermediate_col = unique_string() + class_values = strip_trailing_nulls_from_class_values(self.class_values) + + prediction_select_clause = create_cols_from_array_sql_string( + class_values, intermediate_col, pred_col_name, + pred_col_type, self.is_response, self.module_name) + gp_segment_id_col, seg_ids_test, \ + images_per_seg_test = get_image_count_per_seg_for_non_minibatched_data_from_db( + self.test_table) + segments_per_host = get_segments_per_host() + + predict_query = plpy.prepare(""" + CREATE TABLE {self.output_table} AS + SELECT {self.id_col}, {prediction_select_clause} + FROM ( + SELECT {self.test_table}.{self.id_col}, + ({self.schema_madlib}.internal_keras_predict + ({self.independent_varname}, + $1, + $2, + {self.is_response}, + {self.normalizing_const}, + {gp_segment_id_col}, + ARRAY{seg_ids_test}, + ARRAY{images_per_seg_test}, + {self.gpus_per_host}, + {segments_per_host}) + ) AS {intermediate_col} + FROM {self.test_table} + ) q + """.format(self=self, prediction_select_clause=prediction_select_clause, + seg_ids_test=seg_ids_test, + images_per_seg_test=images_per_seg_test, + gp_segment_id_col=gp_segment_id_col, + segments_per_host=segments_per_host, + intermediate_col=intermediate_col), + ["text", "bytea"]) + plpy.execute(predict_query, [self.model_arch, self.model_weights]) + + def set_default_class_values(self, class_values): + self.class_values = class_values + if self.pred_type == 'prob': + return + if self.class_values is None: + num_classes = 
get_num_classes(self.model_arch) + self.class_values = range(0, num_classes) @MinWarning("warning") -def predict(schema_madlib, model_table, test_table, id_col, - independent_varname, output_table, pred_type, gpus_per_host, **kwargs): - if not pred_type: - pred_type = 'response' - input_validator = PredictInputValidator( - test_table, model_table, id_col, independent_varname, - output_table, pred_type, MODULE_NAME) - - param_proc = PredictParamsProcessor(model_table, MODULE_NAME) - class_values = param_proc.get_class_values() - input_validator.validate_pred_type(class_values) - dependent_varname = param_proc.get_dependent_varname() - dependent_vartype = param_proc.get_dependent_vartype() - model_data = param_proc.get_model_data() - model_arch = param_proc.get_model_arch() - normalizing_const = param_proc.get_normalizing_const() - input_shape = get_input_shape(model_arch) - input_validator.validate_input_shape(input_shape) - - is_response = True if pred_type == 'response' else False - intermediate_col = unique_string() - if is_response: - pred_col_name = add_postfix("estimated_", dependent_varname) - pred_col_type = dependent_vartype - else: - pred_col_name = "prob" - pred_col_type = 'double precision' - - class_values = strip_trailing_nulls_from_class_values(class_values) - - prediction_select_clause = create_cols_from_array_sql_string( - class_values, intermediate_col, pred_col_name, - pred_col_type, is_response, MODULE_NAME) - - gp_segment_id_col, seg_ids_test, \ - images_per_seg_test = get_image_count_per_seg_for_non_minibatched_data_from_db(test_table) - segments_per_host = get_segments_per_host() - - predict_query = plpy.prepare(""" - CREATE TABLE {output_table} AS - SELECT {id_col}, {prediction_select_clause} - FROM ( - SELECT {test_table}.{id_col}, - ({schema_madlib}.internal_keras_predict - ({independent_varname}, - $1, - $2, - {is_response}, - {normalizing_const}, - {gp_segment_id_col}, - ARRAY{seg_ids_test}, - ARRAY{images_per_seg_test}, - 
{gpus_per_host}, - {segments_per_host}) - ) AS {intermediate_col} - FROM {test_table} - ) q - """.format(**locals()), ["text", "bytea"]) - plpy.execute(predict_query, [model_arch, model_data]) +class Predict(BasePredict): + def __init__(self, schema_madlib, model_table, + test_table, id_col, independent_varname, + output_table, pred_type, gpus_per_host, + **kwargs): + + self.module_name = 'madlib_keras_predict' + self.model_table = model_table + if self.model_table: + self.model_summary_table = add_postfix(self.model_table, "_summary") + + BasePredict.__init__(self, schema_madlib, model_table, test_table, + id_col, independent_varname, + output_table, pred_type, + gpus_per_host) + param_proc = PredictParamsProcessor(model_table, self.module_name) + self.dependent_vartype = param_proc.get_dependent_vartype() + self.model_weights = param_proc.get_model_data() + self.model_arch = param_proc.get_model_arch() + class_values = param_proc.get_class_values() + self.set_default_class_values(class_values) + self.normalizing_const = param_proc.get_normalizing_const() + self.dependent_varname = param_proc.get_dependent_varname() + + self.validate() + BasePredict.call_internal_keras(self) + + def validate(self): + InputValidator.validate_predict_evaluate_tables( + self.module_name, self.model_table, self.model_summary_table, + self.test_table, self.output_table, self.independent_varname) + + InputValidator.validate_id_in_test_tbl( + self.module_name, self.test_table, self.id_col) + + InputValidator.validate_class_values( + self.module_name, self.class_values, self.pred_type, self.model_arch) + input_shape = get_input_shape(self.model_arch) + InputValidator.validate_pred_type( + self.module_name, self.pred_type, self.class_values) + InputValidator.validate_input_shape( + self.test_table, self.independent_varname, input_shape, 1) + +@MinWarning("warning") +class PredictBYOM(BasePredict): + def __init__(self, schema_madlib, model_arch_table, model_arch_id, + test_table, id_col, 
independent_varname, output_table, + pred_type, gpus_per_host, class_values, normalizing_const, + **kwargs): + self.module_name='madlib_keras_predict_byom' + self.model_arch_table = model_arch_table + self.model_arch_id = model_arch_id + self.class_values = class_values + self.normalizing_const = normalizing_const + self.dependent_varname = 'dependent_var' + BasePredict.__init__(self, schema_madlib, model_arch_table, + test_table, id_col, independent_varname, + output_table, pred_type, gpus_per_host) + if self.is_response: + self.dependent_vartype = 'text' + else: + self.dependent_vartype = 'double precision' + ## Set default values for norm const and class_values + # gpus_per_host and pred_type are defaulted in base_predict's init + self.normalizing_const = normalizing_const + if self.normalizing_const is None: + self.normalizing_const = DEFAULT_NORMALIZING_CONST + InputValidator.validate_predict_byom_tables( + self.module_name, self.model_arch_table, self.model_arch_id, + self.test_table, self.id_col, self.output_table, + self.independent_varname) + self.validate_and_set_defaults() + BasePredict.call_internal_keras(self) + + def validate_and_set_defaults(self): + # Set some defaults first and then validate and then set some more defaults + self.model_arch, self.model_weights = get_model_arch_weights( + quote_ident(self.model_arch_table), self.model_arch_id) + # Assert model_weights and model_arch are not empty. 
def predict_byom_help(schema_madlib, message, **kwargs):
    """Return help/usage text for madlib_keras_predict_byom.

    Args:
        @param schema_madlib: str, Schema in which MADlib is installed
        @param message: string, Help message string
        @param kwargs

    Returns:
        String. Help/usage information
    """
    if not message:
        help_string = """
-----------------------------------------------------------------------
                            SUMMARY
-----------------------------------------------------------------------
This function allows the user to predict with their own pre-trained model
(note that this model doesn't have to be trained using MADlib).

For more details on function usage:
    SELECT {schema_madlib}.madlib_keras_predict_byom('usage')
    """
    elif message in ['usage', 'help', '?']:
        help_string = """
-----------------------------------------------------------------------
                            USAGE
-----------------------------------------------------------------------
 SELECT {schema_madlib}.madlib_keras_predict_byom(
    model_arch_table,       -- Name of the table containing the model
                               architecture and the pre-trained model weights
    model_arch_id,          -- This is the id in 'model_arch_table' containing
                               the model architecture
    test_table,             -- Name of the table containing the evaluation
                               dataset
    id_col,                 -- Name of the id column in the test data table
    independent_varname,    -- Name of the column with independent
                               variables in the test table
    output_table,           -- Name of the output table
    pred_type,              -- The type of the desired output
    gpus_per_host,          -- Number of GPUs per segment host to
                               be used for training
    class_values,           -- List of class labels that were used while
                               training the model. If class_values is passed in
                               as NULL, the output table will have a column
                               named 'prob' which is an array of probabilities
                               of all the classes.
                               Otherwise if class_values is not NULL, then the
                               output table will contain a column for each
                               class/label from the training data
    normalizing_const       -- Normalizing constant used for standardizing
                               arrays in independent_varname
    );

-----------------------------------------------------------------------
                            OUTPUT
-----------------------------------------------------------------------
The output table ('output_table' above) contains the following columns:

id:                       Gives the 'id' for each prediction, corresponding
                          to each row from the test_table.
estimated_dependent_var:  (For pred_type='response') The estimated class for
                          classification. If class_values is passed in as
                          NULL, then we assume that the class labels are
                          [0,1,2...,n] where n is the number of classes in the
                          model architecture.
prob_CLASS:               (For pred_type='prob' for classification) The
                          probability of a given class.
                          If class_values is passed in as NULL, we create just
                          one column called 'prob' which is an array of
                          probabilities of all the classes.
                          Otherwise if class_values is not NULL, then there
                          will be one column for each class in the training
                          data.
"""
    else:
        help_string = "No such option. Use {schema_madlib}.madlib_keras_predict_byom()"

    return help_string.format(schema_madlib=schema_madlib)
We also - need to consider that sql array indexes start from 1 - For ex if the image is of shape [32,32,3] and is minibatched, the image will - look like [10, 32, 32, 3]. The offset in this case is 1 (start the index at 1) + - 1 (ignore the buffer size 10) = 2. - If the image is not batched then it will look like [32, 32 ,3] and the offset in - this case is 1 (start the index at 1). - """ - array_upper_query = ", ".join("array_upper({0}, {1}) AS n_{2}".format( - independent_varname, i+offset, i) for i in range(len(input_shape))) - query = """ - SELECT {0} - FROM {1} - LIMIT 1 - """.format(array_upper_query, table) - # This query will fail if an image in independent var does not have the - # same number of dimensions as the input_shape. - result = plpy.execute(query)[0] - _assert(len(result) == len(input_shape), - "model_keras error: The number of dimensions ({0}) of each image" - " in model architecture and {1} in {2} ({3}) do not match.".format( - len(input_shape), independent_varname, table, len(result))) - for i in range(len(input_shape)): - key_name = "n_{0}".format(i) - if result[key_name] != input_shape[i]: - # Construct the shape in independent varname to display - # meaningful error msg. 
class InputValidator:
    """Collection of static validation helpers shared by the deep-learning
    predict/evaluate/fit entry points.

    All methods raise (via _assert / plpy.error) on invalid input and return
    None on success.
    """

    @staticmethod
    def validate_predict_evaluate_tables(
            module_name, model_table, model_summary_table, test_table,
            output_table, independent_varname):
        """Validate all tables involved in predict/evaluate on a MADlib model."""
        InputValidator._validate_model_data_tbl(module_name, model_table)
        InputValidator._validate_model_summary_tbl(
            module_name, model_summary_table)
        InputValidator._validate_test_tbl(
            module_name, test_table, independent_varname)
        output_tbl_valid(output_table, module_name)

    @staticmethod
    def validate_id_in_test_tbl(module_name, test_table, id_col):
        """Assert that id_col exists in test_table."""
        _assert(is_var_valid(test_table, id_col),
                "{module_name} error: invalid id column "
                "('{id_col}') for test table ({table}).".format(
                    module_name=module_name,
                    id_col=id_col,
                    table=test_table))

    @staticmethod
    def validate_predict_byom_tables(module_name, model_arch_table,
                                     model_arch_id, test_table, id_col,
                                     output_table, independent_varname):
        """Validate all tables involved in predict with a user-supplied model."""
        InputValidator.validate_model_arch_table(
            module_name, model_arch_table, model_arch_id)
        InputValidator._validate_test_tbl(
            module_name, test_table, independent_varname)
        InputValidator.validate_id_in_test_tbl(module_name, test_table, id_col)
        output_tbl_valid(output_table, module_name)

    @staticmethod
    def validate_pred_type(module_name, pred_type, class_values):
        """Assert pred_type is one of the two supported values."""
        if not pred_type in ['prob', 'response']:
            plpy.error("{0}: Invalid value for pred_type param ({1}). Must be "
                       "either response or prob.".format(module_name, pred_type))

    @staticmethod
    def validate_input_shape(table, independent_varname, input_shape, offset):
        """
        Validate if the input shape specified in model architecture is the same
        as the shape of the image specified in the indepedent var of the input
        table.
        offset: This offset is the index of the start of the image array. We also
        need to consider that sql array indexes start from 1
        For ex if the image is of shape [32,32,3] and is minibatched, the image will
        look like [10, 32, 32, 3]. The offset in this case is 1 (start the index at 1) +
        1 (ignore the buffer size 10) = 2.
        If the image is not batched then it will look like [32, 32 ,3] and the offset in
        this case is 1 (start the index at 1).
        """
        array_upper_query = ", ".join("array_upper({0}, {1}) AS n_{2}".format(
            independent_varname, i+offset, i) for i in range(len(input_shape)))
        query = """
            SELECT {0}
            FROM {1}
            LIMIT 1
        """.format(array_upper_query, table)
        # This query will fail if an image in independent var does not have the
        # same number of dimensions as the input_shape.
        result = plpy.execute(query)[0]
        _assert(len(result) == len(input_shape),
                "model_keras error: The number of dimensions ({0}) of each image"
                " in model architecture and {1} in {2} ({3}) do not match.".format(
                    len(input_shape), independent_varname, table, len(result)))
        for i in range(len(input_shape)):
            key_name = "n_{0}".format(i)
            if result[key_name] != input_shape[i]:
                # Construct the shape in independent varname to display
                # meaningful error msg.
                input_shape_from_table = [result["n_{0}".format(i)]
                                          for i in range(len(input_shape))]
                plpy.error("model_keras error: Input shape {0} in the model"
                           " architecture does not match the input shape {1} of column"
                           " {2} in table {3}.".format(
                               input_shape, input_shape_from_table,
                               independent_varname, table))

    @staticmethod
    def validate_model_arch_table(module_name, model_arch_table, model_arch_id):
        """Assert the architecture table exists and an id was supplied."""
        input_tbl_valid(model_arch_table, module_name)
        _assert(model_arch_id is not None,
                "{0}: Invalid model architecture ID.".format(module_name))

    @staticmethod
    def validate_normalizing_const(module_name, normalizing_const):
        """Assert the normalizing constant is strictly positive."""
        _assert(normalizing_const > 0,
                "{0} error: Normalizing constant has to be greater than 0.".
                format(module_name))

    @staticmethod
    def validate_class_values(module_name, class_values, pred_type, model_arch):
        """Assert class_values (if given) matches the architecture's output
        size and does not exceed the PostgreSQL column limit."""
        if not class_values:
            return
        num_classes = len(class_values)
        _assert(num_classes == get_num_classes(model_arch),
                "{0}: The number of class values do not match the "
                "provided architecture.".format(module_name))
        if pred_type == 'prob' and num_classes+1 >= 1600:
            # Fix: pass the message string directly; wrapping it in {...}
            # created a set literal and garbled the reported error.
            plpy.error("{0}: The output will have {1} columns, exceeding the "
                       " max number of columns that can be created (1600)".format(
                           module_name, num_classes+1))

    @staticmethod
    def validate_model_weights(module_name, model_arch, model_weights):
        """Assert both the architecture and its weights are non-empty."""
        _assert(model_weights and model_arch,
                "{0}: Model weights and architecture must be valid.".format(
                    module_name))

    @staticmethod
    def _validate_model_data_tbl(module_name, model_table):
        # The trained-model table must carry both the serialized weights and
        # the architecture column.
        input_tbl_valid(model_table, module_name)
        _assert(is_var_valid(model_table, MODEL_DATA_COLNAME),
                "{module_name} error: column '{model_data}' "
                "does not exist in model table '{table}'.".format(
                    module_name=module_name,
                    model_data=MODEL_DATA_COLNAME,
                    table=model_table))
        _assert(is_var_valid(model_table, ModelArchSchema.MODEL_ARCH),
                "{module_name} error: column '{model_arch}' "
                "does not exist in model table '{table}'.".format(
                    module_name=module_name,
                    model_arch=ModelArchSchema.MODEL_ARCH,
                    table=model_table))

    @staticmethod
    def _validate_test_tbl(module_name, test_table, independent_varname):
        # Test table must exist and contain the independent variable column.
        input_tbl_valid(test_table, module_name)
        _assert(is_var_valid(test_table, independent_varname),
                "{module_name} error: invalid independent_varname "
                "('{independent_varname}') for test table "
                "({table}).".format(
                    module_name=module_name,
                    independent_varname=independent_varname,
                    table=test_table))

    @staticmethod
    def _validate_model_summary_tbl(module_name, model_summary_table):
        # Summary table must expose every column predict/evaluate reads back.
        input_tbl_valid(model_summary_table, module_name)
        cols_to_check_for = [CLASS_VALUES_COLNAME,
                             DEPENDENT_VARNAME_COLNAME,
                             DEPENDENT_VARTYPE_COLNAME,
                             MODEL_ARCH_ID_COLNAME,
                             MODEL_ARCH_TABLE_COLNAME,
                             NORMALIZING_CONST_COLNAME,
                             COMPILE_PARAMS_COLNAME,
                             METRIC_TYPE_COLNAME]
        _assert(columns_exist_in_table(
            model_summary_table, cols_to_check_for),
            "{0} error: One or more expected columns missing in model "
            "summary table ('{1}'). The expected columns are {2}.".format(
                module_name, model_summary_table, cols_to_check_for))
Must be "\ - "either response or prob.".format(self.module_name, self.pred_type)) - if self.pred_type == 'prob' and class_values and len(class_values)+1 >= 1600: - plpy.error({"{0}: The output will have {1} columns, exceeding the "\ - " max number of columns that can be created (1600)".format( - self.module_name, len(class_values)+1)}) - def validate_input_shape(self, input_shape_from_arch): - _validate_input_shapes(self.test_table, self.independent_varname, - input_shape_from_arch, 1) - def _validate_test_tbl_cols(self): - InputValidator._validate_test_tbl_cols(self) - _assert(is_var_valid(self.test_table, self.id_col), - "{module_name} error: invalid id column " - "('{id_col}') for test table ({table}).".format( - module_name=self.module_name, - id_col=self.id_col, - table=self.test_table)) class FitInputValidator: def __init__(self, source_table, validation_table, output_model_table, - model_arch_table, dependent_varname, independent_varname, - num_iterations, metrics_compute_frequency, warm_start): + model_arch_table, model_arch_id, dependent_varname, + independent_varname, num_iterations, + metrics_compute_frequency, warm_start): self.source_table = source_table self.validation_table = validation_table self.output_model_table = output_model_table self.model_arch_table = model_arch_table + self.model_arch_id = model_arch_id self.dependent_varname = dependent_varname self.independent_varname = independent_varname self.metrics_compute_frequency = metrics_compute_frequency @@ -236,30 +232,6 @@ class FitInputValidator: self.module_name = 'madlib_keras_fit' self._validate_input_args() - def _validate_input_table(self, table): - _assert(is_var_valid(table, self.independent_varname), - "{module_name}: invalid independent_varname " - "('{independent_varname}') for table ({table}). 
" - "Please ensure that the input table ({table}) " - "has been preprocessed by the image preprocessor.".format( - module_name=self.module_name, - independent_varname=self.independent_varname, - table=table)) - - _assert(is_var_valid(table, self.dependent_varname), - "{module_name}: invalid dependent_varname " - "('{dependent_varname}') for table ({table}). " - "Please ensure that the input table ({table}) " - "has been preprocessed by the image preprocessor.".format( - module_name=self.module_name, - dependent_varname=self.dependent_varname, - table=table)) - - def _is_valid_metrics_compute_frequency(self): - return self.metrics_compute_frequency is None or \ - (self.metrics_compute_frequency >= 1 and \ - self.metrics_compute_frequency <= self.num_iterations) - def _validate_input_args(self): _assert(self.num_iterations > 0, "{0}: Number of iterations cannot be < 1.".format(self.module_name)) @@ -281,8 +253,8 @@ class FitInputValidator: self.dependent_varname) self._validate_validation_table() - - input_tbl_valid(self.model_arch_table, self.module_name) + InputValidator.validate_model_arch_table(self.module_name, self.model_arch_table, + self.model_arch_id) if self.warm_start: input_tbl_valid(self.output_model_table, self.module_name) input_tbl_valid(self.output_summary_model_table, self.module_name) @@ -290,6 +262,31 @@ class FitInputValidator: output_tbl_valid(self.output_model_table, self.module_name) output_tbl_valid(self.output_summary_model_table, self.module_name) + def _validate_input_table(self, table): + _assert(is_var_valid(table, self.independent_varname), + "{module_name}: invalid independent_varname " + "('{independent_varname}') for table ({table}). 
" + "Please ensure that the input table ({table}) " + "has been preprocessed by the image preprocessor.".format( + module_name=self.module_name, + independent_varname=self.independent_varname, + table=table)) + + _assert(is_var_valid(table, self.dependent_varname), + "{module_name}: invalid dependent_varname " + "('{dependent_varname}') for table ({table}). " + "Please ensure that the input table ({table}) " + "has been preprocessed by the image preprocessor.".format( + module_name=self.module_name, + dependent_varname=self.dependent_varname, + table=table)) + + def _is_valid_metrics_compute_frequency(self): + return self.metrics_compute_frequency is None or \ + (self.metrics_compute_frequency >= 1 and \ + self.metrics_compute_frequency <= self.num_iterations) + + def _validate_validation_table(self): if self.validation_table and self.validation_table.strip() != '': @@ -305,9 +302,10 @@ class FitInputValidator: def validate_input_shapes(self, input_shape): - _validate_input_shapes(self.source_table, self.independent_varname, + InputValidator.validate_input_shape(self.source_table, self.independent_varname, input_shape, 2) if self.validation_table: - _validate_input_shapes( + InputValidator.validate_input_shape( self.validation_table, self.independent_varname, input_shape, 2) + diff --git a/src/ports/postgres/modules/utilities/model_arch_info.py_in b/src/ports/postgres/modules/deep_learning/model_arch_info.py_in similarity index 66% rename from src/ports/postgres/modules/utilities/model_arch_info.py_in rename to src/ports/postgres/modules/deep_learning/model_arch_info.py_in index a03594a..c749144 100644 --- a/src/ports/postgres/modules/utilities/model_arch_info.py_in +++ b/src/ports/postgres/modules/deep_learning/model_arch_info.py_in @@ -22,6 +22,7 @@ m4_changequote(`<!', `!>') import sys import json import plpy +from keras_model_arch_table import ModelArchSchema def _get_layers(model_arch): d = json.loads(model_arch) @@ -41,6 +42,22 @@ def 
def get_model_arch_weights(model_arch_table, model_arch_id):
    """Fetch the architecture JSON and serialized weights for one model id.

    Assumes the table name and id have already been validated by the caller.
    Raises (via plpy.error) if no row matches the id.
    """
    query = "SELECT {0}, {1} FROM {2} WHERE {3} = {4}".format(
        ModelArchSchema.MODEL_ARCH, ModelArchSchema.MODEL_WEIGHTS,
        model_arch_table, ModelArchSchema.MODEL_ID,
        model_arch_id)
    rows = plpy.execute(query)
    if not rows:
        plpy.error("no model arch found in table {0} with id {1}".format(
            model_arch_table, model_arch_id))
    row = rows[0]
    return row[ModelArchSchema.MODEL_ARCH], row[ModelArchSchema.MODEL_WEIGHTS]
= 'cifar10_predict'::regclass - AND attname = 'id'; +SELECT assert(UPPER(pg_typeof(id)::TEXT )= 'INTEGER', + 'id column should be INTEGER type') FROM cifar10_predict; -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof(estimated_y)::TEXT) = 'SMALLINT', 'prediction column should be SMALLINT type') - FROM pg_attribute WHERE attrelid = 'cifar10_predict'::regclass - AND attname = 'estimated_y'; +FROM cifar10_predict; -- Validate correct number of rows returned. -SELECT assert(COUNT(*)=2, 'Output table of madlib_keras_predict should have two rows') FROM cifar10_predict; +SELECT assert(COUNT(*)=2, 'Output table of madlib_keras_predict should have two rows') +FROM cifar10_predict; -- First test that all values are in set of class values; if this breaks, it's definitely a problem. SELECT assert(estimated_y IN (0,1), - 'Predicted value not in set of defined class values for model') + 'Predicted value not in set of defined class values for model') FROM cifar10_predict; DROP TABLE IF EXISTS cifar10_predict; @@ -512,15 +511,13 @@ SELECT madlib_keras_predict( 'prob', 0); -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof(prob_0)::TEXT) = 'DOUBLE PRECISION', 'column prob_0 should be double precision type') - FROM pg_attribute WHERE attrelid = 'cifar10_predict'::regclass - AND attname = 'prob_0'; +FROM cifar10_predict; -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof(prob_1)::TEXT) = 'DOUBLE PRECISION', 'column prob_1 should be double precision type') - FROM pg_attribute WHERE attrelid = 'cifar10_predict'::regclass - AND attname = 'prob_1'; +FROM cifar10_predict; SELECT assert(COUNT(*)=3, 'Predict out table must have exactly three cols.') FROM pg_attribute @@ -616,20 +613,17 @@ SELECT madlib_keras_predict( -- Validate the output datatype of newly created prediction columns -- for prediction type = 'prob' and class_values 'TEXT' with NULL as a valid -- class_values -SELECT 
assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof(prob_cat)::TEXT) = 'DOUBLE PRECISION', 'column prob_cat should be double precision type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'prob_cat'; +FROM cifar10_predict; -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof(prob_dog)::TEXT) = 'DOUBLE PRECISION', 'column prob_dog should be double precision type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'prob_dog'; +FROM cifar10_predict; -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof("prob_NULL")::TEXT) = 'DOUBLE PRECISION', 'column prob_NULL should be double precision type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'prob_NULL'; +FROM cifar10_predict; -- Must have exactly 4 cols (3 for class_values and 1 for id) SELECT assert(COUNT(*)=4, 'Predict out table must have exactly four cols.') @@ -650,11 +644,10 @@ SELECT madlib_keras_predict( -- Validate the output datatype of newly created prediction columns -- for prediction type = 'response' and class_values 'TEXT' with NULL -- as a valid class_values -SELECT assert(UPPER(atttypid::regtype::TEXT) = - 'TEXT', 'prediction column should be TEXT type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass - AND attname = 'estimated_y'; +SELECT assert(UPPER(pg_typeof(estimated_y_text)::TEXT) = 'TEXT', + 'prediction column should be TEXT type') +FROM cifar10_predict LIMIT 1; + -- Tests where the assumption is user has one-hot encoded, so class_values -- in input summary table will be NULL. 
@@ -674,10 +667,9 @@ SELECT madlib_keras_predict( -- Validate the output datatype of newly created prediction column -- for prediction type = 'response' and class_value = NULL -- Returns: Array of probabilities for user's one-hot encoded data -SELECT assert(UPPER(atttypid::regtype::TEXT) = - 'DOUBLE PRECISION[]', 'column prob should be double precision[] type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'prob'; +SELECT assert(UPPER(pg_typeof(prob)::TEXT) = 'DOUBLE PRECISION[]', + 'column prob should be double precision[] type') +FROM cifar10_predict LIMIT 1; -- Predict with pred_type=response DROP TABLE IF EXISTS cifar10_predict; @@ -694,11 +686,14 @@ SELECT madlib_keras_predict( -- for prediction type = 'response' and class_value = NULL -- Returns: Index of class value in user's one-hot encoded data with -- highest probability -SELECT assert(UPPER(atttypid::regtype::TEXT) = - 'DOUBLE PRECISION', 'prediction column should be double precision type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass - AND attname = 'estimated_y'; +SELECT assert(UPPER(pg_typeof(estimated_y_text)::TEXT) = 'TEXT', + 'column estimated_y_text should be text type') +FROM cifar10_predict LIMIT 1; + +SELECT assert( + estimated_y_text IN ('0', '1'), + 'Predict failure for null class value and response pred_type.') +FROM cifar10_predict; -- Test predict with INTEGER class_values -- with NULL as a valid class value @@ -747,13 +742,11 @@ SELECT madlib_keras_predict( -- Validate the output datatype of newly created prediction column -- for prediction type = 'prob' and class_values 'INT' with NULL -- as a valid class_values -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof("prob_NULL")::TEXT) = 'DOUBLE PRECISION', 'column prob_NULL should be double precision type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'prob_NULL'; - +FROM cifar10_predict; -- Must have exactly 6 cols (5 for 
class_values and 1 for id) -SELECT assert(COUNT(*)=6, 'Predict out table must have exactly four cols.') +SELECT assert(COUNT(*)=6, 'Predict out table must have exactly six cols.') FROM pg_attribute WHERE attrelid='cifar10_predict'::regclass AND attnum>0; @@ -772,10 +765,9 @@ SELECT madlib_keras_predict( -- for prediction type = 'response' and class_values 'TEXT' with NULL -- as a valid class_values -- Returns: class_value with highest probability -SELECT assert(UPPER(atttypid::regtype::TEXT) = +SELECT assert(UPPER(pg_typeof(estimated_y)::TEXT) = 'SMALLINT', 'prediction column should be smallint type') -FROM pg_attribute -WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'estimated_y'; +FROM cifar10_predict; -- Test case with a different input shape (3, 32, 32) instead of (32, 32, 3). -- Create a new table with image shape 3, 32, 32 @@ -1066,6 +1058,27 @@ SELECT madlib_keras_fit('iris_data_packed', -- source table 1 -- metrics_compute_frequency ); +DROP TABLE IF EXISTS iris_train, iris_test; +-- Set seed so results are reproducible +SELECT setseed(0); +SELECT train_test_split('iris_data', -- Source table + 'iris', -- Output table root name + 0.8, -- Train proportion + NULL, -- Test proportion (0.2) + NULL, -- Strata definition + NULL, -- Output all columns + NULL, -- Sample without replacement + TRUE -- Separate output tables + ); + +DROP TABLE IF EXISTS iris_predict; +SELECT madlib_keras_predict('iris_model', -- model + 'iris_test', -- test_table + 'id', -- id column + 'attributes', -- independent var + 'iris_predict' -- output table + ); + -- Test that our code is indeed learning something and not broken. The loss -- from the first iteration should be less than the 5th, while the accuracy -- must be greater. 
@@ -1179,3 +1192,96 @@ SELECT assert( abs(first.training_metrics_final-second.training_metrics[2]) < 1e-10, 'Transfer learning test failed because training loss and metrics don''t match the expected value.') FROM iris_model_first_run AS first, iris_model_transfer_summary AS second; + +---------------------- Predict BYOM test -------------------------------- + +-- class_values not NULL, pred_type is response +DROP TABLE IF EXISTS iris_predict_byom; +SELECT madlib_keras_predict_byom( + 'iris_model_arch', + 2, + 'iris_test', + 'id', + 'attributes', + 'iris_predict_byom', + 'response', + -1, + ARRAY['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica'] + ); + +SELECT assert( + p0.estimated_class_text = p1.estimated_dependent_var, + 'Predict byom failure for non null class value and response pred_type.') +FROM iris_predict AS p0, iris_predict_byom AS p1 +WHERE p0.id=p1.id; +SELECT assert(UPPER(pg_typeof(estimated_dependent_var)::TEXT) = 'TEXT', + 'Predict byom failure for non null class value and response pred_type. + Expeceted estimated_dependent_var to be of type TEXT') +FROM iris_predict_byom LIMIT 1; + +-- class_values NULL, pred_type is NULL (response) +DROP TABLE IF EXISTS iris_predict_byom; +SELECT madlib_keras_predict_byom( + 'iris_model_arch', + 2, + 'iris_test', + 'id', + 'attributes', + 'iris_predict_byom' + ); +SELECT assert( + p1.estimated_dependent_var IN ('0', '1', '2'), + 'Predict byom failure for null class value and null pred_type.') +FROM iris_predict_byom AS p1; +SELECT assert(UPPER(pg_typeof(estimated_dependent_var)::TEXT) = 'TEXT', + 'Predict byom failure for non null class value and response pred_type. 
+ Expeceted estimated_dependent_var to be of type TEXT') +FROM iris_predict_byom LIMIT 1; + +-- class_values not NULL, pred_type is prob +DROP TABLE IF EXISTS iris_predict_byom; +SELECT madlib_keras_predict_byom( + 'iris_model_arch', + 2, + 'iris_test', + 'id', + 'attributes', + 'iris_predict_byom', + 'prob', + -1, + ARRAY['Iris-setosa', 'Iris-versicolor', + 'Iris-virginica'], + 1.0 + ); + +SELECT assert( + (p1."prob_Iris-setosa" + p1."prob_Iris-virginica" + p1."prob_Iris-versicolor") - 1 < 1e-6, + 'Predict byom failure for non null class value and prob pred_type.') +FROM iris_predict_byom AS p1; +SELECT assert(UPPER(pg_typeof("prob_Iris-setosa")::TEXT) = 'DOUBLE PRECISION', + 'Predict byom failure for non null class value and prob pred_type. + Expeceted "prob_Iris-setosa" to be of type DOUBLE PRECISION') +FROM iris_predict_byom LIMIT 1; + +-- class_values NULL, pred_type is prob +DROP TABLE IF EXISTS iris_predict_byom; +SELECT madlib_keras_predict_byom( + 'iris_model_arch', + 2, + 'iris_test', + 'id', + 'attributes', + 'iris_predict_byom', + 'prob', + 0, + NULL + ); +SELECT assert( + (prob[1] + prob[2] + prob[3]) - 1 < 1e-6, + 'Predict byom failure for null class value and prob pred_type.') +FROM iris_predict_byom; +SELECT assert(UPPER(pg_typeof(prob)::TEXT) = 'DOUBLE PRECISION[]', + 'Predict byom failure for null class value and prob pred_type. 
Expected prob to
test_predictbyom_defaults_1(self): + res = self.module.PredictBYOM('schema_madlib', 'model_arch_table', + 'model_arch_id', 'test_table', 'id_col', + 'independent_varname', 'output_table', None, + None, None, None) + self.assertEqual('response', res.pred_type) + self.assertEqual(0, res.gpus_per_host) + self.assertEqual([0,1,2,3,4], res.class_values) + self.assertEqual(1.0, res.normalizing_const) + self.assertEqual('text', res.dependent_vartype) + + def test_predictbyom_defaults_2(self): + res = self.module.PredictBYOM('schema_madlib', 'model_arch_table', + 'model_arch_id', 'test_table', 'id_col', + 'independent_varname', 'output_table', + self.pred_type, self.gpus_per_host, + self.class_values, self.normalizing_const) + self.assertEqual('prob', res.pred_type) + self.assertEqual(2, res.gpus_per_host) + self.assertEqual(['foo', 'bar', 'baaz', 'foo2', 'bar2'], res.class_values) + self.assertEqual(255.0, res.normalizing_const) + self.assertEqual('double precision', res.dependent_vartype) + + def test_predictbyom_exception_invalid_params(self): + with self.assertRaises(plpy.PLPYException) as error: + self.module.PredictBYOM('schema_madlib', 'model_arch_table', + 'model_arch_id', 'test_table', 'id_col', + 'independent_varname', 'output_table', + 'invalid_pred_type', self.gpus_per_host, + self.class_values, self.normalizing_const) + self.assertIn('invalid_pred_type', str(error.exception)) + + with self.assertRaises(plpy.PLPYException) as error: + self.module.PredictBYOM('schema_madlib', 'model_arch_table', + 'model_arch_id', 'test_table', 'id_col', + 'independent_varname', 'output_table', + self.pred_type, self.gpus_per_host, + ["foo", "bar", "baaz"], self.normalizing_const) + self.assertIn('class values', str(error.exception).lower()) + + with self.assertRaises(plpy.PLPYException) as error: + self.module.PredictBYOM('schema_madlib', 'model_arch_table', + 'model_arch_id', 'test_table', 'id_col', + 'independent_varname', 'output_table', + self.pred_type, self.gpus_per_host, 
+ self.class_values, 0) + self.assertIn('normalizing const', str(error.exception).lower()) + + class MadlibKerasWrapperTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') @@ -748,58 +832,37 @@ class MadlibKerasFitInputValidatorTestCase(unittest.TestCase): def tearDown(self): self.module_patcher.stop() - def test_validate_input_shapes_shapes_do_not_match(self): - self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32}] - self.subject._validate_input_args = Mock() - with self.assertRaises(plpy.PLPYException): - self.subject._validate_input_shapes( - 'dummy_tbl', 'dummy_col', [32,32,3], 2) - - self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': 32, 'n_2': 32}] - with self.assertRaises(plpy.PLPYException): - self.subject._validate_input_shapes( - 'dummy_tbl', 'dummy_col', [32,32,3], 2) - - self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': None, 'n_2': None}] - with self.assertRaises(plpy.PLPYException): - self.subject._validate_input_shapes( - 'dummy_tbl', 'dummy_col', [3,32], 2) - - def test_validate_input_shapes_shapes_match(self): - self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32, 'n_2': 3}] - self.subject._validate_input_args = Mock() - self.subject._validate_input_shapes( - 'dummy_tbl', 'dummy_col', [32,32,3], 1) def test_is_valid_metrics_compute_frequency_True_None(self): self.subject.FitInputValidator._validate_input_args = Mock() obj = self.subject.FitInputValidator( - 'test_table', 'val_table', 'model_table', 'model_arch_table', + 'test_table', 'val_table', 'model_table', 'model_arch_table', 2, 'dep_varname', 'independent_varname', 5, None, False) self.assertEqual(True, obj._is_valid_metrics_compute_frequency()) def test_is_valid_metrics_compute_frequency_True_num(self): self.subject.FitInputValidator._validate_input_args = Mock() obj = self.subject.FitInputValidator( - 'test_table', 'val_table', 'model_table', 'model_arch_table', + 'test_table', 'val_table', 'model_table', 'model_arch_table', 2, 
'dep_varname', 'independent_varname', 5, 3, False) self.assertEqual(True, obj._is_valid_metrics_compute_frequency()) def test_is_valid_metrics_compute_frequency_False_zero(self): self.subject.FitInputValidator._validate_input_args = Mock() obj = self.subject.FitInputValidator( - 'test_table', 'val_table', 'model_table', 'model_arch_table', + 'test_table', 'val_table', 'model_table', 'model_arch_table', 2, 'dep_varname', 'independent_varname', 5, 0, False) self.assertEqual(False, obj._is_valid_metrics_compute_frequency()) def test_is_valid_metrics_compute_frequency_False_greater(self): self.subject.FitInputValidator._validate_input_args = Mock() obj = self.subject.FitInputValidator( - 'test_table', 'val_table', 'model_table', 'model_arch_table', + 'test_table', 'val_table', 'model_table', 'model_arch_table', 2, 'dep_varname', 'independent_varname', 5, 6, False) self.assertEqual(False, obj._is_valid_metrics_compute_frequency()) -class PredictInputValidatorTestCases(unittest.TestCase): + +class InputValidatorTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { @@ -813,34 +876,83 @@ class PredictInputValidatorTestCases(unittest.TestCase): self.module_patcher.start() import madlib_keras_validator self.module = madlib_keras_validator - self.module.PredictInputValidator._validate_input_args = Mock() - self.subject = self.module.PredictInputValidator( - 'test_table', 'model_table', 'id_col', 'independent_varname', - 'output_table', 'pred_type', 'module_name') + self.subject = self.module.InputValidator + + self.module_name = 'module' + self.test_table = 'test_table' + self.model_table = 'model_table' + self.id_col = 'id_col' + self.ind_var = 'ind_var' + self.model_arch_table = 'model_arch_table' + self.model_arch_id = 2 + self.num_classes = 1598 + self.model = Sequential() + self.model.add(Conv2D(2, kernel_size=(1, 1), activation='relu', + input_shape=(1,1,1,), padding='same')) + self.model.add(Dense(self.num_classes)) self.classes 
= ['train', 'boat', 'car', 'airplane'] def tearDown(self): self.module_patcher.stop() def test_validate_pred_type_invalid_pred_type(self): - self.subject.pred_type = 'invalid' + with self.assertRaises(plpy.PLPYException) as error: + self.subject.validate_pred_type( + self.module_name, 'invalid_pred_type', ['cat', 'dog']) + self.assertIn('type', str(error.exception).lower()) + + def test_validate_class_values_greater_than_1600_class_values(self): + self.model.add(Dense(1599)) + with self.assertRaises(plpy.PLPYException) as error: + self.subject.validate_class_values( + self.module_name, range(1599), 'prob', self.model.to_json()) + self.assertIn('1600', str(error.exception)) + + def test_validate_class_values_valid_class_values_prob(self): + self.subject.validate_class_values( + self.module_name, range(self.num_classes), 'prob', self.model.to_json()) + self.subject.validate_class_values( + self.module_name, None, 'prob', self.model.to_json()) + + def test_validate_class_values_valid_pred_type_valid_class_values_response(self): + self.subject.validate_class_values( + self.module_name, range(self.num_classes), 'response', self.model.to_json()) + self.subject.validate_class_values( + self.module_name, None, 'response', self.model.to_json()) + + def test_validate_input_shape_shapes_do_not_match(self): + self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32}] + with self.assertRaises(plpy.PLPYException): + self.subject.validate_input_shape( + self.test_table, self.ind_var, [32,32,3], 2) + + self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': 32, 'n_2': 32}] with self.assertRaises(plpy.PLPYException): - self.subject.validate_pred_type(['cat', 'dog']) + self.subject.validate_input_shape( + self.test_table, self.ind_var, [32,32,3], 2) - def test_validate_pred_type_valid_pred_type_invalid_num_class_values(self): - self.subject.pred_type = 'prob' + self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': None, 'n_2': None}] with 
self.assertRaises(plpy.PLPYException): - self.subject.validate_pred_type(range(1599)) + self.subject.validate_input_shape( + self.test_table, self.ind_var, [3,32], 2) - def test_validate_pred_type_valid_pred_type_valid_class_values_prob(self): - self.subject.pred_type = 'prob' - self.subject.validate_pred_type(range(1598)) - self.subject.validate_pred_type(None) + def test_validate_input_shape_shapes_match(self): + self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32, 'n_2': 3}] + self.subject.validate_input_shape( + self.test_table, self.ind_var, [32,32,3], 1) + + def test_validate_model_arch_table_none_values(self): + with self.assertRaises(plpy.PLPYException) as error: + obj = self.subject.validate_model_arch_table( + self.module_name, None, self.model_arch_id) + self.assertIn('null', str(error.exception).lower()) + + self.module.input_tbl_valid = Mock() + with self.assertRaises(plpy.PLPYException) as error: + obj = self.subject.validate_model_arch_table( + self.module_name, self.model_arch_table, None) + self.assertIn('id', str(error.exception).lower()) - def test_validate_pred_type_valid_pred_type_valid_class_values_response(self): - self.subject.pred_type = 'response' - self.subject.validate_pred_type(range(1598)) - self.subject.validate_pred_type(None) class MadlibSerializerTestCase(unittest.TestCase): def setUp(self): @@ -921,6 +1033,7 @@ class MadlibSerializerTestCase(unittest.TestCase): self.assertEqual(np.array([0,1,3,4,5], dtype=np.float32).tostring(), res) + class MadlibKerasHelperTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error')