[madlib] 04/04: DL: Cleanup fit and fit_multiple

nkak Tue, 09 Feb 2021 13:06:52 -0800

This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


commit a0f711cc50762ec30f1b5f5f5d435286f8e94248
Author: Nikhil Kak <n...@vmware.com>
AuthorDate: Fri Jan 29 18:29:31 2021 -0800

    DL: Cleanup fit and fit_multiple
    
    JIRA: MADLIB-1464
    
    Previously we were creating a columns_dict variable which contained the
    output of the packed summary table. This led to code being slightly
    harder to maintain and also some duplication.
    
    This commit removes this variable and the code now relies directly on the
    output of the summary table.
    
    Also renamed a few variables for consistency.
    
    Co-authored-by: Ekta Khanna <ekha...@vmware.com>
---
 .../modules/deep_learning/madlib_keras.py_in       | 127 +++++++++------------
 .../madlib_keras_fit_multiple_model.py_in          |  47 ++++----
 .../deep_learning/madlib_keras_validator.py_in     |  55 ++++-----
 .../deep_learning/test/madlib_keras_fit.sql_in     |  24 ++++
 .../test/unit_tests/test_madlib_keras.py_in        |  12 +-
 5 files changed, 131 insertions(+), 134 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index c4f8611..67f2a56 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -120,27 +120,10 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
         num_iterations, metrics_compute_frequency, warm_start,
         use_gpus, accessible_gpus_for_seg, object_table)
 
-    columns_dict = {}
-    columns_dict['mb_dep_var_cols'] = fit_validator.dependent_varname
-    columns_dict['mb_indep_var_cols'] = fit_validator.independent_varname
-    columns_dict['dep_shape_cols'] = fit_validator.dep_shape_cols
-    columns_dict['ind_shape_cols'] = fit_validator.ind_shape_cols
-    columns_dict['val_dep_var'] = fit_validator.val_dep_var
-    columns_dict['val_ind_var'] = fit_validator.val_ind_var
-    columns_dict['val_dep_shape_cols'] = fit_validator.val_dep_shape_cols
-    columns_dict['val_ind_shape_cols'] = fit_validator.val_ind_shape_cols
     multi_dep_count = len(fit_validator.dependent_varname)
-
-    # fit_validator.dependent_varname = columns_dict['mb_dep_var_cols']
-    # fit_validator.independent_varname = columns_dict['mb_indep_var_cols']
-    # fit_validator.dep_shape_col = columns_dict['dep_shape_cols']
-    # fit_validator.ind_shape_col = columns_dict['ind_shape_cols']
     src_summary_dict = fit_validator.src_summary_dict
-    class_values_colnames = [add_postfix(i, "_class_values") for i in 
columns_dict['mb_dep_var_cols']]
-    src_summary_dict['class_values_type'] =[ get_expr_type(
-        i, fit_validator.source_summary_table) for i in class_values_colnames]
-    src_summary_dict['norm_const_type'] = get_expr_type(
-        NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table)
+    class_values_colnames = [add_postfix(i, "_class_values") for i in
+                             fit_validator.dependent_varname]
 
     if metrics_compute_frequency is None:
         metrics_compute_frequency = num_iterations
@@ -172,10 +155,16 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
     serialized_weights = get_initial_weights(model, model_arch, model_weights,
                                              warm_start, 
accessible_gpus_for_seg)
     # Compute total images on each segment
-    dist_key_mapping, images_per_seg_train = 
get_image_count_per_seg_for_minibatched_data_from_db(source_table, 
columns_dict['dep_shape_cols'][0])
+    shape_col = fit_validator.dependent_shape_varname[0]
+    dist_key_mapping, images_per_seg_train = \
+        get_image_count_per_seg_for_minibatched_data_from_db(source_table,
+                                                             shape_col)
 
     if validation_table:
-        dist_key_mapping_val, images_per_seg_val = 
get_image_count_per_seg_for_minibatched_data_from_db(validation_table, 
columns_dict['dep_shape_cols'][0])
+        shape_col = fit_validator.val_dependent_shape_varname[0]
+        dist_key_mapping_val, images_per_seg_val = \
+            
get_image_count_per_seg_for_minibatched_data_from_db(validation_table,
+                                                                 shape_col)
 
     # Construct validation dataset if provided
     validation_set_provided = bool(validation_table)
@@ -198,31 +187,31 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
         plpy.error("Object table not specified for function {0} in 
compile_params".format(custom_fn_list))
 
     # Use the smart interface
-    if (len(columns_dict['mb_dep_var_cols']) <= 5 and
-        len(columns_dict['mb_indep_var_cols']) <= 5):
+    if (len(fit_validator.dependent_varname) <= 5 and
+        len(fit_validator.independent_varname) <= 5):
 
         dep_var_array = 5 * ["NULL"]
         indep_var_array = 5 * ["NULL"]
 
-        for counter, var in enumerate(columns_dict['mb_dep_var_cols']):
+        for counter, var in enumerate(fit_validator.dependent_varname):
             dep_var_array[counter] = var
 
-        for counter, var in enumerate(columns_dict['mb_indep_var_cols']):
+        for counter, var in enumerate(fit_validator.independent_varname):
             indep_var_array[counter] = var
         mb_dep_var_cols_sql = ', '.join(dep_var_array)
         mb_indep_var_cols_sql = ', '.join(indep_var_array)
     else:
 
         mb_dep_var_cols_sql = ', '.join(["dependent_var_{0}".format(i)
-                                    for i in columns_dict['mb_dep_var_cols']])
+                                    for i in fit_validator.dependent_varname])
         mb_dep_var_cols_sql = "ARRAY[{0}]".format(mb_dep_var_cols_sql)
 
         mb_indep_var_cols_sql = ', '.join(["independent_var_{0}".format(i)
-                                    for i in 
columns_dict['mb_indep_var_cols']])
+                                    for i in 
fit_validator.independent_varname])
         mb_indep_var_cols_sql = "ARRAY[{0}]".format(mb_indep_var_cols_sql)
 
-    dep_shape_cols_sql = ', '.join(columns_dict['dep_shape_cols'])
-    ind_shape_cols_sql = ', '.join(columns_dict['ind_shape_cols'])
+    dep_shape_cols_sql = ', '.join(fit_validator.dependent_shape_varname)
+    ind_shape_cols_sql = ', '.join(fit_validator.independent_shape_varname)
 
     run_training_iteration = plpy.prepare("""
         SELECT {schema_madlib}.fit_step(
@@ -295,7 +284,8 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
                 should_clear_session = is_final_iteration
 
             compute_out = compute_loss_and_metrics(schema_madlib, source_table,
-                                                   columns_dict,
+                                                   
fit_validator.dependent_varname,
+                                                   
fit_validator.independent_varname,
                                                    compile_params_to_pass,
                                                    model_arch,
                                                    serialized_weights, 
use_gpus,
@@ -314,7 +304,8 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
                 # Compute loss/accuracy for validation data.
                 val_compute_out = compute_loss_and_metrics(schema_madlib,
                                                            validation_table,
-                                                           columns_dict,
+                                                           
fit_validator.val_dependent_varname,
+                                                           
fit_validator.val_independent_varname,
                                                            
compile_params_to_pass,
                                                            model_arch,
                                                            serialized_weights,
@@ -337,9 +328,7 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
     end_training_time = datetime.datetime.now()
 
     version = madlib_version(schema_madlib)
-    class_values_type = src_summary_dict['class_values_type']
     norm_const = src_summary_dict['normalizing_const']
-    norm_const_type = src_summary_dict['norm_const_type']
     dep_vartype = src_summary_dict['dependent_vartype']
     dependent_varname = src_summary_dict['dependent_varname']
     independent_varname = src_summary_dict['independent_varname']
@@ -504,33 +493,32 @@ def get_source_summary_table_dict(source_summary_table):
 
     return source_summary
 
-def compute_loss_and_metrics(schema_madlib, table, columns_dict, 
compile_params,
+def compute_loss_and_metrics(schema_madlib, table, dependent_varname,
+                             independent_varname, compile_params,
                              model_arch, serialized_weights, use_gpus,
                              accessible_gpus_for_seg, segments_per_host,
                              dist_key_mapping, images_per_seg_val, 
metrics_list,
                              loss_list, should_clear_session, custom_fn_map,
-                             model_table=None, mst_key=None, is_train=True):
+                             model_table=None, mst_key=None):
     """
     Compute the loss and metric using a given model (serialized_weights) on the
     given dataset (table.)
     """
     start_val = time.time()
-    evaluate_result = get_loss_metric_from_keras_eval(schema_madlib,
-                                                   table,
-                                                   columns_dict,
-                                                   compile_params,
-                                                   model_arch,
-                                                   serialized_weights,
-                                                   use_gpus,
-                                                   accessible_gpus_for_seg,
-                                                   segments_per_host,
-                                                   dist_key_mapping,
-                                                   images_per_seg_val,
-                                                   should_clear_session,
-                                                   custom_fn_map,
-                                                   model_table,
-                                                   mst_key,
-                                                   is_train)
+    evaluate_result = get_loss_metric_from_keras_eval(schema_madlib, table,
+                                                      dependent_varname,
+                                                      independent_varname,
+                                                      compile_params,
+                                                      model_arch,
+                                                      serialized_weights,
+                                                      use_gpus,
+                                                      accessible_gpus_for_seg,
+                                                      segments_per_host,
+                                                      dist_key_mapping,
+                                                      images_per_seg_val,
+                                                      should_clear_session,
+                                                      custom_fn_map, 
model_table,
+                                                      mst_key)
     end_val = time.time()
     loss = evaluate_result[0]
     metric = evaluate_result[1]
@@ -882,14 +870,11 @@ def evaluate(schema_madlib, model_table, test_table, 
output_table,
     # independent_varname = model_summary_dict['independent_varname']
     # ind_shape_cols = [add_postfix(i, "_shape") for i in independent_varname]
 
-    columns_dict = {}
-    columns_dict['mb_dep_var_cols'] = model_summary_dict['dependent_varname']
-    columns_dict['mb_indep_var_cols'] = 
model_summary_dict['independent_varname']
-    columns_dict['dep_shape_cols'] = [add_postfix(i, "_shape") for i in 
columns_dict['mb_dep_var_cols']]
-    columns_dict['ind_shape_cols'] = [add_postfix(i, "_shape") for i in 
columns_dict['mb_indep_var_cols']]
+    dep_varname = model_summary_dict['dependent_varname']
+    indep_varname = model_summary_dict['independent_varname']
 
     InputValidator.validate_input_shape(
-        test_table, columns_dict['mb_indep_var_cols'], input_shape, 2, True)
+        test_table, indep_varname, input_shape, 2, True)
 
     compile_params_query = "SELECT compile_params, metrics_type, object_table 
FROM {0}".format(model_summary_table)
     res = plpy.execute(compile_params_query)[0]
@@ -902,11 +887,13 @@ def evaluate(schema_madlib, model_table, test_table, 
output_table,
         custom_fn_list = get_custom_functions_list(res['compile_params'])
         custom_function_map = query_custom_functions_map(object_table, 
custom_fn_list)
 
-    dist_key_mapping, images_per_seg = 
get_image_count_per_seg_for_minibatched_data_from_db(test_table, 
columns_dict['ind_shape_cols'][0])
+    shape_col = add_postfix(dep_varname[0], "_shape")
+    dist_key_mapping, images_per_seg = \
+        get_image_count_per_seg_for_minibatched_data_from_db(test_table, 
shape_col)
 
     loss_metric = \
         get_loss_metric_from_keras_eval(
-            schema_madlib, test_table, columns_dict, compile_params, 
model_arch,
+            schema_madlib, test_table, dep_varname, indep_varname, 
compile_params, model_arch,
             model_weights, use_gpus, accessible_gpus_for_seg, 
segments_per_host,
             dist_key_mapping, images_per_seg, 
custom_function_map=custom_function_map)
 
@@ -951,12 +938,13 @@ def validate_evaluate(module_name, model_table, 
model_summary_table, test_table,
     for i in dependent_varname:
         validate_bytea_var_for_minibatch(test_table, i)
 
-def get_loss_metric_from_keras_eval(schema_madlib, table, columns_dict, 
compile_params,
+def get_loss_metric_from_keras_eval(schema_madlib, table, dependent_varname,
+                                    independent_varname, compile_params,
                                     model_arch, serialized_weights, use_gpus,
                                     accessible_gpus_for_seg, segments_per_host,
                                     dist_key_mapping, images_per_seg,
                                     should_clear_session=True, 
custom_function_map=None,
-                                    model_table=None, mst_key=None, 
is_train=True):
+                                    model_table=None, mst_key=None):
     """
     This function will call the internal keras evaluate function to get the 
loss
     and accuracy of each tuple which then gets averaged to get the final 
result.
@@ -971,17 +959,12 @@ def get_loss_metric_from_keras_eval(schema_madlib, table, 
columns_dict, compile_
     """
     use_gpus = use_gpus if use_gpus else False
 
-    if is_train:
-        mb_dep_var_cols_sql = ', '.join(columns_dict['mb_dep_var_cols'])
-        mb_indep_var_cols_sql = ', '.join(columns_dict['mb_indep_var_cols'])
-        dep_shape_cols_sql = ', '.join(columns_dict['dep_shape_cols'])
-        ind_shape_cols_sql = ', '.join(columns_dict['ind_shape_cols'])
-    else:
-        mb_dep_var_cols_sql = ', '.join(columns_dict['val_dep_var'])
-        mb_indep_var_cols_sql = ', '.join(columns_dict['val_ind_var'])
-        dep_shape_cols_sql = ', '.join(columns_dict['val_dep_shape_cols'])
-        ind_shape_cols_sql = ', '.join(columns_dict['val_ind_shape_cols'])
-
+    mb_dep_var_cols_sql = ', '.join(dependent_varname)
+    mb_indep_var_cols_sql = ', '.join(independent_varname)
+    dep_shape_cols = [add_postfix(i, "_shape") for i in dependent_varname]
+    ind_shape_cols = [add_postfix(i, "_shape") for i in independent_varname]
+    dep_shape_cols_sql = ', '.join(dep_shape_cols)
+    ind_shape_cols_sql = ', '.join(ind_shape_cols)
 
     eval_sql = """
         select ({schema_madlib}.internal_keras_evaluate(
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
index 22b9401..aa7a2bc 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
@@ -168,15 +168,6 @@ class FitMultipleModel(object):
         self.msts = self.fit_validator_train.msts
         self.model_arch_table = self.fit_validator_train.model_arch_table
         self.object_table = self.fit_validator_train.object_table
-        self.columns_dict = {}
-        self.columns_dict['mb_dep_var_cols'] = 
self.fit_validator_train.dependent_varname
-        self.columns_dict['mb_indep_var_cols'] = 
self.fit_validator_train.independent_varname
-        self.columns_dict['dep_shape_cols'] = 
self.fit_validator_train.dep_shape_cols
-        self.columns_dict['ind_shape_cols'] = 
self.fit_validator_train.ind_shape_cols
-        self.columns_dict['val_dep_var'] = self.fit_validator_train.val_dep_var
-        self.columns_dict['val_ind_var'] = self.fit_validator_train.val_ind_var
-        self.columns_dict['val_dep_shape_cols'] = 
self.fit_validator_train.val_dep_shape_cols
-        self.columns_dict['val_ind_shape_cols'] = 
self.fit_validator_train.val_ind_shape_cols
 
         self.metrics_iters = []
         self.object_map_col = 'object_map'
@@ -188,17 +179,19 @@ class FitMultipleModel(object):
         if CUDA_VISIBLE_DEVICES_KEY in os.environ:
             self.original_cuda_env = os.environ[CUDA_VISIBLE_DEVICES_KEY]
 
+        shape_col = self.fit_validator_train.dependent_shape_varname[0]
         self.dist_key_mapping, self.images_per_seg_train = \
             get_image_count_per_seg_for_minibatched_data_from_db(
-                self.source_table, self.fit_validator_train.dep_shape_cols[0])
+                self.source_table, shape_col)
 
         if self.validation_table:
+            shape_col = self.fit_validator_train.val_dependent_shape_varname[0]
             self.valid_mst_metric_eval_time = defaultdict(list)
             self.valid_mst_loss = defaultdict(list)
             self.valid_mst_metric = defaultdict(list)
             self.dist_key_mapping_valid, self.images_per_seg_valid = \
                 get_image_count_per_seg_for_minibatched_data_from_db(
-                    self.validation_table, 
self.fit_validator_train.val_dep_shape_cols[0])
+                    self.validation_table, shape_col)
 
         self.dist_keys = query_dist_keys(self.source_table, self.dist_key_col)
         self.max_dist_key = sorted(self.dist_keys)[-1]
@@ -312,16 +305,17 @@ class FitMultipleModel(object):
     def evaluate_model(self, iter, table, is_train):
         if is_train:
             label = "training"
-        else:
-            label = "validation"
-
-        if is_train:
+            dependent_varname = self.fit_validator_train.dependent_varname
+            independent_varname = self.fit_validator_train.independent_varname
             mst_metric_eval_time = self.train_mst_metric_eval_time
             mst_loss = self.train_mst_loss
             mst_metric = self.train_mst_metric
             seg_ids = self.dist_key_mapping
             images_per_seg = self.images_per_seg_train
         else:
+            label = "validation"
+            dependent_varname = self.fit_validator_train.val_dependent_varname
+            independent_varname = 
self.fit_validator_train.val_independent_varname
             mst_metric_eval_time = self.valid_mst_metric_eval_time
             mst_loss = self.valid_mst_loss
             mst_metric = self.valid_mst_metric
@@ -333,21 +327,20 @@ class FitMultipleModel(object):
             model_arch = get_model_arch(self.model_arch_table, 
mst[self.model_id_col])
             DEBUG.start_timing('eval_compute_loss_and_metrics')
             eval_compute_time, metric, loss = compute_loss_and_metrics(
-                self.schema_madlib, table, self.columns_dict,
-                    "$madlib${0}$madlib$".format(
+                self.schema_madlib, table, dependent_varname, 
independent_varname,
+                "$madlib${0}$madlib$".format(
                     mst[self.compile_params_col]),
-                    model_arch,
-                    None,
-                    self.use_gpus,
-                    self.accessible_gpus_for_seg,
-                    self.segments_per_host,
+                model_arch,
+                None,
+                self.use_gpus,
+                self.accessible_gpus_for_seg,
+                self.segments_per_host,
                 seg_ids,
                 images_per_seg,
                 [], [], True,
                 mst[self.object_map_col],
                 self.model_output_tbl,
-                mst[self.mst_key_col],
-                    is_train)
+                mst[self.mst_key_col])
             total_eval_compute_time += eval_compute_time
             mst_metric_eval_time[mst[self.mst_key_col]] \
                 .append(self.metrics_elapsed_time_offset + (time.time() - 
self.metrics_elapsed_start_time))
@@ -683,7 +676,7 @@ class FitMultipleModel(object):
 
         class_values_colnames = [add_postfix(i, "_class_values") for i in 
self.fit_validator_train.dependent_varname]
         # class_values = src_summary_dict['class_values']
-        class_values_type =[get_expr_type(i, source_summary_table) for i in 
class_values_colnames]
+        # class_values_type =[get_expr_type(i, source_summary_table) for i in 
class_values_colnames]
         # class_values_type = src_summary_dict['class_values_type']
 
         dependent_varname = src_summary_dict['dependent_varname']
@@ -865,8 +858,8 @@ class FitMultipleModel(object):
             """.format(self=self))
 
         #TODO: Fix these to add multi io
-        dep_shape_col = self.fit_validator_train.dep_shape_cols[0]
-        ind_shape_col = self.fit_validator_train.ind_shape_cols[0]
+        dep_shape_col = self.fit_validator_train.dependent_shape_varname[0]
+        ind_shape_col = self.fit_validator_train.independent_shape_varname[0]
         dep_var_col = self.fit_validator_train.dependent_varname[0]
         indep_var_col = self.fit_validator_train.independent_varname[0]
         source_table = self.source_table
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index 439d9d9..535d70d 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
@@ -291,23 +291,24 @@ class FitCommonValidator(object):
         self.independent_varname = self.src_summary_dict['independent_varname']
         if not isinstance(self.dependent_varname, list) or \
                 not isinstance(self.independent_varname, list):
-            #TODO improve error message
-            plpy.error("Input table '{0}' has not been preprocessed properly. "
-                       "Please run input preprocessor 
again.".format(self.source_table))
-        self.dep_shape_cols = [add_postfix(i, "_shape") for i in 
self.dependent_varname]
-        self.ind_shape_cols = [add_postfix(i, "_shape") for i in 
self.independent_varname]
-
-        self.val_dep_var = None
-        self.val_ind_var = None
-        self.val_dep_shape_cols = None
-        self.val_ind_shape_cols = None
+            plpy.error("Input table '{0}' was preprocessed with "\
+                       "an older version of the input preprocessor. "
+                       "Please re-run the current version of input 
preprocessor "\
+                       "on the dataset.".format(self.source_table))
+        self.dependent_shape_varname = [add_postfix(i, "_shape") for i in 
self.dependent_varname]
+        self.independent_shape_varname = [add_postfix(i, "_shape") for i in 
self.independent_varname]
+
+        self.val_dependent_varname = None
+        self.val_independent_varname = None
+        self.val_dependent_shape_varname = None
+        self.val_independent_shape_varname = None
         if self.validation_table:
             val_summary_dict = 
self.get_source_summary_table_dict(self.validation_summary_table)
 
-            self.val_dep_var = val_summary_dict['dependent_varname']
-            self.val_ind_var = val_summary_dict['independent_varname']
-            self.val_dep_shape_cols = [add_postfix(i, "_shape") for i in 
self.val_dep_var]
-            self.val_ind_shape_cols = [add_postfix(i, "_shape") for i in 
self.val_ind_var]
+            self.val_dependent_varname = val_summary_dict['dependent_varname']
+            self.val_independent_varname = 
val_summary_dict['independent_varname']
+            self.val_dependent_shape_varname = [add_postfix(i, "_shape") for i 
in self.val_dependent_varname]
+            self.val_independent_shape_varname = [add_postfix(i, "_shape") for 
i in self.val_independent_varname]
 
         self._validate_tables_schema()
         if use_gpus:
@@ -340,22 +341,22 @@ class FitCommonValidator(object):
             additional_cols.append(DISTRIBUTION_KEY_COLNAME)
 
         self._validate_columns_in_preprocessed_table(self.source_table,
-                                                    self.independent_varname +
-                                                    self.dependent_varname +
-                                                    self.ind_shape_cols +
-                                                    self.dep_shape_cols +
-                                                    additional_cols)
+                                                     self.independent_varname +
+                                                     self.dependent_varname +
+                                                     
self.independent_shape_varname +
+                                                     
self.dependent_shape_varname +
+                                                     additional_cols)
         for i in self.dependent_varname:
             validate_bytea_var_for_minibatch(self.source_table, i)
 
         if self.validation_table and self.validation_table.strip() != '':
             self._validate_columns_in_preprocessed_table(self.validation_table,
-                                                        self.val_ind_var +
-                                                        self.val_dep_var +
-                                                        
self.val_ind_shape_cols +
-                                                        
self.val_dep_shape_cols+
-                                                        additional_cols)
-            for i in self.val_dep_var:
+                                                         
self.val_independent_varname +
+                                                         
self.val_dependent_varname +
+                                                         
self.val_independent_shape_varname +
+                                                         
self.val_dependent_shape_varname +
+                                                         additional_cols)
+            for i in self.val_dependent_varname:
                 validate_bytea_var_for_minibatch(self.validation_table, i)
 
         cols_in_tbl_valid(self.source_summary_table,
@@ -397,7 +398,7 @@ class FitCommonValidator(object):
             self._validate_input_table(self.validation_table, True)
             validation_summary_table = add_postfix(self.validation_table, 
"_summary")
             input_tbl_valid(validation_summary_table, self.module_name)
-            for i in self.val_dep_var:
+            for i in self.val_dependent_varname:
                 dependent_vartype = get_expr_type(i,
                                                   self.validation_table)
                 _assert(dependent_vartype == 'bytea',
@@ -411,7 +412,7 @@ class FitCommonValidator(object):
                                input_shape, 2, True)
         if self.validation_table:
             InputValidator.validate_input_shape(
-                self.validation_table,  self.val_ind_var,
+                self.validation_table,  self.val_independent_varname,
                 input_shape, 2, True)
 
 
diff --git 
a/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit.sql_in
index eaa6916..74aff3c 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit.sql_in
@@ -514,3 +514,27 @@ SELECT madlib_keras_fit(
        FALSE
 );
 SELECT assert(sum(get_gd_keys_len()) = 0, 'GD was not cleared properly!') 
m4_ifdef(<!__POSTGRESQL__!>, <!!>, <! FROM gp_dist_random('gp_id') !>);
+
+--- Test when source table and validation table have different column names
+DROP TABLE IF EXISTS iris_data_2;
+CREATE TABLE iris_data_2 as SELECT id, attributes as val_attributes, 
class_text as val_class_text FROM iris_data;
+DROP TABLE IF EXISTS iris_data_val_packed_2, iris_data_val_packed_2_summary;
+SELECT validation_preprocessor_dl('iris_data_2',    -- Source table
+                                'iris_data_val_packed_2',  -- Output table
+                                'val_class_text',     -- Dependent variable
+                                'val_attributes',     -- Independent variable
+                                'iris_data_packed'    -- Training preprocessed 
table
+                                );
+
+DROP TABLE if exists iris_model, iris_model_summary;
+SELECT madlib_keras_fit(
+       'iris_data_packed',
+       'iris_model',
+       'iris_model_arch',
+       1,
+       $$loss='categorical_crossentropy', optimizer='Adam(lr=0.01)', 
metrics=['accuracy']$$,
+  $$batch_size=16, epochs=1$$,
+       3,
+       FALSE,
+    'iris_data_val_packed_2'
+);
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index 5ef4517..164d743 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -906,7 +906,6 @@ class MadlibKerasPredictBYOMTestCase(unittest.TestCase):
                                      self.dependent_count)
         self.assertIn('invalid_pred_type', str(error.exception))
 
-        # The validation for this test has been disabled
         with self.assertRaises(plpy.PLPYException) as error:
             self.module.PredictBYOM('schema_madlib', 'model_arch_table',
                                      'model_id', 'test_table', 'id_col',
@@ -1314,36 +1313,33 @@ class 
MadlibKerasFitCommonValidatorTestCase(unittest.TestCase):
         self.assertEqual(False, obj._is_valid_metrics_compute_frequency())
 
     def test_validator_dep_indep_type_not_array(self):
+        expected_error_regex = "test_table.*preprocessed.*older version.*input 
preprocessor.*"
         # only dep is not array
         self.subject.FitCommonValidator.get_source_summary_table_dict = \
             Mock(return_value={'dependent_varname':'a',
                                'independent_varname':['b']})
-        with self.assertRaises(plpy.PLPYException) as error:
+        with self.assertRaisesRegexp(plpy.PLPYException, expected_error_regex):
             self.subject.FitCommonValidator(
                 'test_table', 'val_table', 'model_table', 5, None, False, 
False, [0],
                 'module_name', None)
-        self.assertIn('not been preprocessed properly', str(error.exception))
 
         # only indep is not array
         self.subject.FitCommonValidator.get_source_summary_table_dict = \
             Mock(return_value={'dependent_varname':['a'],
                                'independent_varname':'b'})
-        with self.assertRaises(plpy.PLPYException) as error:
+        with self.assertRaisesRegexp(plpy.PLPYException, expected_error_regex):
             self.subject.FitCommonValidator(
                 'test_table', 'val_table', 'model_table', 5, None, False, 
False, [0],
                 'module_name', None)
-        self.assertIn('not been preprocessed properly', str(error.exception))
 
         # both indep and dep are not arrays
         self.subject.FitCommonValidator.get_source_summary_table_dict = \
             Mock(return_value={'dependent_varname':'a',
                                'independent_varname':'b'})
-        with self.assertRaises(plpy.PLPYException) as error:
+        with self.assertRaisesRegexp(plpy.PLPYException, expected_error_regex):
             self.subject.FitCommonValidator(
                 'test_table', 'val_table', 'model_table', 5, None, False, 
False, [0],
                 'module_name', None)
-        self.assertIn('not been preprocessed properly', str(error.exception))
-
 
 class InputValidatorTestCase(unittest.TestCase):
     def setUp(self):

[madlib] 04/04: DL: Cleanup fit and fit_multiple

Reply via email to