orhankislal commented on a change in pull request #518:
URL: https://github.com/apache/madlib/pull/518#discussion_r493747805
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -500,63 +648,432 @@ class KerasAutoML():
"b (key integer, s_val integer, i_val integer) WHERE
t.mst_key=b.key".format(self=self, l=l)
plpy.execute(query)
- def update_model_selection_table(self):
+# @MinWarning("warning")
+class AutoMLHyperopt(KerasAutoML):
+ """
+ This class implements Hyperopt, another automl method that explores
awkward search spaces using
+ Random Search, Tree-structured Parzen Estimator (TPE), or Adaptive TPE.
+
+ This function executes hyperopt on top of our multiple model training
infrastructure powered with
+ Model hOpper Parallelism (MOP), a hybrid of data and task parallelism.
+
+ This automl method inherits qualities from the automl class.
+ """
+ def __init__(self, schema_madlib, source_table, model_output_table,
model_arch_table, model_selection_table,
+ model_id_list, compile_params_grid, fit_params_grid,
automl_method='hyperopt',
+ automl_params='num_models=20, num_iters=5, algorithm=tpe',
random_state=None, object_table=None,
+ use_gpus=False, validation_table=None,
metrics_compute_frequency=None,
+ name=None, description=None, **kwargs):
+ KerasAutoML.__init__(self, schema_madlib, source_table,
model_output_table, model_arch_table,
+ model_selection_table, model_id_list,
compile_params_grid, fit_params_grid,
+ automl_method, automl_params, random_state,
object_table, use_gpus,
+ validation_table, metrics_compute_frequency,
name, description, **kwargs)
+ self.compile_params_grid = self.compile_params_grid.replace('\n',
'').replace(' ', '')
+ self.fit_params_grid = self.fit_params_grid.replace('\n',
'').replace(' ', '')
+ try:
+ self.compile_params_grid = literal_eval(self.compile_params_grid)
+
+ except:
+ plpy.error("Invalid syntax in 'compile_params_dict'")
+ try:
+ self.fit_params_grid = literal_eval(self.fit_params_grid)
+ except:
+ plpy.error("Invalid syntax in 'fit_params_dict'")
+ self.validate_and_define_inputs()
+
+ self.num_workers = get_seg_number() * get_segments_per_host()
+
+ self.create_model_output_table()
+ self.create_model_output_info_table()
+ self.find_hyperopt_config()
+
+ def validate_and_define_inputs(self):
+ automl_params_dict = extract_keyvalue_params(self.automl_params,
+
default_values={'num_models': 20,
+
'num_iters': 5,
+
'algorithm': 'tpe'},
+ lower_case_names=True)
+ # casting relevant values to int
+ for i in automl_params_dict:
+ try:
+ automl_params_dict[i] = int(automl_params_dict[i])
+ except ValueError:
+ pass
+ _assert(len(automl_params_dict) >= 1 and len(automl_params_dict) <= 3,
+ "DL: Only num_models, num_iters, and algorithm may be
specified")
+ for i in automl_params_dict:
+ if i == AutoMLSchema.NUM_MODELS:
+ self.num_models = automl_params_dict[AutoMLSchema.NUM_MODELS]
+ elif i == AutoMLSchema.NUM_ITERS:
+ self.num_iters = automl_params_dict[AutoMLSchema.NUM_ITERS]
+ elif i == AutoMLSchema.ALGORITHM:
+ if automl_params_dict[AutoMLSchema.ALGORITHM].lower() ==
'rand':
+ self.algorithm = rand
+ elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() ==
'tpe':
+ self.algorithm = tpe
+ # elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() ==
'atpe':
+ # self.algorithm = atpe
+ # uncomment the above lines after atpe works
+ else:
+ plpy.error("DL: algorithm in 'automl_params' must be
'rand', 'tpe'") # , or 'atpe'
+ else:
+ plpy.error("DL: {0} is an invalid automl param".format(i))
+ _assert(self.num_models > 0 and self.num_iters > 0, "DL: num_models
and num_iters in 'automl_params' "
+ "must be positive")
+ _assert(self._is_valid_metrics_compute_frequency(self.num_iters), "DL:
'metrics_compute_frequency' "
+ "out
of iteration range")
+
+ def find_hyperopt_config(self):
"""
- Drops and re-create the mst table to only include the best performing
model configuration.
+ Executes hyperopt on top of MOP.
"""
- drop_tables([self.model_selection_table])
- # only retaining best performing config
- plpy.execute("CREATE TABLE {self.model_selection_table} AS SELECT
mst_key, model_id, compile_params, " \
- "fit_params FROM {self.model_info_table} " \
- "ORDER BY {AutoMLSchema.LOSS_METRIC} LIMIT
1".format(self=self, AutoMLSchema=AutoMLSchema))
+ make_mst_summary = True
+ trials = Trials()
+ domain = Domain(None, self.get_search_space())
+ rand_state = np.random.RandomState(self.random_state)
+ configs_lst = self.get_configs_list()
- def generate_model_output_summary_table(self, model_training):
+ self.start_training_time = self.get_current_timestamp()
+ for low, high in configs_lst:
+ i, n = low, high - low + 1
+
+ # Using HyperOpt TPE/ATPE to generate parameters
+ hyperopt_params = []
+ sampled_params = []
+ for j in range(i, i + n):
+ new_param = self.algorithm.suggest([j], domain, trials,
rand_state.randint(0, 2 ** 31 - 1))
+ new_param[0]['status'] = STATUS_RUNNING
+
+ trials.insert_trial_docs(new_param)
+ trials.refresh()
+ hyperopt_params.append(new_param[0])
+ sampled_params.append(new_param[0]['misc']['vals'])
+
+ model_id_list, compile_params, fit_params =
self.extract_param_vals(sampled_params)
+ msts_list = self.generate_msts(model_id_list, compile_params,
fit_params)
+ try:
+ self.remove_temp_tables(model_training)
+ except:
+ pass
+ self.populate_temp_mst_table(i, msts_list)
+
+ plpy.info("***Evaluating {n} newly suggested model
configurations***".format(n=n))
+ model_training = FitMultipleModel(self.schema_madlib,
self.source_table, AutoMLSchema.TEMP_OUTPUT_TABLE,
+ AutoMLSchema.TEMP_MST_TABLE, self.num_iters,
self.use_gpus, self.validation_table,
+ self.metrics_compute_frequency, False, self.name,
self.description)
+ if make_mst_summary:
+
self.generate_mst_summary_table(self.model_selection_summary_table)
+ make_mst_summary = False
+
+ # HyperOpt TPE update
+ for k, hyperopt_param in enumerate(hyperopt_params, i):
+ loss_val = plpy.execute("SELECT {AutoMLSchema.LOSS_METRIC}
FROM {model_training.model_info_table} " \
+ "WHERE
{ModelSelectionSchema.MST_KEY}={k}".format(AutoMLSchema=AutoMLSchema,
+
ModelSelectionSchema=ModelSelectionSchema,
+
**locals()))[0][AutoMLSchema.LOSS_METRIC]
+
+ hyperopt_param['status'] = STATUS_OK
+ hyperopt_param['result'] = {'loss': loss_val, 'status':
STATUS_OK}
+ trials.refresh()
+
+ # stacks info of all model configs together
+ self.update_model_output_and_info_tables(model_training)
+
+ self.print_best_so_far()
+
+ self.end_training_time = self.get_current_timestamp()
+ self.update_model_selection_table()
+ self.generate_model_output_summary_table(model_training)
+ self.remove_temp_tables(model_training)
+
+ def get_configs_list(self):
"""
- Creates and populates static values related to the AutoML workload.
- :param model_training: Fit Multiple function call object.
+ Gets schedule to evaluate model configs
+ :return: Model configs evaluation schedule
"""
- create_query = plpy.prepare("""
- CREATE TABLE {self.model_summary_table} AS
- SELECT
- $MAD${self.source_table}$MAD$::TEXT AS source_table,
- $MAD${self.validation_table}$MAD$::TEXT AS
validation_table,
- $MAD${self.model_output_table}$MAD$::TEXT AS model,
- $MAD${self.model_info_table}$MAD$::TEXT AS model_info,
- (SELECT dependent_varname FROM
{model_training.model_summary_table})
- AS dependent_varname,
- (SELECT independent_varname FROM
{model_training.model_summary_table})
- AS independent_varname,
- $MAD${self.model_arch_table}$MAD$::TEXT AS
model_arch_table,
- $MAD${self.model_selection_table}$MAD$::TEXT AS
model_selection_table,
- $MAD${self.automl_method}$MAD$::TEXT AS automl_method,
- $MAD${self.automl_params}$MAD$::TEXT AS automl_params,
- $MAD${self.random_state}$MAD$::TEXT AS random_state,
- $MAD${self.object_table}$MAD$::TEXT AS object_table,
- {self.use_gpus} AS use_gpus,
- (SELECT metrics_compute_frequency FROM
{model_training.model_summary_table})::INTEGER
- AS metrics_compute_frequency,
- $MAD${self.name}$MAD$::TEXT AS name,
- $MAD${self.description}$MAD$::TEXT AS description,
- '{self.start_training_time}'::TIMESTAMP AS
start_training_time,
- '{self.end_training_time}'::TIMESTAMP AS end_training_time,
- (SELECT madlib_version FROM
{model_training.model_summary_table}) AS madlib_version,
- (SELECT num_classes FROM
{model_training.model_summary_table})::INTEGER AS num_classes,
- (SELECT class_values FROM
{model_training.model_summary_table}) AS class_values,
- (SELECT dependent_vartype FROM
{model_training.model_summary_table})
- AS dependent_vartype,
- (SELECT normalizing_const FROM
{model_training.model_summary_table})
- AS normalizing_const
- """.format(self=self, model_training=model_training))
+ prop = self.num_models // self.num_workers
+ if prop == 0:
+ return [(1, self.num_models)]
+ lst = [self.num_workers for _ in range(prop)]
- with MinWarning('warning'):
- plpy.execute(create_query)
+ # balancing remaining models in initial rounds
+ for i in range(self.num_models - prop * self.num_workers):
+ lst[i%len(lst)] += 1
- def remove_temp_tables(self, model_training):
+ return [(sum(lst[:e])+1, sum(lst[:e])+lst[e]) if e != 0 else (1,
lst[e]) for e in range(len(lst))]
+
+ def get_search_space(self):
"""
- Remove all intermediate tables created for AutoML runs/updates.
- :param model_training: Fit Multiple function call object.
+ Converts user inputs to hyperopt search space.
+ :return: Hyperopt search space
"""
- drop_tables([model_training.original_model_output_table,
model_training.model_info_table,
- model_training.model_summary_table,
AutoMLSchema.TEMP_MST_TABLE,
- AutoMLSchema.TEMP_MST_SUMMARY_TABLE])
+
+ # initial params (outside 'optimizer_params_list')
+ hyperopt_search_dict = {}
+ hyperopt_search_dict['model_id'] = self.get_hyperopt_exps('model_id',
self.model_id_list)
+
+
+ for j in self.fit_params_grid:
+ hyperopt_search_dict[j] = self.get_hyperopt_exps(j,
self.fit_params_grid[j])
+
+ for i in self.compile_params_grid:
+ if i == ModelSelectionSchema.OPTIMIZER_PARAMS_LIST:
Review comment:
Why not invert the condition, so the `continue` can be removed?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]