[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user asfgit closed the pull request at: https://github.com/apache/madlib/pull/243 ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r176262391 --- Diff: src/ports/postgres/modules/convex/test/mlp.sql_in --- @@ -340,6 +181,51 @@ INSERT INTO iris_data VALUES (149,ARRAY[6.2,3.4,5.4,2.3],'Iris-virginica',3,2), (150,ARRAY[5.9,3.0,5.1,1.8],'Iris-virginica',3,2); +-- NOTE that the batch specific tables were created using: +-- madlib.minibatch_preprocessor(), with the regular source tables used in +-- this file. + +-- Create preprocessed data that can be used with minibatch MLP: +DROP TABLE IF EXISTS iris_data_batch, iris_data_batch_summary, iris_data_batch_standardization; +CREATE TABLE iris_data_batch( +__id__ integer, +dependent_varname double precision[], +independent_varname double precision[] +); +COPY iris_data_batch (__id__, dependent_varname, independent_varname) FROM STDIN NULL '?' DELIMITER '|'; +0 | {{0,1,0},{0,1,0},{0,0,1},{1,0,0},{0,1,0},{0,1,0},{0,0,1},{1,0,0},{1,0,0},{0,1,0},{1,0,0},{0,0,1},{0,0,1},{0,0,1},{1,0,0},{0,0,1},{0,0,1},{1,0,0},{1,0,0},{0,0,1},{0,1,0},{0,0,1},{0,0,1},{0,0,1},{0,0,1},{1,0,0},{0,1,0},{0,0,1},{0,0,1},{1,0,0}} | {{0.828881825720994,-0.314980522532101,0.363710790466334,0.159758615207397},{-1.08079689039279,-1.57669227467446,-0.229158821743702,-0.240110581430527},{-1.08079689039279,-1.32434992424599,0.482284712908341,0.692917544057962},{-1.46273263361555,0.442046528753317,-1.35561108494277,-1.30642843913166},{-0.0623015751321059,-0.567322872960574,0.245136868024327,0.159758615207397},{-0.189613489539692,-0.819665223389045,0.304423829245331,0.159758615207397},{0.701569911313408,-1.32434992424599,0.778719519013359,0.959497008483245},{-1.20810880480038,-0.0626381721036282,-1.35561108494277,-1.4397181713443},{-0.698861147170034,0.946731229610261,-1.35561108494277,-1.30642843913166},{-0.82617306157762,-1.32434992424599,-0.407019705406713,-0.106820849 217886},{-0.698861147170034,2.71312768260957,-1.29632412372177,-1.4397181713443},{1.33812948335134,0.442046528753317,1.31230217000239,1.49265593733381},{0.319634168090651,-0.0626381721036282,0.660145596571352,0.826207276270604},{0.701569911313408,-1.32434992424599,0.778719519013359,0.959497008483245},{-0.698861147170034,1.19907358003873,-1.29632412372177,-1.30642843913166},{1.46544139775892,0.189704178324845,0.838006480234363,1.49265593733381},{1.21081756894375,-0.0626381721036282,0.897293441455367,1.49265593733381},{-0.444237318354863,1.70375828089568,-1.29632412372177,-1.30642843913166},{-0.82617306157762,1.95610063132415,-1.05917627883775,-1.03984897470638},{0.828881825720994,-0.819665223389045,0.95658040267637,0.959497008483245},{0.956193740128579,-0.567322872960574,0.541571674129345,0.42633807963268},{1.33812948335134,0.442046528753317,1.31230217000239,1.49265593733381},{0.574257996905822,0.946731229610261,1.01586736389737,1.49265593733381},{0.0650103392754793,-0.81966522338904 5,0.838006480234363,0.959497008483245},{0.0650103392754793,-0.819665223389045,0.838006480234363,0.959497008483245},{-1.46273263361555,0.442046528753317,-1.35561108494277,-1.30642843913166},{0.574257996905822,-2.08137697553141,0.482284712908341,0.42633807963268},{1.21081756894375,0.189704178324845,1.13444128633938,1.62594566954645},{1.97468905538926,-0.314980522532101,1.54945001488641,0.826207276270604},{-1.08079689039279,0.189704178324845,-1.29632412372177,-1.4397181713443}} +1 | 
{{0,1,0},{1,0,0},{0,1,0},{1,0,0},{1,0,0},{1,0,0},{1,0,0},{0,1,0},{0,0,1},{0,0,1},{1,0,0},{0,0,1},{1,0,0},{0,0,1},{0,1,0},{0,1,0},{0,1,0},{1,0,0},{1,0,0},{0,0,1},{0,1,0},{0,1,0},{0,0,1},{1,0,0},{1,0,0},{0,1,0},{1,0,0},{0,0,1},{0,1,0},{0,1,0}} | {{-0.0623015751321059,-0.0626381721036282,0.304423829245331,0.0264688829947554},{-0.316925403947277,2.96547003303804,-1.35561108494277,-1.30642843913166},{0.319634168090651,-0.819665223389045,0.838006480234363,0.559627811845321},{-0.953484975985206,1.19907358003873,-1.41489804616377,-1.17313870691902},{-0.953484975985206,0.442046528753317,-1.47418500738478,-1.30642843913166},{-1.33542071920796,0.442046528753317,-1.41489804616377,-1.30642843913166},{-1.71735646243072,-0.0626381721036282,-1.41489804616377,-1.30642843913166},{0.446946082498236,-0.0626381721036282,0.541571674129345,0.293048347420038},{1.21081756894375,-1.32434992424599,1.25301520878139,0.826207276270604},{0.701569911313408,0.694388879181789,1.3715891312234,1.75923540175909 },{-1.84466837683831,-0.0626381721036282,-1.53347196860578,-1.4397181713443},{1.84737714098168,1.45141593046721,1.430876092,1.75923540175909},{-0.82617306157762,1.19907358003873,-1.35561108494277,-1.30642843913166},{0.701569911313408,-0.314980522532101,1.13444128633938,0.826207276270604},{1.33812948335134,-0.567322872960574,0.660145596571352,0.293048347420038},{0.192322253683066,-0.0626381721036282,0.304423829245331,0.42633807963268},{-0.189613489539692,-0.819665223389045,0.304423829245331,0.159758615207397},{-1
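For context on the data above: each row of iris_data_batch is one buffer produced by the preprocessor, with the one-hot encoded labels packed into a 2-D dependent_varname array and the standardized features into a 2-D independent_varname array. A rough Python sketch of that packing, for illustration only (the helper name and the buffer size are assumptions, not madlib.minibatch_preprocessor()'s actual implementation):

```
# Illustrative packing of standardized rows into minibatch buffers.
# Assumes `features` are already standardized and `labels` are 0-based class indices.
import numpy as np

def pack_into_buffers(features, labels, n_classes, buffer_size=30):
    """Return (dependent 2-D array, independent 2-D array) pairs, one per buffer."""
    buffers = []
    for start in range(0, len(features), buffer_size):
        x_chunk = np.asarray(features[start:start + buffer_size], dtype=float)
        y_chunk = np.asarray(labels[start:start + buffer_size], dtype=int)
        # one-hot encode the dependent variable, matching rows like {0,1,0} above
        y_onehot = np.zeros((len(y_chunk), n_classes))
        y_onehot[np.arange(len(y_chunk)), y_chunk] = 1.0
        buffers.append((y_onehot, x_chunk))
    return buffers
```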
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r176218740 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; --- End diff -- Tried it, it was cleaner this way. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175949965 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] + for _ in range(n_tries): +prev_state = None if not warm_start: coeff = [] -for i in range(len(layer_sizes) - 1): -fan_in = layer_sizes[i] -fan_out = layer_sizes[i + 1] +for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]): # Initalize according to Glorot and Bengio (2010) # See design doc for more info span = math.sqrt(6.0 / (fan_in + fan_out)) -dim = (layer_sizes[i] + 1) * layer_sizes[i + 1] -rand = plpy.execute("""SELECT array_agg({span}*2*(random()-0.5)) - AS random - FROM generate_series(0,{dim}) -""".format(span=span, dim=dim))[0]["random"] +dim = (fan_in + 1) * fan_out +rand = [span * (random() - 0.5) for _ in range(dim)] --- End diff -- It's supposed to be explained in the design doc, as per the comment. I think these formulae are taken from the Glorot and Bengio (2010) paper cited in the comment. ---
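For readers without the design doc at hand, a minimal sketch of the initialization the diff refers to, with standalone names assumed here: Glorot and Bengio (2010) draw each coefficient uniformly from [-span, span] with span = sqrt(6 / (fan_in + fan_out)). Subtracting 0.5 from random() centers the uniform draw at zero, and the factor of 2 in the replaced SQL expression stretches it to the full [-span, span] interval.

```
# Sketch of Glorot/Bengio (2010) uniform initialization (not the MADlib code).
import math
from random import random

def glorot_uniform_coeff(layer_sizes):
    coeff = []
    for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]):
        span = math.sqrt(6.0 / (fan_in + fan_out))
        dim = (fan_in + 1) * fan_out        # +1 for the bias unit of each layer
        # 2 * span * (random() - 0.5) is uniform on [-span, span]
        coeff.extend(2.0 * span * (random() - 0.5) for _ in range(dim))
    return coeff

# e.g. a 4-input network with one 5-unit hidden layer and 3 outputs:
# glorot_uniform_coeff([4, 5, 3])
```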
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175950079 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, +array_type="integer") + _create_summary_table(locals()) -_create_standardization_table(standardization_table, x_mean_table, - warm_start) +if is_minibatch_enabled: +# We already have the mean and std in the input standardization table +input_std_table = add_postfix(source_table, '_standardization') +_create_standardization_table(standardization_table, input_std_table, + warm_start) +else: +_create_standardization_table(standardization_table, x_mean_table, + warm_start) +# The original input table is the tab_data_scaled for mini batch. +# Do NOT drop this, it will end up dropping the original data table. +plpy.execute("DROP TABLE IF EXISTS {0}".format(tbl_data_scaled)) +plpy.execute("DROP TABLE IF EXISTS {0}".format(x_mean_table)) --- End diff -- Yes. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175947887 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175947857 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
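The igd.hpp hunk quoted above is truncated by the archive; per the PR description, the change replaces the fixed sequential sweep with a batch order that is reshuffled on every epoch. A hedged Python sketch of that control flow (function and parameter names here are assumptions, not the C++ in igd.hpp):

```
# Process one packed buffer as mini-batches, visiting the batches in a
# random order on each epoch. `get_loss_and_update` stands in for
# Task::getLossAndUpdateModel.
import random

def run_epochs(X, y, model, stepsize, batch_size, n_epochs, get_loss_and_update):
    batch_starts = list(range(0, len(X), batch_size))
    for _ in range(n_epochs):
        random.shuffle(batch_starts)            # randomize batch order per epoch
        epoch_loss = 0.0
        for start in batch_starts:
            X_batch = X[start:start + batch_size]   # last batch may be shorter
            y_batch = y[start:start + batch_size]
            epoch_loss += get_loss_and_update(model, X_batch, y_batch, stepsize)
    return model
```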
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175950139 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -491,10 +571,28 @@ def _update_temp_model_table(args, iteration, temp_output_table, first_try): ) rel_state_subq {join_clause} """.format(insert_or_create_str=insert_or_create_str, - iteration=iteration, join_clause=join_clause, **args) + iteration=iteration, join_clause=join_clause, + internal_result_udf=internal_result_udf, **args) plpy.execute(model_table_query) +def _get_loss(schema_madlib, state, is_mini_batch): --- End diff -- Yes, must remove it. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175948252 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); --- End diff -- We probably could, but there are a couple of extra arguments that only minibatch gets, and not IGD (batch_size and n_epochs). ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175949682 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] --- End diff -- Yes, this looks like duplicated code. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175948750 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); +size_t i, k; +double total_loss = 0.; + +// gradient added over the batch +std::vector total_gradient_per_layer(N); +for (k=0; k < N; ++k) +total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), + model.u[k].cols()); + +for (i=0; i < n; i++){ +ColumnVector x = x_batch.row(i); +ColumnVector y_true = y_true_batch.row(i); + +std::vector net, o, delta; +feedForward(model, x, net, o); --- End diff -- We will have to change the design docs too for that. Apparently, the notation used here is supposed to be in sync with the design doc. ---
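For readers who want the shape of getLossAndUpdateModel() without the Eigen types, a rough Python rendering of the accumulation pattern under discussion: a forward and backward pass per row, per-layer gradients summed over the batch, then one weight update per batch. The feed_forward, back_propagate and loss_fn callables, and the averaging in the update step, are assumptions standing in for the C++ helpers rather than their exact behavior:

```
import numpy as np

def get_loss_and_update_model(model, x_batch, y_batch, stepsize,
                              feed_forward, back_propagate, loss_fn):
    """model: list of per-layer weight matrices; returns the summed batch loss."""
    total_gradient = [np.zeros_like(u) for u in model]   # one accumulator per layer
    total_loss = 0.0
    for x, y_true in zip(x_batch, y_batch):
        net, o = feed_forward(model, x)          # o[k]: output of layer k
        delta = back_propagate(y_true, o[-1], net, model)
        for k in range(len(model)):
            total_gradient[k] += np.outer(o[k], delta[k])   # o[k] * delta[k]^T
        total_loss += loss_fn(y_true, o[-1])
    for k in range(len(model)):                  # single update per batch
        model[k] -= stepsize * total_gradient[k] / len(x_batch)
    return total_loss
```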
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175888098 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -590,51 +664,103 @@ def _validate_warm_start(output_table, summary_table, standardization_table, output_table + ". Invalid number of coefficients in model.") return coeff +def _validate_dependent_var(source_table, dependent_varname, +is_classification, is_minibatch_enabled): +expr_type = get_expr_type(dependent_varname, source_table) +int_types = ['integer', 'smallint', 'bigint'] +text_types = ['text', 'varchar', 'character varying', 'char', 'character'] +boolean_types = ['boolean'] +float_types = ['double precision', 'real'] +classification_types = int_types + boolean_types + text_types +regression_types = int_types + float_types +validate_type = classification_types if is_classification else regression_types + +if is_minibatch_enabled: +# With pre-processed data, dep type is always an array +_assert("[]" in expr_type, +"Dependent variable column should refer to an array.") +# The dependent variable is always a double precision array in +# preprocessed data (so use regression_types) +# strip out '[]' from expr_type +_assert(expr_type[:-2] in regression_types, --- End diff -- There are other numeric types like `decimal`, `numeric` etc. That makes me think if we really need this assert ? Same for the regression case for igd at line 696 And if we really want to assert this, consider using the function `is_psql_numeric_type` in `utilities_py.in` ---
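A sketch of the element-type check being debated, widened to the decimal/numeric types the comment mentions (illustration only; the type set and function below are assumptions, not MADlib's is_psql_numeric_type):

```
# Minibatch input stores the dependent variable as a numeric array, so only
# the array's element type needs checking.
NUMERIC_TYPES = {'integer', 'smallint', 'bigint', 'double precision', 'real',
                 'decimal', 'numeric'}

def validate_minibatch_dependent_type(expr_type):
    if not expr_type.endswith('[]'):
        raise ValueError("Dependent variable column should refer to an array.")
    element_type = expr_type[:-2].strip()
    if element_type not in NUMERIC_TYPES:
        raise ValueError("Dependent variable element type must be numeric, "
                         "got '{0}'.".format(element_type))
```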
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175627412 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); --- End diff -- is it possible to reuse the code that gets the values from the args parameter ? I noticed that the igd transition function `mlp_igd_transition ` has the exact same code. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175872591 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); --- End diff -- is there a reason we chose N and n as variable names ? Can we use more descriptive names ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175923655 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -33,11 +34,12 @@ from convex.utils_regularization import __utils_normalize_data_grouping from utilities.in_mem_group_control import GroupIterationController from utilities.utilities import _array_to_string +from utilities.utilities import add_postfix +from utilities.utilities import py_list_to_sql_string as PY2SQL +from utilities.utilities import extract_keyvalue_params from utilities.utilities import _assert from utilities.utilities import _assert_equal from utilities.utilities import _string_to_array_with_quotes -from utilities.utilities import add_postfix -from utilities.utilities import extract_keyvalue_params from utilities.utilities import py_list_to_sql_string --- End diff -- we don't need this import anymore ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175894372 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -1457,3 +1660,85 @@ def mlp_predict_help(schema_madlib, message): return """ No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. """.format(**args) + + +def check_if_minibatch_enabled(source_table, independent_varname): +""" +Function to validate if the source_table is converted to a format that +can be used for mini-batching. It checks for the dimensionalities of +the independent variable to determine the same. +""" +query = """ +SELECT array_upper({0}, 1) AS n_x, + array_upper({0}, 2) AS n_y, + array_upper({0}, 3) AS n_z +FROM {1} +LIMIT 1 +""".format(independent_varname, source_table) +result = plpy.execute(query) + +if not result: +plpy.error("MLP: Input table could be empty.") + +has_x_dim, has_y_dim, has_z_dim = [bool(result[0][i]) + for i in ('n_x', 'n_y', 'n_z')] +if not has_x_dim: +plpy.error("MLP: {0} is empty.".format(independent_varname)) + +# error out if >2d matrix +if has_z_dim: +plpy.error("MLP: Input table is not in the right format.") +return has_y_dim + + +class MLPPreProcessor: +""" +This class consumes and validates the pre-processed source table used for +MLP mini-batch. This also populates values from the pre-processed summary +table which is used by MLP mini-batch + +""" +# summary table columns names +DEPENDENT_VARNAME = "dependent_varname" +INDEPENDENT_VARNAME = "independent_varname" +GROUPING_COL = "grouping_cols" +CLASS_VALUES = "class_values" +MODEL_TYPE_CLASSIFICATION = "classification" +MODEL_TYPE_REGRESSION = "regression" + +def __init__(self, source_table): +self.source_table = source_table +self.preprocessed_summary_dict = None +self.summary_table = add_postfix(self.source_table, "_summary") +self.std_table = add_postfix(self.source_table, "_standardization") + +self._validate_and_set_preprocessed_summary() + +def _validate_and_set_preprocessed_summary(self): +input_tbl_valid(self.source_table, 'MLP') + +if not table_exists(self.summary_table) or not table_exists(self.std_table): +plpy.error("Tables {0} and/or {1} do not exist. These tables are" + " needed for using minibatch during training.".format( + self.summary_table, + self.std_table)) + +query = "SELECT * FROM {0}".format(self.summary_table) +summary_table_columns = plpy.execute(query) +if not summary_table_columns or len(summary_table_columns) == 0: +plpy.error("No columns in table {0}.".format(self.summary_table)) +else: +summary_table_columns = summary_table_columns[0] + +required_columns = (self.DEPENDENT_VARNAME, self.INDEPENDENT_VARNAME, +self.CLASS_VALUES) +if set(required_columns) <= set(summary_table_columns): +self.preprocessed_summary_dict = summary_table_columns +else: +plpy.error("Expected columns ({0}, {1} and/or {2}) not present in" --- End diff -- We can use the `required_columns` to format the error message so that we don't have to repeat the column names. Something like ``` plpy.error("One or more of the expected columns {0} not present in {1}".format(required_columns, self.summary_table)) ``` ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175893520 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -1457,3 +1660,85 @@ def mlp_predict_help(schema_madlib, message): return """ No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. """.format(**args) + + +def check_if_minibatch_enabled(source_table, independent_varname): +""" +Function to validate if the source_table is converted to a format that +can be used for mini-batching. It checks for the dimensionalities of +the independent variable to determine the same. +""" +query = """ +SELECT array_upper({0}, 1) AS n_x, + array_upper({0}, 2) AS n_y, + array_upper({0}, 3) AS n_z +FROM {1} +LIMIT 1 +""".format(independent_varname, source_table) +result = plpy.execute(query) + +if not result: +plpy.error("MLP: Input table could be empty.") + +has_x_dim, has_y_dim, has_z_dim = [bool(result[0][i]) + for i in ('n_x', 'n_y', 'n_z')] +if not has_x_dim: +plpy.error("MLP: {0} is empty.".format(independent_varname)) + +# error out if >2d matrix +if has_z_dim: +plpy.error("MLP: Input table is not in the right format.") +return has_y_dim + + +class MLPPreProcessor: +""" +This class consumes and validates the pre-processed source table used for +MLP mini-batch. This also populates values from the pre-processed summary +table which is used by MLP mini-batch + +""" +# summary table columns names +DEPENDENT_VARNAME = "dependent_varname" +INDEPENDENT_VARNAME = "independent_varname" +GROUPING_COL = "grouping_cols" +CLASS_VALUES = "class_values" +MODEL_TYPE_CLASSIFICATION = "classification" +MODEL_TYPE_REGRESSION = "regression" + +def __init__(self, source_table): +self.source_table = source_table +self.preprocessed_summary_dict = None +self.summary_table = add_postfix(self.source_table, "_summary") +self.std_table = add_postfix(self.source_table, "_standardization") + +self._validate_and_set_preprocessed_summary() + +def _validate_and_set_preprocessed_summary(self): +input_tbl_valid(self.source_table, 'MLP') + +if not table_exists(self.summary_table) or not table_exists(self.std_table): +plpy.error("Tables {0} and/or {1} do not exist. These tables are" + " needed for using minibatch during training.".format( + self.summary_table, + self.std_table)) + +query = "SELECT * FROM {0}".format(self.summary_table) +summary_table_columns = plpy.execute(query) +if not summary_table_columns or len(summary_table_columns) == 0: +plpy.error("No columns in table {0}.".format(self.summary_table)) +else: +summary_table_columns = summary_table_columns[0] + +required_columns = (self.DEPENDENT_VARNAME, self.INDEPENDENT_VARNAME, --- End diff -- we also use `buffer_size` and `source_table` columns from the summary table. Do we need to validate those as well or are these three enough ? If we decide to assert all columns that we consume, we will have to keep this assert in sync with how we use the summary dict which is easy to forget. I don't have a better solution but just wanted to mention it. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175891761 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -72,107 +73,127 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, """ warm_start = bool(warm_start) optimizer_params = _get_optimizer_params(optimizer_param_str or "") + +tolerance = optimizer_params["tolerance"] +n_iterations = optimizer_params["n_iterations"] +step_size_init = optimizer_params["learning_rate_init"] +iterations_per_step = optimizer_params["iterations_per_step"] +power = optimizer_params["power"] +gamma = optimizer_params["gamma"] +step_size = step_size_init +n_tries = optimizer_params["n_tries"] +# lambda is a reserved word in python +lmbda = optimizer_params["lambda"] +batch_size = optimizer_params['batch_size'] +n_epochs = optimizer_params['n_epochs'] + summary_table = add_postfix(output_table, "_summary") standardization_table = add_postfix(output_table, "_standardization") -weights = '1' if not weights or not weights.strip() else weights.strip() hidden_layer_sizes = hidden_layer_sizes or [] -grouping_col = grouping_col or "" -activation = _get_activation_function_name(activation) -learning_rate_policy = _get_learning_rate_policy_name( -optimizer_params["learning_rate_policy"]) -activation_index = _get_activation_index(activation) - +# Note that we don't support weights with mini batching yet, so validate +# this based on is_minibatch_enabled. +weights = '1' if not weights or not weights.strip() else weights.strip() _validate_args(source_table, output_table, summary_table, standardization_table, independent_varname, dependent_varname, hidden_layer_sizes, optimizer_params, - is_classification, weights, warm_start, activation, - grouping_col) + warm_start, activation, grouping_col) +is_minibatch_enabled = check_if_minibatch_enabled(source_table, independent_varname) +_validate_params_based_on_minibatch(source_table, independent_varname, +dependent_varname, weights, +is_classification, +is_minibatch_enabled) +activation = _get_activation_function_name(activation) +learning_rate_policy = _get_learning_rate_policy_name( +optimizer_params["learning_rate_policy"]) +activation_index = _get_activation_index(activation) reserved_cols = ['coeff', 'loss', 'n_iterations'] +grouping_col = grouping_col or "" grouping_str, grouping_col = get_grouping_col_str(schema_madlib, 'MLP', reserved_cols, source_table, grouping_col) -current_iteration = 1 -prev_state = None -tolerance = optimizer_params["tolerance"] -n_iterations = optimizer_params["n_iterations"] -step_size_init = optimizer_params["learning_rate_init"] -iterations_per_step = optimizer_params["iterations_per_step"] -power = optimizer_params["power"] -gamma = optimizer_params["gamma"] -step_size = step_size_init -n_tries = optimizer_params["n_tries"] -# lambda is a reserved word in python -lmbda = optimizer_params["lambda"] -iterations_per_step = optimizer_params["iterations_per_step"] -num_input_nodes = array_col_dimension(source_table, - independent_varname) -num_output_nodes = 0 +dependent_varname_backup = dependent_varname classes = [] -dependent_type = get_expr_type(dependent_varname, source_table) -original_dependent_varname = dependent_varname - -x_mean_table = unique_string(desp='x_mean_table') -dimension, n_tuples = _tbl_dimension_rownum(schema_madlib, source_table, -independent_varname) - -tbl_data_scaled = unique_string(desp="tbl_data_scaled") -col_ind_var_norm_new = unique_string(desp="ind_var_norm") 
-col_dep_var_norm_new = unique_string(desp="dep_var_norm") -# Standardize the data, and create a standardized version of the -# source_table in tbl_data_scaled. Use this standardized table for IGD. -normalize_data(locals()) -if is_classification: -dependent_variable_sql = """ -SELECT
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175923217 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, +array_type="integer") + _create_summary_table(locals()) -_create_standardization_table(standardization_table, x_mean_table, - warm_start) +if is_minibatch_enabled: +# We already have the mean and std in the input standardization table +input_std_table = add_postfix(source_table, '_standardization') +_create_standardization_table(standardization_table, input_std_table, + warm_start) +else: +_create_standardization_table(standardization_table, x_mean_table, + warm_start) +# The original input table is the tab_data_scaled for mini batch. --- End diff -- this comment should be moved to the if block ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175627929 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); +int numberOfStages = numbersOfUnits.size() - 1; + +double stepsize = args[5].getAs(); + +state.allocate(*this, numberOfStages, + reinterpret_cast(numbersOfUnits.ptr())); +state.task.stepsize = stepsize; +const int activation = args[6].getAs(); +const int is_classification = args[7].getAs(); +// args[8] is for weighting the input row, which is populated later. +const bool warm_start = args[9].getAs(); +const double lambda = args[11].getAs(); +state.algo.batchSize = args[12].getAs(); +state.algo.nEpochs = args[13].getAs(); +state.task.lambda = lambda; +MLPTask::lambda = lambda; + +/* FIXME: The state is set back to zero for second row onwards if + initialized as in IGD. The following avoids that, but there is + some failure with debug build that must be fixed. +*/ +state.task.model.is_classification = +static_cast(is_classification); +state.task.model.activation = static_cast(activation); +MappedColumnVector initial_coeff = args[10].getAs(); +// copy initial_coeff into the model +Index fan_in, fan_out, layer_start = 0; +for (size_t k = 0; k < numberOfStages; ++k){ +fan_in = numbersOfUnits[k]; +fan_out = numbersOfUnits[k+1]; +state.task.model.u[k] << initial_coeff.segment(layer_start, (fan_in+1)*fan_out); +layer_start = (fan_in + 1) * fan_out; +} +} +// resetting in either case +state.reset(); +} + +// meta data +const uint16_t N = state.task.numberOfStages; --- End diff -- is there a reason we chose N and n as variable names ? Can we use more descriptive names ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175922864 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, --- End diff -- should we use the `PY2SQL` alias here ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175889157 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -590,51 +664,103 @@ def _validate_warm_start(output_table, summary_table, standardization_table, output_table + ". Invalid number of coefficients in model.") return coeff +def _validate_dependent_var(source_table, dependent_varname, +is_classification, is_minibatch_enabled): +expr_type = get_expr_type(dependent_varname, source_table) +int_types = ['integer', 'smallint', 'bigint'] +text_types = ['text', 'varchar', 'character varying', 'char', 'character'] +boolean_types = ['boolean'] +float_types = ['double precision', 'real'] +classification_types = int_types + boolean_types + text_types +regression_types = int_types + float_types +validate_type = classification_types if is_classification else regression_types --- End diff -- I think it's slightly cleaner if we don't use the `validate_type ` variable but use the `classification_types` and `regression_types ` variables. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175921215 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] + for _ in range(n_tries): +prev_state = None if not warm_start: coeff = [] -for i in range(len(layer_sizes) - 1): -fan_in = layer_sizes[i] -fan_out = layer_sizes[i + 1] +for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]): # Initalize according to Glorot and Bengio (2010) # See design doc for more info span = math.sqrt(6.0 / (fan_in + fan_out)) -dim = (layer_sizes[i] + 1) * layer_sizes[i + 1] -rand = plpy.execute("""SELECT array_agg({span}*2*(random()-0.5)) - AS random - FROM generate_series(0,{dim}) -""".format(span=span, dim=dim))[0]["random"] +dim = (fan_in + 1) * fan_out +rand = [span * (random() - 0.5) for _ in range(dim)] --- End diff -- why are we subtracting 0.5 from `random()` ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175931022 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -491,10 +571,28 @@ def _update_temp_model_table(args, iteration, temp_output_table, first_try): ) rel_state_subq {join_clause} """.format(insert_or_create_str=insert_or_create_str, - iteration=iteration, join_clause=join_clause, **args) + iteration=iteration, join_clause=join_clause, + internal_result_udf=internal_result_udf, **args) plpy.execute(model_table_query) +def _get_loss(schema_madlib, state, is_mini_batch): --- End diff -- This function is not used anywhere. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175628144 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); +int numberOfStages = numbersOfUnits.size() - 1; + +double stepsize = args[5].getAs(); + +state.allocate(*this, numberOfStages, + reinterpret_cast(numbersOfUnits.ptr())); +state.task.stepsize = stepsize; +const int activation = args[6].getAs(); +const int is_classification = args[7].getAs(); +// args[8] is for weighting the input row, which is populated later. +const bool warm_start = args[9].getAs(); +const double lambda = args[11].getAs(); +state.algo.batchSize = args[12].getAs(); +state.algo.nEpochs = args[13].getAs(); +state.task.lambda = lambda; +MLPTask::lambda = lambda; + +/* FIXME: The state is set back to zero for second row onwards if + initialized as in IGD. The following avoids that, but there is + some failure with debug build that must be fixed. +*/ +state.task.model.is_classification = +static_cast(is_classification); +state.task.model.activation = static_cast(activation); +MappedColumnVector initial_coeff = args[10].getAs(); +// copy initial_coeff into the model +Index fan_in, fan_out, layer_start = 0; +for (size_t k = 0; k < numberOfStages; ++k){ +fan_in = numbersOfUnits[k]; +fan_out = numbersOfUnits[k+1]; +state.task.model.u[k] << initial_coeff.segment(layer_start, (fan_in+1)*fan_out); +layer_start = (fan_in + 1) * fan_out; +} +} +// resetting in either case +state.reset(); +} + +// meta data +const uint16_t N = state.task.numberOfStages; +const double *n = state.task.numbersOfUnits; + +// tuple +Matrix indVar; +Matrix depVar; +try { --- End diff -- why do we expect the following 2 lines to fail ? may be add a comment explaining the reason. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175877947 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); +size_t i, k; +double total_loss = 0.; + +// gradient added over the batch +std::vector total_gradient_per_layer(N); +for (k=0; k < N; ++k) +total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), + model.u[k].cols()); + +for (i=0; i < n; i++){ +ColumnVector x = x_batch.row(i); +ColumnVector y_true = y_true_batch.row(i); + +std::vector net, o, delta; +feedForward(model, x, net, o); +backPropogate(y_true, o.back(), net, model, delta); + +for (k=0; k < N; k++){ +total_gradient_per_layer[k] += o[k] * delta[k].transpose(); +} + +// loss computation +ColumnVector y_estimated = o.back(); +if(model.is_classification){ +double clip = 1.e-10; +y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip); --- End diff -- Just curious, why do we need to recalculate `y_estimated`? ---
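The clamp guards the loss computation that follows (cut off in the quoted hunk) against log(0). Assuming the classification loss is a cross-entropy over the predicted probabilities, a small Python sketch of the same clipping:

```
# Clipping predicted probabilities into [1e-10, 1 - 1e-10] keeps the log terms
# finite when an output saturates at exactly 0 or 1 (cwiseMax/cwiseMin analogue).
import numpy as np

def clipped_cross_entropy(y_true, y_estimated, clip=1e-10):
    y_est = np.clip(y_estimated, clip, 1.0 - clip)
    return -np.sum(y_true * np.log(y_est) + (1.0 - y_true) * np.log(1.0 - y_est))
```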
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175929883 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, +array_type="integer") + _create_summary_table(locals()) -_create_standardization_table(standardization_table, x_mean_table, - warm_start) +if is_minibatch_enabled: +# We already have the mean and std in the input standardization table +input_std_table = add_postfix(source_table, '_standardization') +_create_standardization_table(standardization_table, input_std_table, + warm_start) +else: +_create_standardization_table(standardization_table, x_mean_table, + warm_start) +# The original input table is the tab_data_scaled for mini batch. +# Do NOT drop this, it will end up dropping the original data table. +plpy.execute("DROP TABLE IF EXISTS {0}".format(tbl_data_scaled)) +plpy.execute("DROP TABLE IF EXISTS {0}".format(x_mean_table)) --- End diff -- is there a test for this in install check to assert that the input tables including summary and std tables aren't dropped for minibatch ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175620624 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175871655 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -170,6 +289,24 @@ mlp_igd_final::run(AnyType &args) { return state; } + +/** + * @brief Perform the multilayer perceptron final step + */ +AnyType +mlp_minibatch_final::run(AnyType &args) { +// We request a mutable object. Depending on the backend, this might perform +// a deep copy. +MLPMiniBatchState > state = args[0]; +// Aggregates that haven't seen any data just return Null. +if (state.algo.numRows == 0) { return Null(); } + +L2::lambda = state.task.lambda; +state.algo.loss = state.algo.loss/static_cast(state.algo.numRows); +state.algo.loss += L2::loss(state.task.model); +return state; --- End diff -- I noticed that minibatch `AlgoState` does not have an incr model unlike igd `AlgoState` . Do you think it makes sense to add a comment to explain this ? ---
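For reference, the final step shown above amounts to averaging the accumulated loss over the rows this aggregate saw and then adding the L2 penalty on the coefficients. A small sketch under assumed names (the exact form of L2::loss, e.g. whether it carries a 1/2 factor, is not visible in the hunk):

```
import numpy as np

def finalize_loss(accumulated_loss, num_rows, coefficients, lmbda):
    if num_rows == 0:
        return None                              # aggregate saw no data
    loss = accumulated_loss / float(num_rows)    # average over rows seen
    loss += lmbda * np.sum(np.square(coefficients))   # assumed L2::loss analogue
    return loss
```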
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175917822 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] --- End diff -- we already did this at line 176. is this intentional? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175890376 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -1457,3 +1660,85 @@ def mlp_predict_help(schema_madlib, message): return """ No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. """.format(**args) + + +def check_if_minibatch_enabled(source_table, independent_varname): +""" +Function to validate if the source_table is converted to a format that +can be used for mini-batching. It checks for the dimensionalities of +the independent variable to determine the same. +""" +query = """ +SELECT array_upper({0}, 1) AS n_x, + array_upper({0}, 2) AS n_y, + array_upper({0}, 3) AS n_z +FROM {1} +LIMIT 1 +""".format(independent_varname, source_table) +result = plpy.execute(query) + +if not result: +plpy.error("MLP: Input table could be empty.") + +has_x_dim, has_y_dim, has_z_dim = [bool(result[0][i]) + for i in ('n_x', 'n_y', 'n_z')] +if not has_x_dim: +plpy.error("MLP: {0} is empty.".format(independent_varname)) + +# error out if >2d matrix +if has_z_dim: +plpy.error("MLP: Input table is not in the right format.") +return has_y_dim + + +class MLPPreProcessor: +""" +This class consumes and validates the pre-processed source table used for +MLP mini-batch. This also populates values from the pre-processed summary +table which is used by MLP mini-batch + +""" +# summary table columns names +DEPENDENT_VARNAME = "dependent_varname" +INDEPENDENT_VARNAME = "independent_varname" +GROUPING_COL = "grouping_cols" +CLASS_VALUES = "class_values" +MODEL_TYPE_CLASSIFICATION = "classification" +MODEL_TYPE_REGRESSION = "regression" + +def __init__(self, source_table): +self.source_table = source_table +self.preprocessed_summary_dict = None +self.summary_table = add_postfix(self.source_table, "_summary") +self.std_table = add_postfix(self.source_table, "_standardization") + +self._validate_and_set_preprocessed_summary() + +def _validate_and_set_preprocessed_summary(self): +input_tbl_valid(self.source_table, 'MLP') --- End diff -- we don't really need to validate the source table here since it would already be validated by the `_validate_args` function. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175895832 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -72,107 +73,127 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, """ warm_start = bool(warm_start) optimizer_params = _get_optimizer_params(optimizer_param_str or "") + +tolerance = optimizer_params["tolerance"] +n_iterations = optimizer_params["n_iterations"] +step_size_init = optimizer_params["learning_rate_init"] +iterations_per_step = optimizer_params["iterations_per_step"] +power = optimizer_params["power"] +gamma = optimizer_params["gamma"] +step_size = step_size_init +n_tries = optimizer_params["n_tries"] +# lambda is a reserved word in python +lmbda = optimizer_params["lambda"] +batch_size = optimizer_params['batch_size'] +n_epochs = optimizer_params['n_epochs'] + summary_table = add_postfix(output_table, "_summary") standardization_table = add_postfix(output_table, "_standardization") -weights = '1' if not weights or not weights.strip() else weights.strip() hidden_layer_sizes = hidden_layer_sizes or [] -grouping_col = grouping_col or "" -activation = _get_activation_function_name(activation) -learning_rate_policy = _get_learning_rate_policy_name( -optimizer_params["learning_rate_policy"]) -activation_index = _get_activation_index(activation) - +# Note that we don't support weights with mini batching yet, so validate +# this based on is_minibatch_enabled. +weights = '1' if not weights or not weights.strip() else weights.strip() _validate_args(source_table, output_table, summary_table, standardization_table, independent_varname, dependent_varname, hidden_layer_sizes, optimizer_params, - is_classification, weights, warm_start, activation, - grouping_col) + warm_start, activation, grouping_col) +is_minibatch_enabled = check_if_minibatch_enabled(source_table, independent_varname) +_validate_params_based_on_minibatch(source_table, independent_varname, +dependent_varname, weights, +is_classification, +is_minibatch_enabled) +activation = _get_activation_function_name(activation) +learning_rate_policy = _get_learning_rate_policy_name( +optimizer_params["learning_rate_policy"]) +activation_index = _get_activation_index(activation) reserved_cols = ['coeff', 'loss', 'n_iterations'] +grouping_col = grouping_col or "" grouping_str, grouping_col = get_grouping_col_str(schema_madlib, 'MLP', reserved_cols, source_table, grouping_col) -current_iteration = 1 -prev_state = None -tolerance = optimizer_params["tolerance"] -n_iterations = optimizer_params["n_iterations"] -step_size_init = optimizer_params["learning_rate_init"] -iterations_per_step = optimizer_params["iterations_per_step"] -power = optimizer_params["power"] -gamma = optimizer_params["gamma"] -step_size = step_size_init -n_tries = optimizer_params["n_tries"] -# lambda is a reserved word in python -lmbda = optimizer_params["lambda"] -iterations_per_step = optimizer_params["iterations_per_step"] -num_input_nodes = array_col_dimension(source_table, - independent_varname) -num_output_nodes = 0 +dependent_varname_backup = dependent_varname --- End diff -- can we add a comment explaining why we need this backup variable ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175873168 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); +size_t i, k; +double total_loss = 0.; + +// gradient added over the batch +std::vector total_gradient_per_layer(N); +for (k=0; k < N; ++k) +total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), + model.u[k].cols()); + +for (i=0; i < n; i++){ +ColumnVector x = x_batch.row(i); +ColumnVector y_true = y_true_batch.row(i); + +std::vector net, o, delta; +feedForward(model, x, net, o); --- End diff -- Can we use a more descriptive name for the variable `o` ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175625333 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175626817 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; --- End diff -- can we create this variable outside the if check and then use it if it's not null ? It looks cleaner and is easier to follow ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175621915 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
GitHub user njayaram2 opened a pull request: https://github.com/apache/madlib/pull/243 MLP: Add minibatch gradient descent solver

JIRA: MADLIB-1206

This commit adds support for mini-batch-based gradient descent for MLP. If the input table contains a 2D matrix for the independent variable, minibatch is automatically used as the solver. Two minibatch-specific optimizer parameters are also introduced: batch_size and n_epochs.
- batch_size defaults to min(200, buffer_size), where buffer_size is equal to the number of original input rows packed into a single row in the matrix.
- n_epochs is the number of times all the batches in a buffer are iterated over (default 1).

Other changes include:
- The dependent variable in the minibatch solver is also a matrix now; it was initially a vector.
- Randomize the order of processing a batch within an epoch.
- MLP minibatch currently doesn't support the weights param; an error is now thrown.
- Delete an unused type named mlp_step_result.
- Add unit tests for newly added functions in the Python file.

Co-authored-by: Rahul Iyer
Co-authored-by: Nikhil Kak

Closes #243

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/madlib/madlib mlp-minibatch-with-preprocessed-data-rebased

Alternatively you can review and apply these changes as the patch at: https://github.com/apache/madlib/pull/243.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #243

commit d9306f7c6a44f64c53df13c34759da55468c4d26
Author: Nandish Jayaram
Date: 2018-02-28T00:51:42Z

MLP: Add minibatch gradient descent solver

JIRA: MADLIB-1206

This commit adds support for mini-batch-based gradient descent for MLP. If the input table contains a 2D matrix for the independent variable, minibatch is automatically used as the solver. Two minibatch-specific optimizer parameters are also introduced: batch_size and n_epochs.
- batch_size defaults to min(200, buffer_size), where buffer_size is equal to the number of original input rows packed into a single row in the matrix.
- n_epochs is the number of times all the batches in a buffer are iterated over (default 1).

Other changes include:
- The dependent variable in the minibatch solver is also a matrix now; it was initially a vector.
- Randomize the order of processing a batch within an epoch.
- MLP minibatch currently doesn't support the weights param; an error is now thrown.
- Delete an unused type named mlp_step_result.
- Add unit tests for newly added functions in the Python file.

Co-authored-by: Rahul Iyer
Co-authored-by: Nikhil Kak

Closes #243 ---
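To make the solver selection and defaults described above concrete, a hedged sketch (the helper name and signature are assumptions; the actual code inspects the array dimensionality of the independent variable, as check_if_minibatch_enabled() does):

```
# Minibatch is chosen when the independent variable is a 2-D array; batch_size
# then defaults to min(200, buffer_size) taken from the preprocessor's summary.
def resolve_solver_and_batch_size(independent_var_ndims, buffer_size,
                                  requested_batch_size=None):
    if independent_var_ndims != 2:
        return 'igd', None
    batch_size = requested_batch_size or min(200, buffer_size)
    return 'minibatch', batch_size

# e.g. resolve_solver_and_batch_size(2, buffer_size=30) -> ('minibatch', 30)
```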