[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user asfgit closed the pull request at: https://github.com/apache/madlib/pull/243 ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r176262391 --- Diff: src/ports/postgres/modules/convex/test/mlp.sql_in --- @@ -340,6 +181,51 @@ INSERT INTO iris_data VALUES (149,ARRAY[6.2,3.4,5.4,2.3],'Iris-virginica',3,2), (150,ARRAY[5.9,3.0,5.1,1.8],'Iris-virginica',3,2); +-- NOTE that the batch specific tables were created using: +-- madlib.minibatch_preprocessor(), with the regular source tables used in +-- this file. + +-- Create preprocessed data that can be used with minibatch MLP: +DROP TABLE IF EXISTS iris_data_batch, iris_data_batch_summary, iris_data_batch_standardization; +CREATE TABLE iris_data_batch( +__id__ integer, +dependent_varname double precision[], +independent_varname double precision[] +); +COPY iris_data_batch (__id__, dependent_varname, independent_varname) FROM STDIN NULL '?' DELIMITER '|'; +0 | {{0,1,0},{0,1,0},{0,0,1},{1,0,0},{0,1,0},{0,1,0},{0,0,1},{1,0,0},{1,0,0},{0,1,0},{1,0,0},{0,0,1},{0,0,1},{0,0,1},{1,0,0},{0,0,1},{0,0,1},{1,0,0},{1,0,0},{0,0,1},{0,1,0},{0,0,1},{0,0,1},{0,0,1},{0,0,1},{1,0,0},{0,1,0},{0,0,1},{0,0,1},{1,0,0}} | {{0.828881825720994,-0.314980522532101,0.363710790466334,0.159758615207397},{-1.08079689039279,-1.57669227467446,-0.229158821743702,-0.240110581430527},{-1.08079689039279,-1.32434992424599,0.482284712908341,0.692917544057962},{-1.46273263361555,0.442046528753317,-1.35561108494277,-1.30642843913166},{-0.0623015751321059,-0.567322872960574,0.245136868024327,0.159758615207397},{-0.189613489539692,-0.819665223389045,0.304423829245331,0.159758615207397},{0.701569911313408,-1.32434992424599,0.778719519013359,0.959497008483245},{-1.20810880480038,-0.0626381721036282,-1.35561108494277,-1.4397181713443},{-0.698861147170034,0.946731229610261,-1.35561108494277,-1.30642843913166},{-0.82617306157762,-1.32434992424599,-0.407019705406713,-0.106820849 217886},{-0.698861147170034,2.71312768260957,-1.29632412372177,-1.4397181713443},{1.33812948335134,0.442046528753317,1.31230217000239,1.49265593733381},{0.319634168090651,-0.0626381721036282,0.660145596571352,0.826207276270604},{0.701569911313408,-1.32434992424599,0.778719519013359,0.959497008483245},{-0.698861147170034,1.19907358003873,-1.29632412372177,-1.30642843913166},{1.46544139775892,0.189704178324845,0.838006480234363,1.49265593733381},{1.21081756894375,-0.0626381721036282,0.897293441455367,1.49265593733381},{-0.444237318354863,1.70375828089568,-1.29632412372177,-1.30642843913166},{-0.82617306157762,1.95610063132415,-1.05917627883775,-1.03984897470638},{0.828881825720994,-0.819665223389045,0.95658040267637,0.959497008483245},{0.956193740128579,-0.567322872960574,0.541571674129345,0.42633807963268},{1.33812948335134,0.442046528753317,1.31230217000239,1.49265593733381},{0.574257996905822,0.946731229610261,1.01586736389737,1.49265593733381},{0.0650103392754793,-0.81966522338904 5,0.838006480234363,0.959497008483245},{0.0650103392754793,-0.819665223389045,0.838006480234363,0.959497008483245},{-1.46273263361555,0.442046528753317,-1.35561108494277,-1.30642843913166},{0.574257996905822,-2.08137697553141,0.482284712908341,0.42633807963268},{1.21081756894375,0.189704178324845,1.13444128633938,1.62594566954645},{1.97468905538926,-0.314980522532101,1.54945001488641,0.826207276270604},{-1.08079689039279,0.189704178324845,-1.29632412372177,-1.4397181713443}} +1 | 
{{0,1,0},{1,0,0},{0,1,0},{1,0,0},{1,0,0},{1,0,0},{1,0,0},{0,1,0},{0,0,1},{0,0,1},{1,0,0},{0,0,1},{1,0,0},{0,0,1},{0,1,0},{0,1,0},{0,1,0},{1,0,0},{1,0,0},{0,0,1},{0,1,0},{0,1,0},{0,0,1},{1,0,0},{1,0,0},{0,1,0},{1,0,0},{0,0,1},{0,1,0},{0,1,0}} | {{-0.0623015751321059,-0.0626381721036282,0.304423829245331,0.0264688829947554},{-0.316925403947277,2.96547003303804,-1.35561108494277,-1.30642843913166},{0.319634168090651,-0.819665223389045,0.838006480234363,0.559627811845321},{-0.953484975985206,1.19907358003873,-1.41489804616377,-1.17313870691902},{-0.953484975985206,0.442046528753317,-1.47418500738478,-1.30642843913166},{-1.33542071920796,0.442046528753317,-1.41489804616377,-1.30642843913166},{-1.71735646243072,-0.0626381721036282,-1.41489804616377,-1.30642843913166},{0.446946082498236,-0.0626381721036282,0.541571674129345,0.293048347420038},{1.21081756894375,-1.32434992424599,1.25301520878139,0.826207276270604},{0.701569911313408,0.694388879181789,1.3715891312234,1.75923540175909 },{-1.84466837683831,-0.0626381721036282,-1.53347196860578,-1.4397181713443},{1.84737714098168,1.45141593046721,1.430876092,1.75923540175909},{-0.82617306157762,1.19907358003873,-1.35561108494277,-1.30642843913166},{0.701569911313408,-0.314980522532101,1.13444128633938,0.826207276270604},{1.33812948335134,-0.567322872960574,0.660145596571352,0.293048347420038},{0.192322253683066,-0.0626381721036282,0.304423829245331,0.42633807963268},{-0.189613489539692,-0.819665223389045,0.304423829245331,0.159758615207397},{-1
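For context on the data above: each row of iris_data_batch is one buffer produced by the preprocessor, with the one-hot encoded labels packed into a 2-D dependent_varname array and the standardized features into a 2-D independent_varname array. A rough Python sketch of that packing, for illustration only (the helper name and the buffer size are assumptions, not madlib.minibatch_preprocessor()'s actual implementation):

```
# Illustrative packing of standardized rows into minibatch buffers.
# Assumes `features` are already standardized and `labels` are 0-based class indices.
import numpy as np

def pack_into_buffers(features, labels, n_classes, buffer_size=30):
    """Return (dependent 2-D array, independent 2-D array) pairs, one per buffer."""
    buffers = []
    for start in range(0, len(features), buffer_size):
        x_chunk = np.asarray(features[start:start + buffer_size], dtype=float)
        y_chunk = np.asarray(labels[start:start + buffer_size], dtype=int)
        # one-hot encode the dependent variable, matching rows like {0,1,0} above
        y_onehot = np.zeros((len(y_chunk), n_classes))
        y_onehot[np.arange(len(y_chunk)), y_chunk] = 1.0
        buffers.append((y_onehot, x_chunk))
    return buffers
```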
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r176218740 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; --- End diff -- Tried it, it was cleaner this way. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175949965 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] + for _ in range(n_tries): +prev_state = None if not warm_start: coeff = [] -for i in range(len(layer_sizes) - 1): -fan_in = layer_sizes[i] -fan_out = layer_sizes[i + 1] +for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]): # Initalize according to Glorot and Bengio (2010) # See design doc for more info span = math.sqrt(6.0 / (fan_in + fan_out)) -dim = (layer_sizes[i] + 1) * layer_sizes[i + 1] -rand = plpy.execute("""SELECT array_agg({span}*2*(random()-0.5)) - AS random - FROM generate_series(0,{dim}) -""".format(span=span, dim=dim))[0]["random"] +dim = (fan_in + 1) * fan_out +rand = [span * (random() - 0.5) for _ in range(dim)] --- End diff -- It's supposed to be explained in the design doc, as per the comment. I think these formulae are taken from the Glorot and Bengio (2010) paper cited in the comment. ---
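For readers without the design doc at hand, a minimal sketch of the initialization the diff refers to, with standalone names assumed here: Glorot and Bengio (2010) draw each coefficient uniformly from [-span, span] with span = sqrt(6 / (fan_in + fan_out)). Subtracting 0.5 from random() centers the uniform draw at zero, and the factor of 2 in the replaced SQL expression stretches it to the full [-span, span] interval.

```
# Sketch of Glorot/Bengio (2010) uniform initialization (not the MADlib code).
import math
from random import random

def glorot_uniform_coeff(layer_sizes):
    coeff = []
    for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]):
        span = math.sqrt(6.0 / (fan_in + fan_out))
        dim = (fan_in + 1) * fan_out        # +1 for the bias unit of each layer
        # 2 * span * (random() - 0.5) is uniform on [-span, span]
        coeff.extend(2.0 * span * (random() - 0.5) for _ in range(dim))
    return coeff

# e.g. a 4-input network with one 5-unit hidden layer and 3 outputs:
# glorot_uniform_coeff([4, 5, 3])
```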
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175950079 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, +array_type="integer") + _create_summary_table(locals()) -_create_standardization_table(standardization_table, x_mean_table, - warm_start) +if is_minibatch_enabled: +# We already have the mean and std in the input standardization table +input_std_table = add_postfix(source_table, '_standardization') +_create_standardization_table(standardization_table, input_std_table, + warm_start) +else: +_create_standardization_table(standardization_table, x_mean_table, + warm_start) +# The original input table is the tab_data_scaled for mini batch. +# Do NOT drop this, it will end up dropping the original data table. +plpy.execute("DROP TABLE IF EXISTS {0}".format(tbl_data_scaled)) +plpy.execute("DROP TABLE IF EXISTS {0}".format(x_mean_table)) --- End diff -- Yes. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175947887 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175947857 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
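The igd.hpp hunk quoted above is truncated by the archive; per the PR description, the change replaces the fixed sequential sweep with a batch order that is reshuffled on every epoch. A hedged Python sketch of that control flow (function and parameter names here are assumptions, not the C++ in igd.hpp):

```
# Process one packed buffer as mini-batches, visiting the batches in a
# random order on each epoch. `get_loss_and_update` stands in for
# Task::getLossAndUpdateModel.
import random

def run_epochs(X, y, model, stepsize, batch_size, n_epochs, get_loss_and_update):
    batch_starts = list(range(0, len(X), batch_size))
    for _ in range(n_epochs):
        random.shuffle(batch_starts)            # randomize batch order per epoch
        epoch_loss = 0.0
        for start in batch_starts:
            X_batch = X[start:start + batch_size]   # last batch may be shorter
            y_batch = y[start:start + batch_size]
            epoch_loss += get_loss_and_update(model, X_batch, y_batch, stepsize)
    return model
```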
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175950139 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -491,10 +571,28 @@ def _update_temp_model_table(args, iteration, temp_output_table, first_try): ) rel_state_subq {join_clause} """.format(insert_or_create_str=insert_or_create_str, - iteration=iteration, join_clause=join_clause, **args) + iteration=iteration, join_clause=join_clause, + internal_result_udf=internal_result_udf, **args) plpy.execute(model_table_query) +def _get_loss(schema_madlib, state, is_mini_batch): --- End diff -- Yes, must remove it. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175948252 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); --- End diff -- We probably could, but there are a couple of extra arguments that only minibatch gets, and not IGD (batch_size and n_epochs). ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175949682 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] --- End diff -- Yes, this looks like duplicated code. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175948750 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); +size_t i, k; +double total_loss = 0.; + +// gradient added over the batch +std::vector total_gradient_per_layer(N); +for (k=0; k < N; ++k) +total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), + model.u[k].cols()); + +for (i=0; i < n; i++){ +ColumnVector x = x_batch.row(i); +ColumnVector y_true = y_true_batch.row(i); + +std::vector net, o, delta; +feedForward(model, x, net, o); --- End diff -- We will have to change the design docs too for that. Apparently, the notation used here is supposed to be in sync with the design doc. ---
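For readers who want the shape of getLossAndUpdateModel() without the Eigen types, a rough Python rendering of the accumulation pattern under discussion: a forward and backward pass per row, per-layer gradients summed over the batch, then one weight update per batch. The feed_forward, back_propagate and loss_fn callables, and the averaging in the update step, are assumptions standing in for the C++ helpers rather than their exact behavior:

```
import numpy as np

def get_loss_and_update_model(model, x_batch, y_batch, stepsize,
                              feed_forward, back_propagate, loss_fn):
    """model: list of per-layer weight matrices; returns the summed batch loss."""
    total_gradient = [np.zeros_like(u) for u in model]   # one accumulator per layer
    total_loss = 0.0
    for x, y_true in zip(x_batch, y_batch):
        net, o = feed_forward(model, x)          # o[k]: output of layer k
        delta = back_propagate(y_true, o[-1], net, model)
        for k in range(len(model)):
            total_gradient[k] += np.outer(o[k], delta[k])   # o[k] * delta[k]^T
        total_loss += loss_fn(y_true, o[-1])
    for k in range(len(model)):                  # single update per batch
        model[k] -= stepsize * total_gradient[k] / len(x_batch)
    return total_loss
```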
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175888098 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -590,51 +664,103 @@ def _validate_warm_start(output_table, summary_table, standardization_table, output_table + ". Invalid number of coefficients in model.") return coeff +def _validate_dependent_var(source_table, dependent_varname, +is_classification, is_minibatch_enabled): +expr_type = get_expr_type(dependent_varname, source_table) +int_types = ['integer', 'smallint', 'bigint'] +text_types = ['text', 'varchar', 'character varying', 'char', 'character'] +boolean_types = ['boolean'] +float_types = ['double precision', 'real'] +classification_types = int_types + boolean_types + text_types +regression_types = int_types + float_types +validate_type = classification_types if is_classification else regression_types + +if is_minibatch_enabled: +# With pre-processed data, dep type is always an array +_assert("[]" in expr_type, +"Dependent variable column should refer to an array.") +# The dependent variable is always a double precision array in +# preprocessed data (so use regression_types) +# strip out '[]' from expr_type +_assert(expr_type[:-2] in regression_types, --- End diff -- There are other numeric types like `decimal`, `numeric` etc. That makes me think if we really need this assert ? Same for the regression case for igd at line 696 And if we really want to assert this, consider using the function `is_psql_numeric_type` in `utilities_py.in` ---
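A sketch of the element-type check being debated, widened to the decimal/numeric types the comment mentions (illustration only; the type set and function below are assumptions, not MADlib's is_psql_numeric_type):

```
# Minibatch input stores the dependent variable as a numeric array, so only
# the array's element type needs checking.
NUMERIC_TYPES = {'integer', 'smallint', 'bigint', 'double precision', 'real',
                 'decimal', 'numeric'}

def validate_minibatch_dependent_type(expr_type):
    if not expr_type.endswith('[]'):
        raise ValueError("Dependent variable column should refer to an array.")
    element_type = expr_type[:-2].strip()
    if element_type not in NUMERIC_TYPES:
        raise ValueError("Dependent variable element type must be numeric, "
                         "got '{0}'.".format(element_type))
```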
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175627412 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); --- End diff -- is it possible to reuse the code that gets the values from the args parameter ? I noticed that the igd transition function `mlp_igd_transition ` has the exact same code. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175872591 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); --- End diff -- is there a reason we chose N and n as variable names ? Can we use more descriptive names ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175923655 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -33,11 +34,12 @@ from convex.utils_regularization import __utils_normalize_data_grouping from utilities.in_mem_group_control import GroupIterationController from utilities.utilities import _array_to_string +from utilities.utilities import add_postfix +from utilities.utilities import py_list_to_sql_string as PY2SQL +from utilities.utilities import extract_keyvalue_params from utilities.utilities import _assert from utilities.utilities import _assert_equal from utilities.utilities import _string_to_array_with_quotes -from utilities.utilities import add_postfix -from utilities.utilities import extract_keyvalue_params from utilities.utilities import py_list_to_sql_string --- End diff -- we don't need this import anymore ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175894372 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -1457,3 +1660,85 @@ def mlp_predict_help(schema_madlib, message): return """ No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. """.format(**args) + + +def check_if_minibatch_enabled(source_table, independent_varname): +""" +Function to validate if the source_table is converted to a format that +can be used for mini-batching. It checks for the dimensionalities of +the independent variable to determine the same. +""" +query = """ +SELECT array_upper({0}, 1) AS n_x, + array_upper({0}, 2) AS n_y, + array_upper({0}, 3) AS n_z +FROM {1} +LIMIT 1 +""".format(independent_varname, source_table) +result = plpy.execute(query) + +if not result: +plpy.error("MLP: Input table could be empty.") + +has_x_dim, has_y_dim, has_z_dim = [bool(result[0][i]) + for i in ('n_x', 'n_y', 'n_z')] +if not has_x_dim: +plpy.error("MLP: {0} is empty.".format(independent_varname)) + +# error out if >2d matrix +if has_z_dim: +plpy.error("MLP: Input table is not in the right format.") +return has_y_dim + + +class MLPPreProcessor: +""" +This class consumes and validates the pre-processed source table used for +MLP mini-batch. This also populates values from the pre-processed summary +table which is used by MLP mini-batch + +""" +# summary table columns names +DEPENDENT_VARNAME = "dependent_varname" +INDEPENDENT_VARNAME = "independent_varname" +GROUPING_COL = "grouping_cols" +CLASS_VALUES = "class_values" +MODEL_TYPE_CLASSIFICATION = "classification" +MODEL_TYPE_REGRESSION = "regression" + +def __init__(self, source_table): +self.source_table = source_table +self.preprocessed_summary_dict = None +self.summary_table = add_postfix(self.source_table, "_summary") +self.std_table = add_postfix(self.source_table, "_standardization") + +self._validate_and_set_preprocessed_summary() + +def _validate_and_set_preprocessed_summary(self): +input_tbl_valid(self.source_table, 'MLP') + +if not table_exists(self.summary_table) or not table_exists(self.std_table): +plpy.error("Tables {0} and/or {1} do not exist. These tables are" + " needed for using minibatch during training.".format( + self.summary_table, + self.std_table)) + +query = "SELECT * FROM {0}".format(self.summary_table) +summary_table_columns = plpy.execute(query) +if not summary_table_columns or len(summary_table_columns) == 0: +plpy.error("No columns in table {0}.".format(self.summary_table)) +else: +summary_table_columns = summary_table_columns[0] + +required_columns = (self.DEPENDENT_VARNAME, self.INDEPENDENT_VARNAME, +self.CLASS_VALUES) +if set(required_columns) <= set(summary_table_columns): +self.preprocessed_summary_dict = summary_table_columns +else: +plpy.error("Expected columns ({0}, {1} and/or {2}) not present in" --- End diff -- We can use the `required_columns` to format the error message so that we don't have to repeat the column names. Something like ``` plpy.error("One or more of the expected columns {0} not present in {1}".format(required_columns, self.summary_table)) ``` ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175893520 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -1457,3 +1660,85 @@ def mlp_predict_help(schema_madlib, message): return """ No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. """.format(**args) + + +def check_if_minibatch_enabled(source_table, independent_varname): +""" +Function to validate if the source_table is converted to a format that +can be used for mini-batching. It checks for the dimensionalities of +the independent variable to determine the same. +""" +query = """ +SELECT array_upper({0}, 1) AS n_x, + array_upper({0}, 2) AS n_y, + array_upper({0}, 3) AS n_z +FROM {1} +LIMIT 1 +""".format(independent_varname, source_table) +result = plpy.execute(query) + +if not result: +plpy.error("MLP: Input table could be empty.") + +has_x_dim, has_y_dim, has_z_dim = [bool(result[0][i]) + for i in ('n_x', 'n_y', 'n_z')] +if not has_x_dim: +plpy.error("MLP: {0} is empty.".format(independent_varname)) + +# error out if >2d matrix +if has_z_dim: +plpy.error("MLP: Input table is not in the right format.") +return has_y_dim + + +class MLPPreProcessor: +""" +This class consumes and validates the pre-processed source table used for +MLP mini-batch. This also populates values from the pre-processed summary +table which is used by MLP mini-batch + +""" +# summary table columns names +DEPENDENT_VARNAME = "dependent_varname" +INDEPENDENT_VARNAME = "independent_varname" +GROUPING_COL = "grouping_cols" +CLASS_VALUES = "class_values" +MODEL_TYPE_CLASSIFICATION = "classification" +MODEL_TYPE_REGRESSION = "regression" + +def __init__(self, source_table): +self.source_table = source_table +self.preprocessed_summary_dict = None +self.summary_table = add_postfix(self.source_table, "_summary") +self.std_table = add_postfix(self.source_table, "_standardization") + +self._validate_and_set_preprocessed_summary() + +def _validate_and_set_preprocessed_summary(self): +input_tbl_valid(self.source_table, 'MLP') + +if not table_exists(self.summary_table) or not table_exists(self.std_table): +plpy.error("Tables {0} and/or {1} do not exist. These tables are" + " needed for using minibatch during training.".format( + self.summary_table, + self.std_table)) + +query = "SELECT * FROM {0}".format(self.summary_table) +summary_table_columns = plpy.execute(query) +if not summary_table_columns or len(summary_table_columns) == 0: +plpy.error("No columns in table {0}.".format(self.summary_table)) +else: +summary_table_columns = summary_table_columns[0] + +required_columns = (self.DEPENDENT_VARNAME, self.INDEPENDENT_VARNAME, --- End diff -- we also use `buffer_size` and `source_table` columns from the summary table. Do we need to validate those as well or are these three enough ? If we decide to assert all columns that we consume, we will have to keep this assert in sync with how we use the summary dict which is easy to forget. I don't have a better solution but just wanted to mention it. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175891761 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -72,107 +73,127 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, """ warm_start = bool(warm_start) optimizer_params = _get_optimizer_params(optimizer_param_str or "") + +tolerance = optimizer_params["tolerance"] +n_iterations = optimizer_params["n_iterations"] +step_size_init = optimizer_params["learning_rate_init"] +iterations_per_step = optimizer_params["iterations_per_step"] +power = optimizer_params["power"] +gamma = optimizer_params["gamma"] +step_size = step_size_init +n_tries = optimizer_params["n_tries"] +# lambda is a reserved word in python +lmbda = optimizer_params["lambda"] +batch_size = optimizer_params['batch_size'] +n_epochs = optimizer_params['n_epochs'] + summary_table = add_postfix(output_table, "_summary") standardization_table = add_postfix(output_table, "_standardization") -weights = '1' if not weights or not weights.strip() else weights.strip() hidden_layer_sizes = hidden_layer_sizes or [] -grouping_col = grouping_col or "" -activation = _get_activation_function_name(activation) -learning_rate_policy = _get_learning_rate_policy_name( -optimizer_params["learning_rate_policy"]) -activation_index = _get_activation_index(activation) - +# Note that we don't support weights with mini batching yet, so validate +# this based on is_minibatch_enabled. +weights = '1' if not weights or not weights.strip() else weights.strip() _validate_args(source_table, output_table, summary_table, standardization_table, independent_varname, dependent_varname, hidden_layer_sizes, optimizer_params, - is_classification, weights, warm_start, activation, - grouping_col) + warm_start, activation, grouping_col) +is_minibatch_enabled = check_if_minibatch_enabled(source_table, independent_varname) +_validate_params_based_on_minibatch(source_table, independent_varname, +dependent_varname, weights, +is_classification, +is_minibatch_enabled) +activation = _get_activation_function_name(activation) +learning_rate_policy = _get_learning_rate_policy_name( +optimizer_params["learning_rate_policy"]) +activation_index = _get_activation_index(activation) reserved_cols = ['coeff', 'loss', 'n_iterations'] +grouping_col = grouping_col or "" grouping_str, grouping_col = get_grouping_col_str(schema_madlib, 'MLP', reserved_cols, source_table, grouping_col) -current_iteration = 1 -prev_state = None -tolerance = optimizer_params["tolerance"] -n_iterations = optimizer_params["n_iterations"] -step_size_init = optimizer_params["learning_rate_init"] -iterations_per_step = optimizer_params["iterations_per_step"] -power = optimizer_params["power"] -gamma = optimizer_params["gamma"] -step_size = step_size_init -n_tries = optimizer_params["n_tries"] -# lambda is a reserved word in python -lmbda = optimizer_params["lambda"] -iterations_per_step = optimizer_params["iterations_per_step"] -num_input_nodes = array_col_dimension(source_table, - independent_varname) -num_output_nodes = 0 +dependent_varname_backup = dependent_varname classes = [] -dependent_type = get_expr_type(dependent_varname, source_table) -original_dependent_varname = dependent_varname - -x_mean_table = unique_string(desp='x_mean_table') -dimension, n_tuples = _tbl_dimension_rownum(schema_madlib, source_table, -independent_varname) - -tbl_data_scaled = unique_string(desp="tbl_data_scaled") -col_ind_var_norm_new = unique_string(desp="ind_var_norm") 
-col_dep_var_norm_new = unique_string(desp="dep_var_norm") -# Standardize the data, and create a standardized version of the -# source_table in tbl_data_scaled. Use this standardized table for IGD. -normalize_data(locals()) -if is_classification: -dependent_variable_sql = """ -SELECT
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175923217 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, +array_type="integer") + _create_summary_table(locals()) -_create_standardization_table(standardization_table, x_mean_table, - warm_start) +if is_minibatch_enabled: +# We already have the mean and std in the input standardization table +input_std_table = add_postfix(source_table, '_standardization') +_create_standardization_table(standardization_table, input_std_table, + warm_start) +else: +_create_standardization_table(standardization_table, x_mean_table, + warm_start) +# The original input table is the tab_data_scaled for mini batch. --- End diff -- this comment should be moved to the if block ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175627929 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); +int numberOfStages = numbersOfUnits.size() - 1; + +double stepsize = args[5].getAs(); + +state.allocate(*this, numberOfStages, + reinterpret_cast(numbersOfUnits.ptr())); +state.task.stepsize = stepsize; +const int activation = args[6].getAs(); +const int is_classification = args[7].getAs(); +// args[8] is for weighting the input row, which is populated later. +const bool warm_start = args[9].getAs(); +const double lambda = args[11].getAs(); +state.algo.batchSize = args[12].getAs(); +state.algo.nEpochs = args[13].getAs(); +state.task.lambda = lambda; +MLPTask::lambda = lambda; + +/* FIXME: The state is set back to zero for second row onwards if + initialized as in IGD. The following avoids that, but there is + some failure with debug build that must be fixed. +*/ +state.task.model.is_classification = +static_cast(is_classification); +state.task.model.activation = static_cast(activation); +MappedColumnVector initial_coeff = args[10].getAs(); +// copy initial_coeff into the model +Index fan_in, fan_out, layer_start = 0; +for (size_t k = 0; k < numberOfStages; ++k){ +fan_in = numbersOfUnits[k]; +fan_out = numbersOfUnits[k+1]; +state.task.model.u[k] << initial_coeff.segment(layer_start, (fan_in+1)*fan_out); +layer_start = (fan_in + 1) * fan_out; +} +} +// resetting in either case +state.reset(); +} + +// meta data +const uint16_t N = state.task.numberOfStages; --- End diff -- is there a reason we chose N and n as variable names ? Can we use more descriptive names ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175922864 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, --- End diff -- should we use the `PY2SQL` alias here ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175889157 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -590,51 +664,103 @@ def _validate_warm_start(output_table, summary_table, standardization_table, output_table + ". Invalid number of coefficients in model.") return coeff +def _validate_dependent_var(source_table, dependent_varname, +is_classification, is_minibatch_enabled): +expr_type = get_expr_type(dependent_varname, source_table) +int_types = ['integer', 'smallint', 'bigint'] +text_types = ['text', 'varchar', 'character varying', 'char', 'character'] +boolean_types = ['boolean'] +float_types = ['double precision', 'real'] +classification_types = int_types + boolean_types + text_types +regression_types = int_types + float_types +validate_type = classification_types if is_classification else regression_types --- End diff -- I think it's slightly cleaner if we don't use the `validate_type ` variable but use the `classification_types` and `regression_types ` variables. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175921215 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] + for _ in range(n_tries): +prev_state = None if not warm_start: coeff = [] -for i in range(len(layer_sizes) - 1): -fan_in = layer_sizes[i] -fan_out = layer_sizes[i + 1] +for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]): # Initalize according to Glorot and Bengio (2010) # See design doc for more info span = math.sqrt(6.0 / (fan_in + fan_out)) -dim = (layer_sizes[i] + 1) * layer_sizes[i + 1] -rand = plpy.execute("""SELECT array_agg({span}*2*(random()-0.5)) - AS random - FROM generate_series(0,{dim}) -""".format(span=span, dim=dim))[0]["random"] +dim = (fan_in + 1) * fan_out +rand = [span * (random() - 0.5) for _ in range(dim)] --- End diff -- why are we subtracting 0.5 from `random()` ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175931022 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -491,10 +571,28 @@ def _update_temp_model_table(args, iteration, temp_output_table, first_try): ) rel_state_subq {join_clause} """.format(insert_or_create_str=insert_or_create_str, - iteration=iteration, join_clause=join_clause, **args) + iteration=iteration, join_clause=join_clause, + internal_result_udf=internal_result_udf, **args) plpy.execute(model_table_query) +def _get_loss(schema_madlib, state, is_mini_batch): --- End diff -- This function is not used anywhere. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175628144 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; +state.allocate(*this, previousState.task.numberOfStages, + previousState.task.numbersOfUnits); +state = previousState; +} else { +// configuration parameters +ArrayHandle numbersOfUnits = args[4].getAs >(); +int numberOfStages = numbersOfUnits.size() - 1; + +double stepsize = args[5].getAs(); + +state.allocate(*this, numberOfStages, + reinterpret_cast(numbersOfUnits.ptr())); +state.task.stepsize = stepsize; +const int activation = args[6].getAs(); +const int is_classification = args[7].getAs(); +// args[8] is for weighting the input row, which is populated later. +const bool warm_start = args[9].getAs(); +const double lambda = args[11].getAs(); +state.algo.batchSize = args[12].getAs(); +state.algo.nEpochs = args[13].getAs(); +state.task.lambda = lambda; +MLPTask::lambda = lambda; + +/* FIXME: The state is set back to zero for second row onwards if + initialized as in IGD. The following avoids that, but there is + some failure with debug build that must be fixed. +*/ +state.task.model.is_classification = +static_cast(is_classification); +state.task.model.activation = static_cast(activation); +MappedColumnVector initial_coeff = args[10].getAs(); +// copy initial_coeff into the model +Index fan_in, fan_out, layer_start = 0; +for (size_t k = 0; k < numberOfStages; ++k){ +fan_in = numbersOfUnits[k]; +fan_out = numbersOfUnits[k+1]; +state.task.model.u[k] << initial_coeff.segment(layer_start, (fan_in+1)*fan_out); +layer_start = (fan_in + 1) * fan_out; +} +} +// resetting in either case +state.reset(); +} + +// meta data +const uint16_t N = state.task.numberOfStages; +const double *n = state.task.numbersOfUnits; + +// tuple +Matrix indVar; +Matrix depVar; +try { --- End diff -- why do we expect the following 2 lines to fail ? may be add a comment explaining the reason. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175877947 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); +size_t i, k; +double total_loss = 0.; + +// gradient added over the batch +std::vector total_gradient_per_layer(N); +for (k=0; k < N; ++k) +total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), + model.u[k].cols()); + +for (i=0; i < n; i++){ +ColumnVector x = x_batch.row(i); +ColumnVector y_true = y_true_batch.row(i); + +std::vector net, o, delta; +feedForward(model, x, net, o); +backPropogate(y_true, o.back(), net, model, delta); + +for (k=0; k < N; k++){ +total_gradient_per_layer[k] += o[k] * delta[k].transpose(); +} + +// loss computation +ColumnVector y_estimated = o.back(); +if(model.is_classification){ +double clip = 1.e-10; +y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip); --- End diff -- Just curious, why do we need to recalculate `y_estimated`? ---
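The clamp guards the loss computation that follows (cut off in the quoted hunk) against log(0). Assuming the classification loss is a cross-entropy over the predicted probabilities, a small Python sketch of the same clipping:

```
# Clipping predicted probabilities into [1e-10, 1 - 1e-10] keeps the log terms
# finite when an output saturates at exactly 0 or 1 (cwiseMax/cwiseMin analogue).
import numpy as np

def clipped_cross_entropy(y_true, y_estimated, clip=1e-10):
    y_est = np.clip(y_estimated, clip, 1.0 - clip)
    return -np.sum(y_true * np.log(y_est) + (1.0 - y_true) * np.log(1.0 - y_est))
```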
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175929883 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -292,26 +329,33 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, # used, it will be an empty list if there was not grouping. groups = [t[col_grp_key] for t in res if t[col_grp_key]] losses = [t['loss'] for t in res] -loss = zip(groups, losses) if len(groups)==len(losses) \ - else losses -plpy.info("Iteration: " + str(it.iteration) + ", Loss: <" + \ - ', '.join([str(l) for l in loss]) + ">") +loss = zip(groups, losses) if groups else losses +plpy.info("Iteration: {0}, Loss: <{1}>". + format(it.iteration, ', '.join(map(str, loss it.final() _update_temp_model_table(it_args, it.iteration, temp_output_table, - first_try) + is_minibatch_enabled, first_try) first_try = False -layer_sizes_str = py_list_to_sql_string( -layer_sizes, array_type="integer") -classes_str = py_list_to_sql_string( -[strip_end_quotes(cl, "'") for cl in classes], -array_type=dependent_type) +layer_sizes_str = py_list_to_sql_string(layer_sizes, +array_type="integer") + _create_summary_table(locals()) -_create_standardization_table(standardization_table, x_mean_table, - warm_start) +if is_minibatch_enabled: +# We already have the mean and std in the input standardization table +input_std_table = add_postfix(source_table, '_standardization') +_create_standardization_table(standardization_table, input_std_table, + warm_start) +else: +_create_standardization_table(standardization_table, x_mean_table, + warm_start) +# The original input table is the tab_data_scaled for mini batch. +# Do NOT drop this, it will end up dropping the original data table. +plpy.execute("DROP TABLE IF EXISTS {0}".format(tbl_data_scaled)) +plpy.execute("DROP TABLE IF EXISTS {0}".format(x_mean_table)) --- End diff -- is there a test for this in install check to assert that the input tables including summary and std tables aren't dropped for minibatch ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175620624 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175871655 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -170,6 +289,24 @@ mlp_igd_final::run(AnyType &args) { return state; } + +/** + * @brief Perform the multilayer perceptron final step + */ +AnyType +mlp_minibatch_final::run(AnyType &args) { +// We request a mutable object. Depending on the backend, this might perform +// a deep copy. +MLPMiniBatchState > state = args[0]; +// Aggregates that haven't seen any data just return Null. +if (state.algo.numRows == 0) { return Null(); } + +L2::lambda = state.task.lambda; +state.algo.loss = state.algo.loss/static_cast(state.algo.numRows); +state.algo.loss += L2::loss(state.task.model); +return state; --- End diff -- I noticed that minibatch `AlgoState` does not have an incr model unlike igd `AlgoState` . Do you think it makes sense to add a comment to explain this ? ---
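For reference, the final step shown above amounts to averaging the accumulated loss over the rows this aggregate saw and then adding the L2 penalty on the coefficients. A small sketch under assumed names (the exact form of L2::loss, e.g. whether it carries a 1/2 factor, is not visible in the hunk):

```
import numpy as np

def finalize_loss(accumulated_loss, num_rows, coefficients, lmbda):
    if num_rows == 0:
        return None                              # aggregate saw no data
    loss = accumulated_loss / float(num_rows)    # average over rows seen
    loss += lmbda * np.sum(np.square(coefficients))   # assumed L2::loss analogue
    return loss
```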
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175917822 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -222,67 +243,83 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, it_args.update({ 'group_by_clause': group_by_clause, 'using_clause': using_clause, -'grouping_str_comma': grouping_str_comma +'grouping_str_comma': grouping_str_comma, }) first_try = True temp_output_table = unique_string(desp='temp_output_table') + +layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes] --- End diff -- we already did this at line 176. is this intentional? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175890376 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -1457,3 +1660,85 @@ def mlp_predict_help(schema_madlib, message): return """ No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. """.format(**args) + + +def check_if_minibatch_enabled(source_table, independent_varname): +""" +Function to validate if the source_table is converted to a format that +can be used for mini-batching. It checks for the dimensionalities of +the independent variable to determine the same. +""" +query = """ +SELECT array_upper({0}, 1) AS n_x, + array_upper({0}, 2) AS n_y, + array_upper({0}, 3) AS n_z +FROM {1} +LIMIT 1 +""".format(independent_varname, source_table) +result = plpy.execute(query) + +if not result: +plpy.error("MLP: Input table could be empty.") + +has_x_dim, has_y_dim, has_z_dim = [bool(result[0][i]) + for i in ('n_x', 'n_y', 'n_z')] +if not has_x_dim: +plpy.error("MLP: {0} is empty.".format(independent_varname)) + +# error out if >2d matrix +if has_z_dim: +plpy.error("MLP: Input table is not in the right format.") +return has_y_dim + + +class MLPPreProcessor: +""" +This class consumes and validates the pre-processed source table used for +MLP mini-batch. This also populates values from the pre-processed summary +table which is used by MLP mini-batch + +""" +# summary table columns names +DEPENDENT_VARNAME = "dependent_varname" +INDEPENDENT_VARNAME = "independent_varname" +GROUPING_COL = "grouping_cols" +CLASS_VALUES = "class_values" +MODEL_TYPE_CLASSIFICATION = "classification" +MODEL_TYPE_REGRESSION = "regression" + +def __init__(self, source_table): +self.source_table = source_table +self.preprocessed_summary_dict = None +self.summary_table = add_postfix(self.source_table, "_summary") +self.std_table = add_postfix(self.source_table, "_standardization") + +self._validate_and_set_preprocessed_summary() + +def _validate_and_set_preprocessed_summary(self): +input_tbl_valid(self.source_table, 'MLP') --- End diff -- we don't really need to validate the source table here since it would already be validated by the `_validate_args` function. ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175895832 --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in --- @@ -72,107 +73,127 @@ def mlp(schema_madlib, source_table, output_table, independent_varname, """ warm_start = bool(warm_start) optimizer_params = _get_optimizer_params(optimizer_param_str or "") + +tolerance = optimizer_params["tolerance"] +n_iterations = optimizer_params["n_iterations"] +step_size_init = optimizer_params["learning_rate_init"] +iterations_per_step = optimizer_params["iterations_per_step"] +power = optimizer_params["power"] +gamma = optimizer_params["gamma"] +step_size = step_size_init +n_tries = optimizer_params["n_tries"] +# lambda is a reserved word in python +lmbda = optimizer_params["lambda"] +batch_size = optimizer_params['batch_size'] +n_epochs = optimizer_params['n_epochs'] + summary_table = add_postfix(output_table, "_summary") standardization_table = add_postfix(output_table, "_standardization") -weights = '1' if not weights or not weights.strip() else weights.strip() hidden_layer_sizes = hidden_layer_sizes or [] -grouping_col = grouping_col or "" -activation = _get_activation_function_name(activation) -learning_rate_policy = _get_learning_rate_policy_name( -optimizer_params["learning_rate_policy"]) -activation_index = _get_activation_index(activation) - +# Note that we don't support weights with mini batching yet, so validate +# this based on is_minibatch_enabled. +weights = '1' if not weights or not weights.strip() else weights.strip() _validate_args(source_table, output_table, summary_table, standardization_table, independent_varname, dependent_varname, hidden_layer_sizes, optimizer_params, - is_classification, weights, warm_start, activation, - grouping_col) + warm_start, activation, grouping_col) +is_minibatch_enabled = check_if_minibatch_enabled(source_table, independent_varname) +_validate_params_based_on_minibatch(source_table, independent_varname, +dependent_varname, weights, +is_classification, +is_minibatch_enabled) +activation = _get_activation_function_name(activation) +learning_rate_policy = _get_learning_rate_policy_name( +optimizer_params["learning_rate_policy"]) +activation_index = _get_activation_index(activation) reserved_cols = ['coeff', 'loss', 'n_iterations'] +grouping_col = grouping_col or "" grouping_str, grouping_col = get_grouping_col_str(schema_madlib, 'MLP', reserved_cols, source_table, grouping_col) -current_iteration = 1 -prev_state = None -tolerance = optimizer_params["tolerance"] -n_iterations = optimizer_params["n_iterations"] -step_size_init = optimizer_params["learning_rate_init"] -iterations_per_step = optimizer_params["iterations_per_step"] -power = optimizer_params["power"] -gamma = optimizer_params["gamma"] -step_size = step_size_init -n_tries = optimizer_params["n_tries"] -# lambda is a reserved word in python -lmbda = optimizer_params["lambda"] -iterations_per_step = optimizer_params["iterations_per_step"] -num_input_nodes = array_col_dimension(source_table, - independent_varname) -num_output_nodes = 0 +dependent_varname_backup = dependent_varname --- End diff -- can we add a comment explaining why we need this backup variable ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175873168 --- Diff: src/modules/convex/task/mlp.hpp --- @@ -111,6 +117,57 @@ class MLP { template double MLP::lambda = 0; +template +double +MLP::getLossAndUpdateModel( +model_type &model, +const Matrix &x_batch, +const Matrix &y_true_batch, +const double &stepsize) { + +uint16_t N = model.u.size(); // assuming nu. of layers >= 1 +size_t n = x_batch.rows(); +size_t i, k; +double total_loss = 0.; + +// gradient added over the batch +std::vector total_gradient_per_layer(N); +for (k=0; k < N; ++k) +total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), + model.u[k].cols()); + +for (i=0; i < n; i++){ +ColumnVector x = x_batch.row(i); +ColumnVector y_true = y_true_batch.row(i); + +std::vector net, o, delta; +feedForward(model, x, net, o); --- End diff -- Can we use a more descriptive name for the variable `o` ? ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175625333 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175626817 --- Diff: src/modules/convex/mlp_igd.cpp --- @@ -130,6 +145,90 @@ mlp_igd_transition::run(AnyType &args) { return state; } +/** + * @brief Perform the multilayer perceptron minibatch transition step + * + * Called for each tuple. + */ +AnyType +mlp_minibatch_transition::run(AnyType &args) { +// For the first tuple: args[0] is nothing more than a marker that +// indicates that we should do some initial operations. +// For other tuples: args[0] holds the computation state until last tuple +MLPMiniBatchState > state = args[0]; + +// initilize the state if first tuple +if (state.algo.numRows == 0) { +if (!args[3].isNull()) { +MLPMiniBatchState > previousState = args[3]; --- End diff -- can we create this variable outside the if check and then use it if it's not null ? It looks cleaner and is easier to follow ---
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/243#discussion_r175621915 --- Diff: src/modules/convex/algo/igd.hpp --- @@ -90,20 +90,27 @@ IGD::transition(state_type &state, for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { double loss = 0.0; -for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; - curr_batch++, curr_batch_row_index += batch_size) { - Matrix X_batch; - ColumnVector y_batch; - if (curr_batch == n_batches-1) { - // last batch - X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); - y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); - } else { - X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); - y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); - } - loss += Task::getLossAndUpdateModel( - state.task.model, X_batch, y_batch, state.task.stepsize); +int random_curr_batch[n_batches]; +for(int i=0; i
[GitHub] madlib pull request #243: MLP: Add minibatch gradient descent solver
GitHub user njayaram2 opened a pull request: https://github.com/apache/madlib/pull/243 MLP: Add minibatch gradient descent solver

JIRA: MADLIB-1206

This commit adds support for mini-batch-based gradient descent for MLP. If the input table contains a 2D matrix for the independent variable, minibatch is automatically used as the solver. Two minibatch-specific optimizer parameters are also introduced: batch_size and n_epochs.
- batch_size defaults to min(200, buffer_size), where buffer_size is equal to the number of original input rows packed into a single row in the matrix.
- n_epochs is the number of times all the batches in a buffer are iterated over (default 1).

Other changes include:
- The dependent variable in the minibatch solver is also a matrix now; it was initially a vector.
- Randomize the order of processing a batch within an epoch.
- MLP minibatch currently doesn't support the weights param; an error is now thrown.
- Delete an unused type named mlp_step_result.
- Add unit tests for newly added functions in the Python file.

Co-authored-by: Rahul Iyer
Co-authored-by: Nikhil Kak

Closes #243

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/madlib/madlib mlp-minibatch-with-preprocessed-data-rebased

Alternatively you can review and apply these changes as the patch at: https://github.com/apache/madlib/pull/243.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #243

commit d9306f7c6a44f64c53df13c34759da55468c4d26
Author: Nandish Jayaram
Date: 2018-02-28T00:51:42Z

MLP: Add minibatch gradient descent solver

JIRA: MADLIB-1206

This commit adds support for mini-batch-based gradient descent for MLP. If the input table contains a 2D matrix for the independent variable, minibatch is automatically used as the solver. Two minibatch-specific optimizer parameters are also introduced: batch_size and n_epochs.
- batch_size defaults to min(200, buffer_size), where buffer_size is equal to the number of original input rows packed into a single row in the matrix.
- n_epochs is the number of times all the batches in a buffer are iterated over (default 1).

Other changes include:
- The dependent variable in the minibatch solver is also a matrix now; it was initially a vector.
- Randomize the order of processing a batch within an epoch.
- MLP minibatch currently doesn't support the weights param; an error is now thrown.
- Delete an unused type named mlp_step_result.
- Add unit tests for newly added functions in the Python file.

Co-authored-by: Rahul Iyer
Co-authored-by: Nikhil Kak

Closes #243 ---
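To make the solver selection and defaults described above concrete, a hedged sketch (the helper name and signature are assumptions; the actual code inspects the array dimensionality of the independent variable, as check_if_minibatch_enabled() does):

```
# Minibatch is chosen when the independent variable is a 2-D array; batch_size
# then defaults to min(200, buffer_size) taken from the preprocessor's summary.
def resolve_solver_and_batch_size(independent_var_ndims, buffer_size,
                                  requested_batch_size=None):
    if independent_var_ndims != 2:
        return 'igd', None
    batch_size = requested_batch_size or min(200, buffer_size)
    return 'minibatch', batch_size

# e.g. resolve_solver_and_batch_size(2, buffer_size=30) -> ('minibatch', 30)
```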