Github user kaknikhil commented on a diff in the pull request:
https://github.com/apache/madlib/pull/313#discussion_r211725199
--- Diff: src/modules/convex/task/mlp.hpp ---
@@ -192,18 +189,23 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
// 1. normalize to per row update
// 2. discount by stepsize
// 3. add regularization
- // 4. make negative
+ // 4. make negative for descent
for (Index k=0; k < model.num_layers; k++){
Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
regularization.row(0).setZero(); // Do not update bias
- total_gradient_per_layer[k] = -stepsize *
(total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
- regularization);
- model.updateVelocity(total_gradient_per_layer[k], k);
- model.updatePosition(total_gradient_per_layer[k], k);
+ total_gradient_per_layer[k] = -stepsize *
+ (total_gradient_per_layer[k] /
static_cast<double>(num_rows_in_batch) +
+ regularization);
+
+ // total_gradient_per_layer is now the update vector
+ model.velocity[k] = model.momentum * model.velocity[k] +
total_gradient_per_layer[k];
--- End diff --
We should only update the velocity vector when momentum > 0; when momentum is zero, the velocity term contributes nothing to the update, so computing `model.momentum * model.velocity[k] + total_gradient_per_layer[k]` is an unnecessary per-layer matrix operation.
---