Github user kaknikhil commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/313#discussion_r211725199
  
    --- Diff: src/modules/convex/task/mlp.hpp ---
    @@ -192,18 +189,23 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
         //  1. normalize to per row update
         //  2. discount by stepsize
         //  3. add regularization
    -    //  4. make negative
    +    //  4. make negative for descent
         for (Index k=0; k < model.num_layers; k++){
             Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
             regularization.row(0).setZero(); // Do not update bias
    -        total_gradient_per_layer[k] = -stepsize * 
(total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
    -                                                  regularization);
    -        model.updateVelocity(total_gradient_per_layer[k], k);
    -        model.updatePosition(total_gradient_per_layer[k], k);
    +        total_gradient_per_layer[k] = -stepsize *
    +            (total_gradient_per_layer[k] / 
static_cast<double>(num_rows_in_batch) +
    +             regularization);
    +
    +        // total_gradient_per_layer is now the update vector
    +        model.velocity[k] = model.momentum * model.velocity[k] + 
total_gradient_per_layer[k];
    --- End diff --
    
    We should update the velocity vector only when momentum > 0; otherwise the extra multiply-and-store per layer is wasted work.


---

Reply via email to