[
https://issues.apache.org/jira/browse/MADLIB-1210?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16487845#comment-16487845
]
Nikhil edited comment on MADLIB-1210 at 5/23/18 6:53 PM:
---------------------------------------------------------
PG 10.1, Mac, MNIST dataset
{code:sql}
############# debug master build ################################
DROP TABLE IF EXISTS mnist_result_summary, mnist_result,
mnist_result_standardization; SELECT madlib.mlp_classification(
'mnist_train', -- Source table
'mnist_result', -- Destination table
'x', -- Independent
'y', -- Dependent
ARRAY[100], -- Hidden layer sizes
'learning_rate_init=0.001,
n_iterations=1,
learning_rate_policy=constant,
lambda=0.0001,
tolerance=0', -- Optimizer params (lambda = regularization)
'tanh', -- Activation function
'', -- No weights
FALSE, -- No warmstart
FALSE); -- Verbose
Time: 533285.101 ms (08:53.285)
madlib_master=# select loss from mnist_result;
loss
------------------
3.34545760984669
################### debug momentum branch momentum 0.9 ######################
Timing is on.
DROP TABLE IF EXISTS mnist_result_summary, mnist_result,
mnist_result_standardization; SELECT madlib.mlp_classification(
'mnist_train', -- Source table
'mnist_result', -- Destination table
'x', -- Independent
'y', -- Dependent
ARRAY[100], -- Hidden layer sizes
'learning_rate_init=0.001,
n_iterations=1,
learning_rate_policy=constant, momentum=0.9, nesterov_momentum=False,
lambda=0.0001,
tolerance=0', -- Optimizer params (lambda = regularization)
'tanh', -- Activation function
'', -- No weights
FALSE, -- No warmstart
FALSE); -- Verbose
mlp_classification
--------------------
(1 row)
Time: 853966.848 ms (14:13.967)
madlib=# select loss from mnist_result;
loss
------------------
3.34545760984669
################### debug momentum branch nesterov momentum 0.9 ######################
Timing is on.
DROP TABLE IF EXISTS mnist_result_summary, mnist_result,
mnist_result_standardization; SELECT madlib.mlp_classification(
'mnist_train', -- Source table
'mnist_result', -- Destination table
'x', -- Independent
'y', -- Dependent
ARRAY[100], -- Hidden layer sizes
'learning_rate_init=0.001,
n_iterations=1,
learning_rate_policy=constant, momentum=0.9, nesterov_momentum=True,
lambda=0.0001,
tolerance=0', -- Optimizer params (lambda = regularization)
'tanh', -- Activation function
'', -- No weights
FALSE, -- No warmstart
FALSE); -- Verbose
mlp_classification
--------------------
(1 row)
Time: 956255.898 ms (15:56.256)
madlib=# select loss from mnist_result;
loss
------------------
3.33380958995029
################### debug momentum branch no momentum ######################
Timing is on.
DROP TABLE IF EXISTS mnist_result_summary, mnist_result,
mnist_result_standardization; SELECT madlib.mlp_classification(
'mnist_train', -- Source table
'mnist_result', -- Destination table
'x', -- Indepented
'y', -- Dependent
ARRAY[100], -- Hidden layer sizes
'learning_rate_init=0.001,
n_iterations=1,
learning_rate_policy=constant, momentum=0, nesterov_momentum=False,
lambda=0.0001,
tolerance=0', -- Optimizer params (lambda = regularization)
'tanh', -- Activation function
'', -- No weights
FALSE, -- No warmstart
FALSE); -- Verbose
mlp_classification
--------------------
(1 row)
Time: 719405.115 ms (11:59.405)
madlib=# select loss from mnist_result;
loss
------------------
3.38357626703038
(1 row)
{code}
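Summary of the four runs above (loss after n_iterations=1, MNIST, PG 10.1):
{code}
build                                      time (mm:ss)  loss
master                                     08:53         3.34545760984669
momentum branch, momentum=0.9              14:13         3.34545760984669
momentum branch, momentum=0.9 + Nesterov   15:56         3.33380958995029
momentum branch, momentum=0                11:59         3.38357626703038
{code}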
> Add momentum methods to MLP
> ---------------------------
>
> Key: MADLIB-1210
> URL: https://issues.apache.org/jira/browse/MADLIB-1210
> Project: Apache MADlib
> Issue Type: New Feature
> Components: Module: Neural Networks
> Reporter: Frank McQuillan
> Priority: Major
> Fix For: v1.15
>
> Attachments: Momentum methods comparison.xlsx
>
>
> Story
> As a data scientist,
> I want to use momentum methods in MLP,
> so that I get significantly better convergence behavior.
> Details
> Adding momentum will get the MADlib MLP algorithm closer to state of the art.
> 1) Implement momentum term, default value ~0.9
> Ref [1]:
> "Momentum update is another approach that almost always enjoys better
> converge rates on deep networks."
> 2) Implement Nesterov momentum, default TRUE
> Ref [1]:
> "Nesterov Momentum is a slightly different version of the momentum update
> that has recently been gaining popularity. It enjoys stronger theoretical
> converge guarantees for convex functions and in practice it also consistently
> works slightly better than standard momentum."
> Ref [2]
> "Nesterov’s accelerated gradient (abbrv. NAG; Nesterov, 1983) is a
> first-order optimization method which is proven to have a better convergence
> rate guarantee than gradient descent for general convex functions with
> Lipschitz-continuous derivatives (O(1/T^2) versus O(1/T))"
> Interface
> There are 2 new optimization parameters for momentum, which apply to both
> classification and regression:
> {code}
> 'learning_rate_init = <value>,
> learning_rate_policy = <value>,
> gamma = <value>,
> power = <value>,
> iterations_per_step = <value>,
> n_iterations = <value>,
> n_tries = <value>,
> lambda = <value>,
> tolerance = <value>,
> batch_size = <value>,
> n_epochs = <value>,
> momentum = <value>,
> nesterov_momentum= <value>'
> momentum
> Default: 0.9. Momentum can help accelerate learning and
> avoid local minima when using gradient descent. Value must be in the
> range 0 to 1, where 0 means no momentum.
> nesterov_momentum
> Default: TRUE. Nesterov momentum can provide better results than using
> classical momentum alone, due to its look ahead characteristics.
> In classical momentum you first correct the velocity and then take a step
> with that velocity, whereas in Nesterov momentum you first step in the
> velocity direction and then correct the velocity vector based on the new
> location.
> Nesterov momentum is only used when the 'momentum' parameter is > 0.
> {code}
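> For reference, the difference described above can be written out as update
> rules; this is the standard formulation from refs [1] and [2] (mu is the
> momentum coefficient, eta the learning rate), not necessarily the exact
> MADlib implementation:
> {code}
> % Classical momentum: correct the velocity first, then step with it
> v_{t+1} = \mu v_t - \eta \nabla L(w_t)
> w_{t+1} = w_t + v_{t+1}
>
> % Nesterov momentum: evaluate the gradient at the look-ahead point w_t + \mu v_t
> v_{t+1} = \mu v_t - \eta \nabla L(w_t + \mu v_t)
> w_{t+1} = w_t + v_{t+1}
> {code}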
> Open questions
> 1) Do momentum and Nesterov momentum work equally well with and without
> mini-batching?
> Is there any guidance we need to give to users on this?
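> A minimal sketch of how such a comparison could be run, assuming the
> interface above; table and column names are placeholders, and batch_size /
> n_epochs are the existing mini-batch params:
> {code}
> -- Mini-batch gradient descent with Nesterov momentum; omitting
> -- batch_size should give the non-mini-batch run for comparison.
> SELECT madlib.mlp_classification(
>     'source_table',   -- Source table
>     'result_table',   -- Destination table
>     'x',              -- Independent column
>     'y',              -- Dependent column
>     ARRAY[100],       -- Hidden layer sizes
>     'learning_rate_init=0.001, n_iterations=10,
>      batch_size=128, n_epochs=1,
>      momentum=0.9, nesterov_momentum=True',
>     'tanh');          -- Activation function
> {code}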
> Acceptance
> [1] Compare the usefulness of momentum with and without Nesterov, mini-batch,
> and SGD. Use a 2D Rosenbrock function (written out after this list) to
> compare in a similar way to test ref [100] in the comment further down,
> i.e., loss by iteration number.
> [2] Use another well behaved function (TBD) and run similar tests as in [1]
> above.
> [3] Test with MNIST.
> [4] Test with CIFAR-10 or CIFAR-100
> http://www.cs.toronto.edu/~kriz/cifar.html
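> For reference, the 2D Rosenbrock function used in [1] above, with the
> standard constants a=1 and b=100 (global minimum f(1,1)=0):
> {code}
> f(x, y) = (a - x)^2 + b(y - x^2)^2
> {code}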
> References
> [1] http://cs231n.github.io/neural-networks-3/#sgd
> [2] http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf, a
> link from the previous source.
> [3]
> http://ruder.io/optimizing-gradient-descent/index.html#gradientdescentoptimizationalgorithms
> [4]
> http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
> [5] https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf