This is an automated email from the ASF dual-hosted git repository.

haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
     new f9a34f8  Revert "[MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)" (#10471)
f9a34f8 is described below

commit f9a34f874cc80d6199fb55cd49c6bca1df1dd486
Author: Haibin Lin <linhaibin.e...@gmail.com>
AuthorDate: Mon Apr 9 12:54:56 2018 -0700

    Revert "[MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)" (#10471)

    This reverts commit c24765f6088fd66d072dfc3402998680de659217.
---
 python/mxnet/optimizer.py               | 40 ++-------------
 tests/python/unittest/test_optimizer.py | 89 ++-------------------------------
 2 files changed, 7 insertions(+), 122 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 18bd5c6..6589e77 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -538,7 +538,7 @@ class SGD(Optimizer):
 class Signum(Optimizer):
     """The Signum optimizer that takes the sign of gradient or momentum.
 
-    The optimizer updates the weight by::
+    The optimizer updates the weight by:
 
         rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight
         state = momentum * state + (1-momentum)*rescaled_grad
@@ -604,14 +604,6 @@ class FTML(Optimizer):
     *FTML - Follow the Moving Leader in Deep Learning*,
     available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf.
 
-    Denote time step by t. The optimizer updates the weight by::
-
-        rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        v = beta2 * v + (1 - beta2) * square(rescaled_grad)
-        d_t = (1 - power(beta1, t)) / lr * square_root(v / (1 - power(beta2, t))) + epsilon)
-        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
-        weight = - z / d_t
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
@@ -1076,13 +1068,6 @@ class AdaGrad(Optimizer):
     Methods for Online Learning and Stochastic Optimization*, and available at
     http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
 
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + weight * wd, clip_gradient)
-        history += square(grad)
-        div = grad / sqrt(history + float_stable_eps)
-        weight += div * -lr
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
@@ -1120,12 +1105,12 @@ class AdaGrad(Optimizer):
                 kwargs['clip_gradient'] = self.clip_gradient
             sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs)
         else:
-            grad = grad * self.rescale_grad + weight * wd
+            grad = grad * self.rescale_grad
             if self.clip_gradient is not None:
                 grad = clip(grad, -self.clip_gradient, self.clip_gradient)
             history[:] += square(grad)
             div = grad / sqrt(history + self.float_stable_eps)
-            weight[:] += div * -lr
+            weight[:] += (div + weight * wd) * -lr
 
 @register
 class RMSProp(Optimizer):
@@ -1210,15 +1195,6 @@ class AdaDelta(Optimizer):
     This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
     learning rate method*, available at https://arxiv.org/abs/1212.5701.
 
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
-        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
-        weight -= delta
-
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
@@ -1246,7 +1222,6 @@ class AdaDelta(Optimizer):
 
         # preprocess grad
         grad *= self.rescale_grad
-        grad += wd * weight
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
 
@@ -1259,7 +1234,7 @@ class AdaDelta(Optimizer):
         acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
 
         # update weight
-        weight[:] -= current_delta
+        weight[:] -= current_delta + wd * weight
 
 #pylint: disable=invalid-name
 #pylint: disable=line-too-long
@@ -1346,13 +1321,6 @@ class Adamax(Optimizer):
     It is a variant of Adam based on the infinity norm
     available at http://arxiv.org/abs/1412.6980 Section 7.
 
-    The optimizer updates the weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        m = beta1 * m_t + (1 - beta1) * grad
-        u = maximum(beta2 * u, abs(grad))
-        weight -= lr / (1 - beta1**t) * m / u
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 07b3067..bbd7845 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -999,12 +999,12 @@ class PyAdaGrad(mx.optimizer.Optimizer):
         wd = self._get_wd(index)
         history = state
 
-        grad = grad * self.rescale_grad + weight * wd
+        grad = grad * self.rescale_grad
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         history[:] += mx.nd.square(grad)
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
-        weight[:] += div * -lr
+        weight[:] += (div + weight * wd) * -lr
 
 def test_adagrad():
     mx.random.seed(0)
@@ -1014,7 +1014,7 @@ def test_adagrad():
     eps_options = [{}, {'eps': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.1}]
+    wd_options = [{}, {'wd': 0.0}]
     for dtype in [np.float32]:
         for eps_option in eps_options:
             for cg_option in cg_options:
@@ -1031,89 +1031,6 @@ def test_adagrad():
                                               w_stype='row_sparse', g_stype='row_sparse')
 
-# AdaDelta
-class PyAdaDelta(mx.optimizer.Optimizer):
-    """The python reference of AdaDelta optimizer.
-
-    This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
-    learning rate method*, available at https://arxiv.org/abs/1212.5701.
-
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
-        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
-        weight -= delta
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by :class:`.Optimizer`.
-
-    Parameters
-    ----------
-    eps: float, optional
-        Small value to avoid division by 0.
-
-    Parameters
-    ----------
-    rho: float
-        Decay rate for both squared gradients and delta.
-    epsilon : float
-        Small value to avoid division by 0.
- """ - def __init__(self, rho=0.90, epsilon=1e-5, **kwargs): - super(PyAdaDelta, self).__init__(**kwargs) - self.rho = rho - self.epsilon = epsilon - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context), # accumulated g - mx.nd.zeros(weight.shape, weight.context)) # accumulated delta - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - # preprocess grad - grad *= self.rescale_grad - grad += wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - - # accumulated g and delta initlization - acc_g, acc_delta = state - - # update g, delta - acc_g[:] = self.rho * acc_g + (1. - self.rho) * grad * grad - current_delta = mx.nd.sqrt(acc_delta + self.epsilon) / mx.nd.sqrt(acc_g + self.epsilon) * grad - acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta - - # update weight - weight[:] -= current_delta - -def test_adadelta(): - mx.random.seed(0) - opt1 = PyAdaDelta - opt2 = mx.optimizer.AdaDelta - shape = (3, 4, 5) - eps_options = [{}, {'epsilon': 1e-8}] - cg_options = [{}, {'clip_gradient': 0.4}] - rg_options = [{}, {'rescale_grad': 0.14}] - wd_options = [{}, {'wd': 0.1}] - for dtype in [np.float32]: - for eps_option in eps_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(eps_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - if __name__ == '__main__': import nose -- To stop receiving notification emails like this one, please contact hai...@apache.org.