This is an automated email from the ASF dual-hosted git repository.

haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new f9a34f8  Revert "[MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)" (#10471)
f9a34f8 is described below

commit f9a34f874cc80d6199fb55cd49c6bca1df1dd486
Author: Haibin Lin <linhaibin.e...@gmail.com>
AuthorDate: Mon Apr 9 12:54:56 2018 -0700

    Revert "[MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)" (#10471)
    
    This reverts commit c24765f6088fd66d072dfc3402998680de659217.
---
 python/mxnet/optimizer.py               | 40 ++-------------
 tests/python/unittest/test_optimizer.py | 89 ++-------------------------------
 2 files changed, 7 insertions(+), 122 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 18bd5c6..6589e77 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -538,7 +538,7 @@ class SGD(Optimizer):
 class Signum(Optimizer):
     """The Signum optimizer that takes the sign of gradient or momentum.
 
-    The optimizer updates the weight by::
+    The optimizer updates the weight by:
 
         rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight
         state = momentum * state + (1-momentum)*rescaled_grad
@@ -604,14 +604,6 @@ class FTML(Optimizer):
     *FTML - Follow the Moving Leader in Deep Learning*,
     available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf.
 
-    Denote time step by t. The optimizer updates the weight by::
-
-        rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        v = beta2 * v + (1 - beta2) * square(rescaled_grad)
-        d_t = (1 - power(beta1, t)) / lr * square_root(v / (1 - power(beta2, t))) + epsilon)
-        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
-        weight = - z / d_t
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1076,13 +1068,6 @@ class AdaGrad(Optimizer):
     Methods for Online Learning and Stochastic Optimization*, and available at
     http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
 
-    This optimizer updates each weight by::
-
-            grad = clip(grad * rescale_grad + weight * wd, clip_gradient)
-            history += square(grad)
-            div = grad / sqrt(history + float_stable_eps)
-            weight += div * -lr
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1120,12 +1105,12 @@ class AdaGrad(Optimizer):
                 kwargs['clip_gradient'] = self.clip_gradient
             sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs)
         else:
-            grad = grad * self.rescale_grad + weight * wd
+            grad = grad * self.rescale_grad
             if self.clip_gradient is not None:
                 grad = clip(grad, -self.clip_gradient, self.clip_gradient)
             history[:] += square(grad)
             div = grad / sqrt(history + self.float_stable_eps)
-            weight[:] += div * -lr
+            weight[:] += (div + weight * wd) * -lr
 
 @register
 class RMSProp(Optimizer):
@@ -1210,15 +1195,6 @@ class AdaDelta(Optimizer):
     This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
     learning rate method*, available at https://arxiv.org/abs/1212.5701.
 
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
-        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
-        weight -= delta
-
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1246,7 +1222,6 @@ class AdaDelta(Optimizer):
 
         # preprocess grad
         grad *= self.rescale_grad
-        grad += wd * weight
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
 
@@ -1259,7 +1234,7 @@ class AdaDelta(Optimizer):
         acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
 
         # update weight
-        weight[:] -= current_delta
+        weight[:] -= current_delta + wd * weight
 
 #pylint: disable=invalid-name
 #pylint: disable=line-too-long
@@ -1346,13 +1321,6 @@ class Adamax(Optimizer):
     It is a variant of Adam based on the infinity norm
     available at http://arxiv.org/abs/1412.6980 Section 7.
 
-    The optimizer updates the weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        m = beta1 * m_t + (1 - beta1) * grad
-        u = maximum(beta2 * u, abs(grad))
-        weight -= lr / (1 - beta1**t) * m / u
-
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
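A note for readers comparing the AdaGrad hunk above: the revert moves weight decay out of the gradient and back into the weight-update step. Below is a minimal NumPy sketch of the two placements; the function names, argument defaults, and the use of plain NumPy arrays are illustrative assumptions, not MXNet's API.

import numpy as np

def adagrad_step_wd_in_grad(weight, grad, history, lr, wd,
                            rescale_grad=1.0, clip_gradient=None, eps=1e-7):
    # Variant removed by this revert: wd is folded into the gradient, so it is
    # also clipped and squared into the accumulated history.
    g = grad * rescale_grad + wd * weight
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    history += np.square(g)
    weight += -lr * g / np.sqrt(history + eps)

def adagrad_step_wd_in_update(weight, grad, history, lr, wd,
                              rescale_grad=1.0, clip_gradient=None, eps=1e-7):
    # Variant restored by this revert: history tracks only the rescaled,
    # clipped gradient, and wd enters only at the final weight update.
    g = grad * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    history += np.square(g)
    weight += -lr * (g / np.sqrt(history + eps) + wd * weight)

The practical difference is that in the first variant the wd term is also clipped and squared into the history, whereas in the restored variant it only scales the current weight at update time.
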
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 07b3067..bbd7845 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -999,12 +999,12 @@ class PyAdaGrad(mx.optimizer.Optimizer):
         wd = self._get_wd(index)
 
         history = state
-        grad = grad * self.rescale_grad + weight * wd
+        grad = grad * self.rescale_grad
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         history[:] += mx.nd.square(grad)
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
-        weight[:] += div * -lr
+        weight[:] += (div + weight * wd) * -lr
 
 def test_adagrad():
     mx.random.seed(0)
@@ -1014,7 +1014,7 @@ def test_adagrad():
     eps_options = [{}, {'eps': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.1}]
+    wd_options = [{}, {'wd': 0.0}]
     for dtype in [np.float32]:
         for eps_option in eps_options:
             for cg_option in cg_options:
@@ -1031,89 +1031,6 @@ def test_adagrad():
                                               w_stype='row_sparse', g_stype='row_sparse')
 
 
-# AdaDelta
-class PyAdaDelta(mx.optimizer.Optimizer):
-    """The python reference of AdaDelta optimizer.
-
-    This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
-    learning rate method*, available at https://arxiv.org/abs/1212.5701.
-
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
-        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
-        weight -= delta
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by :class:`.Optimizer`.
-
-    Parameters
-    ----------
-    eps: float, optional
-        Small value to avoid division by 0.
-
-    Parameters
-    ----------
-    rho: float
-        Decay rate for both squared gradients and delta.
-    epsilon : float
-        Small value to avoid division by 0.
-    """
-    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
-        super(PyAdaDelta, self).__init__(**kwargs)
-        self.rho = rho
-        self.epsilon = epsilon
-
-    def create_state(self, index, weight):
-        return (mx.nd.zeros(weight.shape, weight.context),  # accumulated g
-                mx.nd.zeros(weight.shape, weight.context))  # accumulated delta
-
-    def update(self, index, weight, grad, state):
-        self._update_count(index)
-        lr = self._get_lr(index)
-        wd = self._get_wd(index)
-
-        # preprocess grad
-        grad *= self.rescale_grad
-        grad += wd * weight
-        if self.clip_gradient is not None:
-            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
-
-        # accumulated g and delta initlization
-        acc_g, acc_delta = state
-
-        # update g, delta
-        acc_g[:] = self.rho * acc_g + (1. - self.rho) * grad * grad
-        current_delta = mx.nd.sqrt(acc_delta + self.epsilon) / mx.nd.sqrt(acc_g + self.epsilon) * grad
-        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
-
-        # update weight
-        weight[:] -= current_delta
-
-def test_adadelta():
-    mx.random.seed(0)
-    opt1 = PyAdaDelta
-    opt2 = mx.optimizer.AdaDelta
-    shape = (3, 4, 5)
-    eps_options = [{}, {'epsilon': 1e-8}]
-    cg_options = [{}, {'clip_gradient': 0.4}]
-    rg_options = [{}, {'rescale_grad': 0.14}]
-    wd_options = [{}, {'wd': 0.1}]
-    for dtype in [np.float32]:
-        for eps_option in eps_options:
-            for cg_option in cg_options:
-                for rg_option in rg_options:
-                    for wd_option in wd_options:
-                        kwarg = {}
-                        kwarg.update(eps_option)
-                        kwarg.update(cg_option)
-                        kwarg.update(rg_option)
-                        kwarg.update(wd_option)
-                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
-
-
 
 if __name__ == '__main__':
     import nose

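The AdaDelta hunks are analogous: after the revert, weight decay no longer touches the gradient and is applied only when the weight is updated, and the PyAdaDelta/test_adadelta reference added by the reverted commit is removed from the tests. A small self-contained sketch of the restored update, assuming NumPy arrays in place of NDArrays (the standalone function and its defaults are illustrative, not the actual AdaDelta class):

import numpy as np

def adadelta_step(weight, grad, acc_grad, acc_delta, wd=0.0,
                  rho=0.90, epsilon=1e-5, rescale_grad=1.0, clip_gradient=None):
    # Rescale and optionally clip the raw gradient; wd is NOT mixed in here.
    g = grad * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    # Accumulate squared gradients and form the adaptive step.
    acc_grad[:] = rho * acc_grad + (1.0 - rho) * g * g
    delta = np.sqrt(acc_delta + epsilon) / np.sqrt(acc_grad + epsilon) * g
    acc_delta[:] = rho * acc_delta + (1.0 - rho) * delta * delta
    # Weight decay is applied only at the update, matching the '+' line in the diff.
    weight[:] -= delta + wd * weight

# Example usage with arbitrary values:
w = np.ones((3, 4, 5), dtype=np.float32)
state = (np.zeros_like(w), np.zeros_like(w))
adadelta_step(w, np.full_like(w, 0.1), *state, wd=0.1)
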
-- 
To stop receiving notification emails like this one, please contact
hai...@apache.org.
