eric-haibin-lin closed pull request #10471: Revert "[MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)" URL: https://github.com/apache/incubator-mxnet/pull/10471
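The substantive change being reverted is where weight decay (wd) enters the dense AdaGrad and AdaDelta updates. #10388 folded wd into the gradient before clipping, so the decay term was clipped and accumulated into AdaGrad's squared-gradient history; the revert restores the earlier behavior, in which wd acts directly on the weight at the update step. The following is a minimal NumPy sketch, not MXNet code, contrasting the two placements for the AdaGrad hunks in the diff below; the function names are illustrative and rescale_grad is omitted for brevity:

    import numpy as np

    def adagrad_wd_in_grad(weight, grad, history,
                           lr=0.01, wd=0.1, eps=1e-7, clip_gradient=0.4):
        # Placement from #10388 (the code being reverted): wd is folded
        # into the gradient, so the decay term is clipped and enters the
        # squared-gradient history.
        g = grad + wd * weight
        g = np.clip(g, -clip_gradient, clip_gradient)
        history += np.square(g)
        weight += -lr * g / np.sqrt(history + eps)

    def adagrad_wd_decoupled(weight, grad, history,
                             lr=0.01, wd=0.1, eps=1e-7, clip_gradient=0.4):
        # Placement the revert restores: wd acts on the weight at the
        # update step and bypasses both clipping and the history.
        g = np.clip(grad, -clip_gradient, clip_gradient)
        history += np.square(g)
        weight += -lr * (g / np.sqrt(history + eps) + wd * weight)

    # With wd != 0 the two placements diverge after a single step:
    w1, h1 = np.ones(3), np.zeros(3)
    w2, h2 = np.ones(3), np.zeros(3)
    g = np.array([0.5, -0.2, 0.05])
    adagrad_wd_in_grad(w1, g, h1)
    adagrad_wd_decoupled(w2, g, h2)
    print(w1)
    print(w2)

This placement difference also appears to be why the revert drops the nonzero wd case from test_adagrad's wd_options: with wd enabled, the Python reference in test_optimizer.py and the kernel under test would otherwise have to agree on where the decay is applied.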
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance:

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 18bd5c6e4fe..6589e77e453 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -538,7 +538,7 @@ def update_multi_precision(self, index, weight, grad, state):
 class Signum(Optimizer):
     """The Signum optimizer that takes the sign of gradient or momentum.

-    The optimizer updates the weight by::
+    The optimizer updates the weight by:

     rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight
     state = momentum * state + (1-momentum)*rescaled_grad
@@ -604,14 +604,6 @@ class FTML(Optimizer):
     *FTML - Follow the Moving Leader in Deep Learning*,
     available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf.

-    Denote time step by t. The optimizer updates the weight by::
-
-        rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        v = beta2 * v + (1 - beta2) * square(rescaled_grad)
-        d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon)
-        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
-        weight = - z / d_t
-
     This optimizer accepts the following parameters in addition to those
     accepted by :class:`.Optimizer`.
@@ -1076,13 +1068,6 @@ class AdaGrad(Optimizer):
     Methods for Online Learning and Stochastic Optimization*, and available at
     http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.

-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + weight * wd, clip_gradient)
-        history += square(grad)
-        div = grad / sqrt(history + float_stable_eps)
-        weight += div * -lr
-
     This optimizer accepts the following parameters in addition to those
     accepted by :class:`.Optimizer`.
@@ -1120,12 +1105,12 @@ def update(self, index, weight, grad, state):
                 kwargs['clip_gradient'] = self.clip_gradient
             sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs)
         else:
-            grad = grad * self.rescale_grad + weight * wd
+            grad = grad * self.rescale_grad
             if self.clip_gradient is not None:
                 grad = clip(grad, -self.clip_gradient, self.clip_gradient)
             history[:] += square(grad)
             div = grad / sqrt(history + self.float_stable_eps)
-            weight[:] += div * -lr
+            weight[:] += (div + weight * wd) * -lr

 @register
 class RMSProp(Optimizer):
@@ -1210,15 +1195,6 @@ class AdaDelta(Optimizer):
     This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
     learning rate method*, available at https://arxiv.org/abs/1212.5701.

-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
-        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
-        weight -= delta
-
     This optimizer accepts the following parameters in addition to those
     accepted by :class:`.Optimizer`.
@@ -1246,7 +1222,6 @@ def update(self, index, weight, grad, state):

         # preprocess grad
         grad *= self.rescale_grad
-        grad += wd * weight
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
@@ -1259,7 +1234,7 @@ def update(self, index, weight, grad, state):
         acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta

         # update weight
-        weight[:] -= current_delta
+        weight[:] -= current_delta + wd * weight

 #pylint: disable=invalid-name
 #pylint: disable=line-too-long
@@ -1346,13 +1321,6 @@ class Adamax(Optimizer):
     It is a variant of Adam based on the infinity norm
     available at http://arxiv.org/abs/1412.6980 Section 7.

-    The optimizer updates the weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        m = beta1 * m_t + (1 - beta1) * grad
-        u = maximum(beta2 * u, abs(grad))
-        weight -= lr / (1 - beta1**t) * m / u
-
     This optimizer accepts the following parameters in addition to those
     accepted by :class:`.Optimizer`.
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 07b306702f3..bbd7845f66f 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -999,12 +999,12 @@ def update(self, index, weight, grad, state):
         wd = self._get_wd(index)
         history = state
-        grad = grad * self.rescale_grad + weight * wd
+        grad = grad * self.rescale_grad
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         history[:] += mx.nd.square(grad)
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
-        weight[:] += div * -lr
+        weight[:] += (div + weight * wd) * -lr

 def test_adagrad():
     mx.random.seed(0)
@@ -1014,7 +1014,7 @@ def test_adagrad():
     eps_options = [{}, {'eps': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.1}]
+    wd_options = [{}, {'wd': 0.0}]
     for dtype in [np.float32]:
         for eps_option in eps_options:
             for cg_option in cg_options:
@@ -1031,89 +1031,6 @@ def test_adagrad():
                                       w_stype='row_sparse', g_stype='row_sparse')

-
-# AdaDelta
-class PyAdaDelta(mx.optimizer.Optimizer):
-    """The python reference of AdaDelta optimizer.
-
-    This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
-    learning rate method*, available at https://arxiv.org/abs/1212.5701.
-
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
-        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
-        weight -= delta
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by :class:`.Optimizer`.
-
-    Parameters
-    ----------
-    eps: float, optional
-        Small value to avoid division by 0.
-
-    Parameters
-    ----------
-    rho: float
-        Decay rate for both squared gradients and delta.
-    epsilon : float
-        Small value to avoid division by 0.
-    """
-    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
-        super(PyAdaDelta, self).__init__(**kwargs)
-        self.rho = rho
-        self.epsilon = epsilon
-
-    def create_state(self, index, weight):
-        return (mx.nd.zeros(weight.shape, weight.context),  # accumulated g
-                mx.nd.zeros(weight.shape, weight.context))  # accumulated delta
-
-    def update(self, index, weight, grad, state):
-        self._update_count(index)
-        lr = self._get_lr(index)
-        wd = self._get_wd(index)
-
-        # preprocess grad
-        grad *= self.rescale_grad
-        grad += wd * weight
-        if self.clip_gradient is not None:
-            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
-
-        # accumulated g and delta initlization
-        acc_g, acc_delta = state
-
-        # update g, delta
-        acc_g[:] = self.rho * acc_g + (1. - self.rho) * grad * grad
-        current_delta = mx.nd.sqrt(acc_delta + self.epsilon) / mx.nd.sqrt(acc_g + self.epsilon) * grad
-        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
-
-        # update weight
-        weight[:] -= current_delta
-
-def test_adadelta():
-    mx.random.seed(0)
-    opt1 = PyAdaDelta
-    opt2 = mx.optimizer.AdaDelta
-    shape = (3, 4, 5)
-    eps_options = [{}, {'epsilon': 1e-8}]
-    cg_options = [{}, {'clip_gradient': 0.4}]
-    rg_options = [{}, {'rescale_grad': 0.14}]
-    wd_options = [{}, {'wd': 0.1}]
-    for dtype in [np.float32]:
-        for eps_option in eps_options:
-            for cg_option in cg_options:
-                for rg_option in rg_options:
-                    for wd_option in wd_options:
-                        kwarg = {}
-                        kwarg.update(eps_option)
-                        kwarg.update(cg_option)
-                        kwarg.update(rg_option)
-                        kwarg.update(wd_option)
-                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
-
 if __name__ == '__main__':
     import nose

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

With regards,
Apache Git Services