[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-25 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r350381992
 
 

 ##
 File path: src/operator/optimizer_op-inl.h
 ##
 @@ -1563,6 +1563,192 @@ inline void AdamUpdateEx(const nnvm::NodeAttrs& attrs,
   }
 }
 
+struct LambUpdatePhaseOneParam : public dmlc::Parameter<LambUpdatePhaseOneParam> {
+  float beta1;
+  float beta2;
+  float epsilon;
+  float t;
 
 Review comment:
   We are using `float` here for an integer value (the update count). @sxjscience can you explain how we would lose precision in the operation `beta^t`?
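For context, a minimal numpy sketch (illustrative only, not part of the PR) of the quantities in question: the integer step count t stored as a float, and the bias-correction term 1 - beta^t computed in float32 versus float64. Small integer step counts are represented exactly in float32 (up to 2**24); the power term itself can drift slightly from a float64 reference.

    import numpy as np

    beta1 = 0.9
    for t in (1, 10, 1000):
        corr32 = np.float32(1) - np.power(np.float32(beta1), np.float32(t))  # float32 path
        corr64 = 1.0 - np.power(np.float64(beta1), np.float64(t))            # float64 reference
        print(t, corr32, corr64, abs(float(corr32) - corr64))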


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services


[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-22 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r349745221
 
 

 ##
 File path: python/mxnet/optimizer/optimizer.py
 ##
 @@ -1244,6 +1244,54 @@ def update(self, index, weight, grad, state):
 kwargs = {}
 sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs)
 
+
+@register
+class LAMB(Optimizer):
+"""LAMB Optimizer.
+"""
+def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, 
epsilon=1e-6,
+ lower_bound=None, upper_bound=None, bias_correction=False, 
**kwargs):
 
 Review comment:
   Earlier we discussed offline keeping it `False`. Is there any reason for changing it to `True` now?
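For readers following the thread, a small illustration (not part of the PR) of why this default matters: with bias_correction=True the moment estimates are rescaled by 1/(1 - beta**t), which is large for the first few updates and fades quickly, so the flag mainly changes early-step behaviour.

    beta1, beta2 = 0.9, 0.999
    for t in (1, 2, 10, 100):
        print(t, 1.0 / (1.0 - beta1 ** t), 1.0 / (1.0 - beta2 ** t))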




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-22 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r349744815
 
 

 ##
 File path: src/operator/optimizer_op-inl.h
 ##
 @@ -1563,6 +1563,192 @@ inline void AdamUpdateEx(const nnvm::NodeAttrs& attrs,
   }
 }
 
+struct LambUpdatePhaseOneParam : public dmlc::Parameter<LambUpdatePhaseOneParam> {
+  float beta1;
+  float beta2;
+  float epsilon;
+  float t;
+  bool bias_correction;
+  float wd;
+  float rescale_grad;
+  float clip_gradient;
+  DMLC_DECLARE_PARAMETER(LambUpdatePhaseOneParam) {
+    DMLC_DECLARE_FIELD(beta1)
+    .set_default(0.9f)
+    .describe("The decay rate for the 1st moment estimates.");
+    DMLC_DECLARE_FIELD(beta2)
+    .set_default(0.999f)
+    .describe("The decay rate for the 2nd moment estimates.");
+    DMLC_DECLARE_FIELD(epsilon)
+    .set_default(1e-6f)
+    .describe("A small constant for numerical stability.");
+    DMLC_DECLARE_FIELD(t)
+    .describe("Index update count.");
+    DMLC_DECLARE_FIELD(bias_correction)
+    .set_default(false)
+    .describe("Whether to use bias correction.");
+    DMLC_DECLARE_FIELD(wd)
+    .describe("Weight decay augments the objective function with a "
+              "regularization term that penalizes large weights. "
+              "The penalty scales with the square of the magnitude of each weight.");
+    DMLC_DECLARE_FIELD(rescale_grad)
+    .set_default(1.0f)
+    .describe("Rescale gradient to grad = rescale_grad*grad.");
+    DMLC_DECLARE_FIELD(clip_gradient)
+    .set_default(-1.0f)
+    .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
+              "If clip_gradient <= 0, gradient clipping is turned off. "
+              "grad = max(min(grad, clip_gradient), -clip_gradient).");
+  }
+};
+
+struct LambUpdatePhaseTwoParam : public dmlc::Parameter<LambUpdatePhaseTwoParam> {
+  float lr;
+  float lower_bound;
+  float upper_bound;
+  DMLC_DECLARE_PARAMETER(LambUpdatePhaseTwoParam) {
+    DMLC_DECLARE_FIELD(lr)
+    .describe("Learning rate");
+    DMLC_DECLARE_FIELD(lower_bound)
+    .set_default(-1.0f)
+    .describe("Lower limit of norm of weight. If lower_bound <= 0, Lower limit is not set");
+    DMLC_DECLARE_FIELD(upper_bound)
+    .set_default(-1.0f)
+    .describe("Upper limit of norm of weight. If upper_bound <= 0, Upper limit is not set");
+  }
+};
+
+struct LambUpdatePhaseOneKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+    DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data,
+    const DType clip_gradient, const DType rescale_grad,
+    const DType beta1, const DType beta2, const DType wd,
+    const DType epsilon, const DType t,
+    bool bias_correction, const OpReqType req) {
+    using namespace mshadow_op;
+
+    DType grad_rescaled = grad_data[i] * rescale_grad;
+    if (clip_gradient >= 0.f) {
+      grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
+    }
+
+    mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled;
+    var_data[i] = beta2 * var_data[i] + (1.f - beta2) * grad_rescaled * grad_rescaled;
+
+    DType g = mean_data[i] / (square_root::Map(var_data[i]) + epsilon) + wd * weight_data[i];
+
+    if (bias_correction) {
+      DType mean_hat = mean_data[i] / (1. - power::Map(beta1, t));
+      DType var_hat = var_data[i] / (1 - power::Map(beta2, t));
+      g = mean_hat / (square_root::Map(var_hat) + epsilon) + wd * weight_data[i];
+    }
+    KERNEL_ASSIGN(out_data[i], req, g);
+  }
+};
+
+template<typename xpu>
+inline void LambUpdatePhaseOne(const nnvm::NodeAttrs& attrs,
+                               const OpContext &ctx,
+                               const std::vector<TBlob> &inputs,
+                               const std::vector<OpReqType> &req,
+                               const std::vector<TBlob> &outputs) {
+  using namespace mxnet_op;
+  const LambUpdatePhaseOneParam& param = nnvm::get<LambUpdatePhaseOneParam>(attrs.parsed);
+  Stream<xpu>* s = ctx.get_stream<xpu>();
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> mean = inputs[2].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+
+    Kernel<LambUpdatePhaseOneKernel, xpu>::Launch(s, weight.shape_.Size(),
+      out.dptr_, mean.dptr_, var.dptr_, weight.dptr_, grad.dptr_,
+      static_cast<DType>(param.clip_gradient), static_cast<DType>(param.rescale_grad),
+      static_cast<DType>(param.beta1), static_cast<DType>(param.beta2),
+      static_cast<DType>(param.wd), static_cast<DType>(param.epsilon),
+      static_cast<DType>(param.t), static_cast<bool>(param.bias_correction), req[0]);
+  });
+}
+
+inline bool LambUpdatePhaseTwoShape(const nnvm::NodeAttrs& attrs,
+                                    mxnet::ShapeVector* in_attrs,
+                                    mxnet::ShapeVector* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 4U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  mxnet::TShape expected_out(in_attrs->at(0).ndim(), -1);
+
+  
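Since the quoted hunk is dense, here is a minimal numpy sketch (illustrative only, not the C++ kernel itself) of the per-element update that LambUpdatePhaseOneKernel::Map above computes; the argument names mirror the kernel's.

    import numpy as np

    def lamb_phase_one(weight, grad, mean, var, beta1=0.9, beta2=0.999, epsilon=1e-6,
                       wd=0.0, t=1, bias_correction=False, rescale_grad=1.0, clip_gradient=-1.0):
        grad = grad * rescale_grad
        if clip_gradient >= 0:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        mean[:] = beta1 * mean + (1.0 - beta1) * grad        # moments are updated in place,
        var[:] = beta2 * var + (1.0 - beta2) * grad * grad   # like mean_data/var_data
        g = mean / (np.sqrt(var) + epsilon) + wd * weight
        if bias_correction:
            mean_hat = mean / (1.0 - beta1 ** t)
            var_hat = var / (1.0 - beta2 ** t)
            g = mean_hat / (np.sqrt(var_hat) + epsilon) + wd * weight
        return g                                             # the kernel writes this to out_data

    w = np.ones(4, dtype=np.float32)
    m, v = np.zeros_like(w), np.zeros_like(w)
    print(lamb_phase_one(w, 0.1 * np.ones_like(w), m, v, wd=0.01))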

[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-22 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r349744737
 
 

 ##
 File path: src/operator/optimizer_op-inl.h
 ##
 @@ -1563,6 +1563,192 @@ inline void AdamUpdateEx(const nnvm::NodeAttrs& attrs,
   }
 }
 
+struct LambUpdatePhaseOneParam : public dmlc::Parameter<LambUpdatePhaseOneParam> {
+  float beta1;
+  float beta2;
+  float epsilon;
+  float t;
+  bool bias_correction;
+  float wd;
+  float rescale_grad;
+  float clip_gradient;
+  DMLC_DECLARE_PARAMETER(LambUpdatePhaseOneParam) {
+    DMLC_DECLARE_FIELD(beta1)
+    .set_default(0.9f)
+    .describe("The decay rate for the 1st moment estimates.");
+    DMLC_DECLARE_FIELD(beta2)
+    .set_default(0.999f)
+    .describe("The decay rate for the 2nd moment estimates.");
+    DMLC_DECLARE_FIELD(epsilon)
+    .set_default(1e-6f)
+    .describe("A small constant for numerical stability.");
+    DMLC_DECLARE_FIELD(t)
+    .describe("Index update count.");
+    DMLC_DECLARE_FIELD(bias_correction)
+    .set_default(false)
+    .describe("Whether to use bias correction.");
+    DMLC_DECLARE_FIELD(wd)
+    .describe("Weight decay augments the objective function with a "
+              "regularization term that penalizes large weights. "
+              "The penalty scales with the square of the magnitude of each weight.");
+    DMLC_DECLARE_FIELD(rescale_grad)
+    .set_default(1.0f)
+    .describe("Rescale gradient to grad = rescale_grad*grad.");
+    DMLC_DECLARE_FIELD(clip_gradient)
+    .set_default(-1.0f)
+    .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
+              "If clip_gradient <= 0, gradient clipping is turned off. "
+              "grad = max(min(grad, clip_gradient), -clip_gradient).");
+  }
+};
+
+struct LambUpdatePhaseTwoParam : public dmlc::Parameter<LambUpdatePhaseTwoParam> {
+  float lr;
+  float lower_bound;
+  float upper_bound;
+  DMLC_DECLARE_PARAMETER(LambUpdatePhaseTwoParam) {
+    DMLC_DECLARE_FIELD(lr)
+    .describe("Learning rate");
+    DMLC_DECLARE_FIELD(lower_bound)
+    .set_default(-1.0f)
+    .describe("Lower limit of norm of weight. If lower_bound <= 0, Lower limit is not set");
+    DMLC_DECLARE_FIELD(upper_bound)
+    .set_default(-1.0f)
+    .describe("Upper limit of norm of weight. If upper_bound <= 0, Upper limit is not set");
+  }
+};
+
+struct LambUpdatePhaseOneKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+    DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data,
+    const DType clip_gradient, const DType rescale_grad,
+    const DType beta1, const DType beta2, const DType wd,
+    const DType epsilon, const DType t,
+    bool bias_correction, const OpReqType req) {
+    using namespace mshadow_op;
+
+    DType grad_rescaled = grad_data[i] * rescale_grad;
+    if (clip_gradient >= 0.f) {
+      grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
+    }
+
+    mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled;
+    var_data[i] = beta2 * var_data[i] + (1.f - beta2) * grad_rescaled * grad_rescaled;
+
+    DType g = mean_data[i] / (square_root::Map(var_data[i]) + epsilon) + wd * weight_data[i];
+
+    if (bias_correction) {
+      DType mean_hat = mean_data[i] / (1. - power::Map(beta1, t));
+      DType var_hat = var_data[i] / (1 - power::Map(beta2, t));
+      g = mean_hat / (square_root::Map(var_hat) + epsilon) + wd * weight_data[i];
+    }
+    KERNEL_ASSIGN(out_data[i], req, g);
+  }
+};
+
+template<typename xpu>
+inline void LambUpdatePhaseOne(const nnvm::NodeAttrs& attrs,
+                               const OpContext &ctx,
+                               const std::vector<TBlob> &inputs,
+                               const std::vector<OpReqType> &req,
+                               const std::vector<TBlob> &outputs) {
+  using namespace mxnet_op;
+  const LambUpdatePhaseOneParam& param = nnvm::get<LambUpdatePhaseOneParam>(attrs.parsed);
+  Stream<xpu>* s = ctx.get_stream<xpu>();
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> mean = inputs[2].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
 
 Review comment:
   @samskalicky These aren't data copies. `FlatTo2D` reshapes the input data to 2D and returns a tensor that points to the same underlying storage (https://github.com/apache/incubator-mxnet/blob/master/include/mxnet/tensor_blob.h#L210). The gradients, weights, mean, and variance generally have more than two dimensions. For examples of how other optimizers are tested, see https://github.com/apache/incubator-mxnet/blob/master/tests/python/unittest/test_optimizer.py
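As a rough analogy (numpy rather than the mshadow API, illustrative only), flattening to 2D can be a view over the same buffer rather than a copy:

    import numpy as np

    w = np.arange(24, dtype=np.float32).reshape(2, 3, 4)   # e.g. a weight with ndim > 2
    w2d = w.reshape(w.shape[0], -1)                        # "flatten to 2D"
    w2d[0, 0] = 42.0
    print(w[0, 0, 0])                                      # 42.0 -> w2d aliases w's storage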



[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346107803
 
 

 ##
 File path: src/operator/optimizer_op.cc
 ##
 @@ -921,5 +923,39 @@ Note that non-zero values for the weight decay option are not supported.
 .add_argument("history", "NDArray-or-Symbol", "History")
 .add_arguments(AdagradParam::__FIELDS__());
 
+NNVM_REGISTER_OP(lamb_update)
+.describe(R"code(Update function for lamb optimizer.
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<LAMBParam>)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<4, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<FCompute>("FCompute", LAMBUpdate<cpu>)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    return std::vector<uint32_t>{2, 3};
+  })
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("mean", "NDArray-or-Symbol", "Moving mean")
+.add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_arguments(LAMBParam::__FIELDS__());
+
+NNVM_REGISTER_OP(lamb_weight_update)
+.describe(R"code(Update function for lamb optimizer.
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<LAMBWeightParam>)
+.set_attr<mxnet::FInferShape>("FInferShape", LambWeightShape)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<FCompute>("FCompute", LAMBWeightUpdate<cpu>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("g", "NDArray-or-Symbol", "g")
 
 Review comment:
   Can you suggest a link, or what should I write here? I have followed this: https://towardsdatascience.com/an-intuitive-understanding-of-the-lamb-optimizer-46f8c0ae4866. I am not sure how I should describe `g`.
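In case it helps the wording, a small numpy sketch (based on the PyLAMB reference implementation elsewhere in this PR; the function name here is illustrative, not the op API) of what `g` is: the output of `lamb_update`, i.e. the Adam-style update direction that `lamb_weight_update` then scales by the trust ratio before applying it to the weight.

    import numpy as np

    def lamb_weight_step(weight, g, lr=0.001):
        r1 = np.linalg.norm(weight)                    # norm of the layer's weights
        r2 = np.linalg.norm(g)                         # norm of the update direction g
        trust_ratio = 1.0 if r1 == 0.0 or r2 == 0.0 else r1 / r2
        return weight - lr * trust_ratio * g

    print(lamb_weight_step(np.ones(4, dtype=np.float32), 0.1 * np.ones(4, dtype=np.float32)))

So one possible one-line description would be along the lines of "update direction produced by lamb_update, before the trust-ratio scaling".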




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346107263
 
 

 ##
 File path: src/operator/optimizer_op-inl.h
 ##
 @@ -1563,6 +1563,192 @@ inline void AdamUpdateEx(const nnvm::NodeAttrs& attrs,
   }
 }
 
+struct LAMBParam : public dmlc::Parameter<LAMBParam> {
+  float beta1;
+  float beta2;
+  float epsilon;
+  float t;
+  bool bias_correction;
+  float wd;
+  float rescale_grad;
+  float clip_gradient;
+  DMLC_DECLARE_PARAMETER(LAMBParam) {
+    DMLC_DECLARE_FIELD(beta1)
+    .set_default(0.9f)
+    .describe("The decay rate for the 1st moment estimates.");
+    DMLC_DECLARE_FIELD(beta2)
+    .set_default(0.999f)
+    .describe("The decay rate for the 2nd moment estimates.");
+    DMLC_DECLARE_FIELD(epsilon)
+    .set_default(1e-6f)
+    .describe("A small constant for numerical stability.");
+    DMLC_DECLARE_FIELD(t)
+    .describe("Index update count.");
+    DMLC_DECLARE_FIELD(bias_correction)
+    .set_default(false)
+    .describe("Whether to use bias correction.");
+    DMLC_DECLARE_FIELD(wd)
+    .describe("Weight decay augments the objective function with a "
+              "regularization term that penalizes large weights. "
+              "The penalty scales with the square of the magnitude of each weight.");
+    DMLC_DECLARE_FIELD(rescale_grad)
+    .set_default(1.0f)
+    .describe("Rescale gradient to grad = rescale_grad*grad.");
+    DMLC_DECLARE_FIELD(clip_gradient)
+    .set_default(-1.0f)
+    .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
+              "If clip_gradient <= 0, gradient clipping is turned off. "
+              "grad = max(min(grad, clip_gradient), -clip_gradient).");
+  }
+};
+
+struct LAMBWeightParam : public dmlc::Parameter<LAMBWeightParam> {
+  float lr;
+  float lower_bound;
+  float upper_bound;
+  DMLC_DECLARE_PARAMETER(LAMBWeightParam) {
+    DMLC_DECLARE_FIELD(lr)
+    .describe("Learning rate");
+    DMLC_DECLARE_FIELD(lower_bound)
 
 Review comment:
   done
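For readers of the thread, a plain-Python illustration (not the operator code) of the behaviour the .describe() strings above specify: a non-positive lower_bound/upper_bound (the -1.0 defaults) simply means the corresponding limit on the weight norm is not applied.

    def clamp_weight_norm(r1, lower_bound=-1.0, upper_bound=-1.0):
        if lower_bound > 0:
            r1 = max(r1, lower_bound)
        if upper_bound > 0:
            r1 = min(r1, upper_bound)
        return r1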




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346107327
 
 

 ##
 File path: tests/python/unittest/test_optimizer.py
 ##
 @@ -425,6 +425,77 @@ def test_nag():
 continue
 compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4)
 
+
+# LAMB optimizer
+class PyLAMB(mx.optimizer.Optimizer):
+"""
+   Python reference implementation of LAMB optimizer.
+"""
+def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, 
epsilon=1e-6,
+ lower_bound=1e-3, upper_bound=10.0, bias_correction=False, 
**kwargs):
+super(PyLAMB, self).__init__(learning_rate=learning_rate, **kwargs)
+self.beta1 = beta1
+self.beta2 = beta2
+self.epsilon = epsilon
+self.lower_bound = lower_bound
+self.upper_bound = upper_bound
+self.bias_correction = bias_correction
+
+def create_state(self, index, weight):
+stype = weight.stype
+return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, 
stype=stype),
+mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, 
stype=stype))
+
+def update(self, index, weight, grad, state):
+self._update_count(index)
+lr = self._get_lr(index)
+wd = self._get_wd(index)
+t = self._index_update_count[index]
+
+grad *= self.rescale_grad
+if self.clip_gradient is not None:
+grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+mean, var = state
+mean[:] = self.beta1 * mean + (1. - self.beta1) * grad
+var[:] = self.beta2 * var + (1. - self.beta2) * mx.nd.square(grad)
+
+
+r1 = weight.norm()
+if not self.bias_correction:
+r1 = mx.nd.minimum(mx.nd.maximum(r1, self.lower_bound), 
self.upper_bound)
+g = mean / (mx.nd.sqrt(var) + self.epsilon) + wd * weight
+
+else:
+mean_hat = mean / (1. - mx.nd.power(self.beta1, t))
+var_hat = var / (1. - mx.nd.power(self.beta2, t))
+g = mean_hat / mx.nd.sqrt(var_hat + self.epsilon) + wd * weight
+
+r2 = g.norm()
+
+# calculate lamb_trust_ratio
+r = 1. if r1 == 0. or r2 == 0. else r1 / r2
+lr *= r
+# update weight
+weight[:] -= lr * g
+
+def update_multi_precision(self, index, weight, grad, state):
+self.update(index, weight, grad, state)
+
+@with_seed()
+def test_lamb():
+opt1 = PyLAMB
+opt2 = mx.optimizer.LAMB
+shape = (3, 4, 5)
+cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+bc_options = [{}, {'bias_correction': False}, {'bias_correction': True}]
 
 Review comment:
   done
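For anyone reading the archived thread: the loop that consumes these option lists is cut off in the quoted hunk, so the following is only a sketch of a typical way such per-option dicts get combined in this test file (opt1, opt2, shape, the option lists, and compare_optimizer all come from the quoted test code).

    import itertools
    import numpy as np

    for cg, rg, wd, bc in itertools.product(cg_options, rg_options, wd_options, bc_options):
        kwarg = {}
        kwarg.update(cg)
        kwarg.update(rg)
        kwarg.update(wd)
        kwarg.update(bc)
        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32)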




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346088876
 
 

 ##
 File path: src/operator/optimizer_op.cc
 ##
 @@ -921,5 +923,33 @@ Note that non-zero values for the weight decay option are not supported.
 .add_argument("history", "NDArray-or-Symbol", "History")
 .add_arguments(AdagradParam::__FIELDS__());
 
+NNVM_REGISTER_OP(lamb_update)
+.describe(R"code(Update function for lamb optimizer.
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<LAMBParam>)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<4, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<FCompute>("FCompute", LAMBUpdate<cpu>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("mean", "NDArray-or-Symbol", "Moving mean")
+.add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_arguments(LAMBParam::__FIELDS__());
+
+NNVM_REGISTER_OP(lamb_weight_update)
+.describe(R"code(Update function for lamb optimizer.
+)code" ADD_FILELINE)
 
 Review comment:
   done here: 
https://github.com/apache/incubator-mxnet/pull/16715/files#diff-abf63a5538d1f9e37ec5d95d7528d2d0R935




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346088727
 
 

 ##
 File path: python/mxnet/optimizer/optimizer.py
 ##
 @@ -1244,6 +1244,51 @@ def update(self, index, weight, grad, state):
 kwargs = {}
 sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs)
 
+
+@register
+class LAMB(Optimizer):
+"""LAMB Optimizer.
 
 Review comment:
   working on it now




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346088445
 
 

 ##
 File path: tests/python/unittest/test_optimizer.py
 ##
 @@ -425,6 +425,77 @@ def test_nag():
 continue
 compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4)
 
+
+# LAMB optimizer
+class PyLAMB(mx.optimizer.Optimizer):
+"""
+   Python reference implementation of LAMB optimizer.
+"""
+def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, 
epsilon=1e-6,
+ lower_bound=1e-3, upper_bound=10.0, bias_correction=False, 
**kwargs):
+super(PyLAMB, self).__init__(learning_rate=learning_rate, **kwargs)
+self.beta1 = beta1
+self.beta2 = beta2
+self.epsilon = epsilon
+self.lower_bound = lower_bound
+self.upper_bound = upper_bound
+self.bias_correction = bias_correction
+
+def create_state(self, index, weight):
+stype = weight.stype
+return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, 
stype=stype),
+mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, 
stype=stype))
+
+def update(self, index, weight, grad, state):
+self._update_count(index)
+lr = self._get_lr(index)
+wd = self._get_wd(index)
+t = self._index_update_count[index]
+
+grad *= self.rescale_grad
+if self.clip_gradient is not None:
+grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+mean, var = state
+mean[:] = self.beta1 * mean + (1. - self.beta1) * grad
+var[:] = self.beta2 * var + (1. - self.beta2) * mx.nd.square(grad)
+
+
+r1 = weight.norm()
+if not self.bias_correction:
+r1 = mx.nd.minimum(mx.nd.maximum(r1, self.lower_bound), 
self.upper_bound)
+g = mean / (mx.nd.sqrt(var) + self.epsilon) + wd * weight
+
+else:
+mean_hat = mean / (1. - mx.nd.power(self.beta1, t))
+var_hat = var / (1. - mx.nd.power(self.beta2, t))
+g = mean_hat / mx.nd.sqrt(var_hat + self.epsilon) + wd * weight
+
+r2 = g.norm()
+
+# calculate lamb_trust_ratio
+r = 1. if r1 == 0. or r2 == 0. else r1 / r2
+lr *= r
+# update weight
+weight[:] -= lr * g
+
+def update_multi_precision(self, index, weight, grad, state):
+self.update(index, weight, grad, state)
+
+@with_seed()
+def test_lamb():
+opt1 = PyLAMB
+opt2 = mx.optimizer.LAMB
+shape = (3, 4, 5)
+cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+bc_options = [{}, {'bias_correction': False}, {'bias_correction': True}]
 
 Review comment:
   done




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346088325
 
 

 ##
 File path: src/operator/optimizer_op.cc
 ##
 @@ -921,5 +923,33 @@ Note that non-zero values for the weight decay option are not supported.
 .add_argument("history", "NDArray-or-Symbol", "History")
 .add_arguments(AdagradParam::__FIELDS__());
 
+NNVM_REGISTER_OP(lamb_update)
+.describe(R"code(Update function for lamb optimizer.
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<LAMBParam>)
+.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<4, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<FCompute>("FCompute", LAMBUpdate<cpu>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("mean", "NDArray-or-Symbol", "Moving mean")
+.add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_arguments(LAMBParam::__FIELDS__());
+
+NNVM_REGISTER_OP(lamb_weight_update)
+.describe(R"code(Update function for lamb optimizer.
+)code" ADD_FILELINE)
 
 Review comment:
   Done 




[GitHub] [incubator-mxnet] access2rohit commented on a change in pull request #16715: Lamb optimizer update

2019-11-13 Thread GitBox
access2rohit commented on a change in pull request #16715: Lamb optimizer update
URL: https://github.com/apache/incubator-mxnet/pull/16715#discussion_r346088464
 
 

 ##
 File path: tests/python/unittest/test_optimizer.py
 ##
 @@ -425,6 +425,77 @@ def test_nag():
 continue
 compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4)
 
+
+# LAMB optimizer
+class PyLAMB(mx.optimizer.Optimizer):
+"""
+   Python reference implementation of LAMB optimizer.
+"""
+def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, 
epsilon=1e-6,
+ lower_bound=1e-3, upper_bound=10.0, bias_correction=False, 
**kwargs):
+super(PyLAMB, self).__init__(learning_rate=learning_rate, **kwargs)
+self.beta1 = beta1
+self.beta2 = beta2
+self.epsilon = epsilon
+self.lower_bound = lower_bound
+self.upper_bound = upper_bound
+self.bias_correction = bias_correction
+
+def create_state(self, index, weight):
+stype = weight.stype
+return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, 
stype=stype),
+mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, 
stype=stype))
+
+def update(self, index, weight, grad, state):
+self._update_count(index)
+lr = self._get_lr(index)
+wd = self._get_wd(index)
+t = self._index_update_count[index]
+
+grad *= self.rescale_grad
+if self.clip_gradient is not None:
+grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+mean, var = state
+mean[:] = self.beta1 * mean + (1. - self.beta1) * grad
+var[:] = self.beta2 * var + (1. - self.beta2) * mx.nd.square(grad)
+
+
+r1 = weight.norm()
+if not self.bias_correction:
+r1 = mx.nd.minimum(mx.nd.maximum(r1, self.lower_bound), 
self.upper_bound)
+g = mean / (mx.nd.sqrt(var) + self.epsilon) + wd * weight
+
+else:
+mean_hat = mean / (1. - mx.nd.power(self.beta1, t))
+var_hat = var / (1. - mx.nd.power(self.beta2, t))
+g = mean_hat / mx.nd.sqrt(var_hat + self.epsilon) + wd * weight
+
 
 Review comment:
   done

