Expose amsgrad argument for Adam, but error out if it is set to True.

PiperOrigin-RevId: 222424267
Zhenyu Tan 2018-11-21 10:03:25 -08:00 committed by TensorFlower Gardener
parent 1799b11347
commit c59719cf1f
3 changed files with 56 additions and 13 deletions

View File

@@ -35,9 +35,13 @@ class Adam(optimizer_v2.OptimizerV2):
requirement, invariant to diagonal rescaling of gradients, and is well suited
for problems that are large in terms of data/parameters'.
Note: amsgrad is currently not supported and the argument can only be False.
# References
See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
For AMSGrad see [Reddi et al., 2018]
(https://openreview.net/pdf?id=ryQu7f-RZ)
"""
def __init__(self,
@@ -45,26 +49,47 @@ class Adam(optimizer_v2.OptimizerV2):
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
name='Adam'):
r"""Construct a new Adam optimizer.
If amsgrad = False:
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
If amsgrad = True:
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$v_hat_t := \max(v_hat_{t-1}, v_t)$$
$$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
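For reference, a minimal NumPy sketch (not part of this commit) of the two update rules written above; the function and variable names mirror the formulas and are illustrative only:

```python
import numpy as np

def adam_step(var, g, m, v, v_hat, t, learning_rate=0.001,
              beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False):
  """One Adam/AMSGrad step, following the formulas in the docstring above."""
  t += 1
  lr_t = learning_rate * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  m = beta_1 * m + (1 - beta_1) * g
  v = beta_2 * v + (1 - beta_2) * g * g
  if amsgrad:
    v_hat = np.maximum(v_hat, v)  # running max of the 2nd moment estimate
    var = var - lr_t * m / (np.sqrt(v_hat) + epsilon)
  else:
    var = var - lr_t * m / (np.sqrt(v) + epsilon)
  return var, m, v, v_hat, t
```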
The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
@@ -89,6 +114,8 @@ class Adam(optimizer_v2.OptimizerV2):
epsilon: A small constant for numerical stability. This epsilon is
"epsilon hat" in the Kingma and Ba paper (in the formula just before
Section 2.1), not the epsilon in Algorithm 1 of the paper.
amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and Beyond".
name: Optional name for the operations created when applying gradients.
Defaults to "Adam". @compatibility(eager) When eager execution is
enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
@@ -102,6 +129,10 @@ class Adam(optimizer_v2.OptimizerV2):
self._set_hyper('beta_1', beta_1)
self._set_hyper('beta_2', beta_2)
self._set_hyper('epsilon', epsilon)
# TODO(tanzheny): create op for resource_apply_adam_with_amsgrad
if amsgrad:
  raise ValueError('Amsgrad is currently not supported.')
self._amsgrad = amsgrad

def _create_slots(self, var_list):
  # Create slots for the first and second moments.
@@ -173,5 +204,6 @@ class Adam(optimizer_v2.OptimizerV2):
'beta_1': self._serialize_hyperparameter('beta_1'),
'beta_2': self._serialize_hyperparameter('beta_2'),
'epsilon': self._serialize_hyperparameter('epsilon'),
'amsgrad': self._amsgrad,
})
return config
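As a quick illustration of the behavior this change introduces (a hypothetical usage sketch, not taken from the diff; the import path is assumed from the test file's use of `adam.Adam`):

```python
from tensorflow.python.keras.optimizer_v2 import adam  # assumed import path

# amsgrad=False is accepted and is now serialized with the rest of the config.
opt = adam.Adam(learning_rate=0.001, amsgrad=False)
print(opt.get_config()['amsgrad'])  # False

# amsgrad=True is rejected until the AMSGrad apply op exists (see the TODO above).
try:
  adam.Adam(learning_rate=0.001, amsgrad=True)
except ValueError as e:
  print(e)  # Amsgrad is currently not supported.
```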

View File

@@ -298,6 +298,11 @@ class AdamOptimizerTest(test.TestCase):
# variables for v1 and v2 respectively.
self.assertEqual(9, len(set(opt.variables())))
def testAmsgradWithError(self):
  with self.assertRaisesRegexp(ValueError,
                               "Amsgrad is currently not supported"):
    adam.Adam(learning_rate=1., beta_1=0.9, beta_2=0.99, amsgrad=True)

if __name__ == "__main__":
  test.main()

View File

@@ -89,7 +89,13 @@ class Adamax(adam.Adam):
Defaults to "Adamax".
"""
# pylint: disable=useless-super-delegation
super(Adamax, self).__init__(learning_rate, beta_1, beta_2, epsilon, name)
super(Adamax, self).__init__(
    learning_rate=learning_rate,
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
    amsgrad=False,
    name=name)
# pylint: enable=useless-super-delegation
def _resource_apply_dense(self, grad, var):