Expose amsgrad argument for Adam, but error out if it is set to True.

PiperOrigin-RevId: 222424267
Zhenyu Tan 2018-11-21 10:03:25 -08:00 committed by TensorFlower Gardener
parent 1799b11347
commit c59719cf1f
3 changed files with 56 additions and 13 deletions


@@ -35,9 +35,13 @@ class Adam(optimizer_v2.OptimizerV2):
requirement, invariant to diagonal rescaling of gradients, and is well suited
for problems that are large in terms of data/parameters'.
Note: amsgrad is currently not supported and the argument can only be False.
# References
See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
For AMSGrad see [Reddi et al., 2018]
(https://openreview.net/pdf?id=ryQu7f-RZ)
"""
def __init__(self,
@@ -45,26 +49,47 @@ class Adam(optimizer_v2.OptimizerV2):
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
name='Adam'):
r"""Construct a new Adam optimizer.
If amsgrad = False:
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
If amsgrad = True:
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$v_hat_t := max(v_hat_{t-1}, v_t)$$
$$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
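As a concrete reading of the two update rules above, here is a minimal NumPy sketch of a single step; it is illustrative only (the function name adam_step and the use of plain arrays are assumptions for the example, not the fused TensorFlow kernel):

import numpy as np

def adam_step(var, g, m, v, v_hat, t, lr=0.001, beta_1=0.9, beta_2=0.999,
              epsilon=1e-7, amsgrad=False):
  # One update step following the equations in this docstring.
  t += 1
  lr_t = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  m = beta_1 * m + (1 - beta_1) * g
  v = beta_2 * v + (1 - beta_2) * g * g
  if amsgrad:
    # AMSGrad keeps the running maximum of the second-moment estimate.
    v_hat = np.maximum(v_hat, v)
    denom = np.sqrt(v_hat) + epsilon
  else:
    denom = np.sqrt(v) + epsilon
  var = var - lr_t * m / denom
  return var, m, v, v_hat, t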
The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
@@ -89,6 +114,8 @@ class Adam(optimizer_v2.OptimizerV2):
epsilon: A small constant for numerical stability. This epsilon is
"epsilon hat" in the Kingma and Ba paper (in the formula just before
Section 2.1), not the epsilon in Algorithm 1 of the paper.
amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and Beyond".
name: Optional name for the operations created when applying gradients.
Defaults to "Adam". @compatibility(eager) When eager execution is
enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
@@ -102,6 +129,10 @@ class Adam(optimizer_v2.OptimizerV2):
self._set_hyper('beta_1', beta_1)
self._set_hyper('beta_2', beta_2)
self._set_hyper('epsilon', epsilon)
# TODO(tanzheny): create op for resource_apply_adam_with_amsgrad
if amsgrad:
raise ValueError('Amsgrad is currently not supported.')
self._amsgrad = amsgrad
def _create_slots(self, var_list):
# Create slots for the first and second moments.
@@ -173,5 +204,6 @@ class Adam(optimizer_v2.OptimizerV2):
'beta_1': self._serialize_hyperparameter('beta_1'),
'beta_2': self._serialize_hyperparameter('beta_2'),
'epsilon': self._serialize_hyperparameter('epsilon'),
'amsgrad': self._amsgrad,
})
return config
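Taken together, the changes to __init__ and get_config mean the flag is now visible at construction and serialization time, while requesting AMSGrad still fails fast. A short usage sketch (the import path is assumed from the files touched by this commit):

from tensorflow.python.keras.optimizer_v2 import adam

opt = adam.Adam(learning_rate=0.001)  # amsgrad defaults to False
print(opt.get_config()['amsgrad'])    # False; the flag is now serialized

try:
  adam.Adam(learning_rate=0.001, amsgrad=True)
except ValueError as e:
  print(e)  # Amsgrad is currently not supported.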


@@ -298,6 +298,11 @@ class AdamOptimizerTest(test.TestCase):
# variables for v1 and v2 respectively.
self.assertEqual(9, len(set(opt.variables())))
def testAmsgradWithError(self):
with self.assertRaisesRegexp(ValueError,
"Amsgrad is currently not supported"):
adam.Adam(learning_rate=1., beta_1=0.9, beta_2=0.99, amsgrad=True)
if __name__ == "__main__":
test.main()


@@ -89,7 +89,13 @@ class Adamax(adam.Adam):
Defaults to "Adamax".
"""
# pylint: disable=useless-super-delegation
super(Adamax, self).__init__(
learning_rate=learning_rate,
beta_1=beta_1,
beta_2=beta_2,
epsilon=epsilon,
amsgrad=False,
name=name)
# pylint: enable=useless-super-delegation
def _resource_apply_dense(self, grad, var):
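The Adamax hunk above spells out the super call with explicit keyword arguments. One reason that matters: with amsgrad now sitting before name in the base __init__ signature, a positional call that forwards a name string would bind it to amsgrad and trip the new ValueError. A minimal standalone sketch of that failure mode (Base and Sub are hypothetical stand-ins, not TensorFlow classes):

class Base(object):
  def __init__(self, learning_rate, beta_1, beta_2, epsilon,
               amsgrad=False, name='Adam'):
    if amsgrad:
      raise ValueError('Amsgrad is currently not supported.')

class Sub(Base):
  def __init__(self, learning_rate, beta_1, beta_2, epsilon, name='Sub'):
    # Positional forwarding: the name string lands in the amsgrad slot;
    # a non-empty string is truthy, so the base constructor raises.
    super(Sub, self).__init__(learning_rate, beta_1, beta_2, epsilon, name)

Sub(0.001, 0.9, 0.999, 1e-7)  # raises ValueError: Amsgrad is currently not supported.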