Expose amsgrad argument for Adam, but error out if it is set to True.
PiperOrigin-RevId: 222424267
parent 1799b11347
commit c59719cf1f
@@ -35,9 +35,13 @@ class Adam(optimizer_v2.OptimizerV2):
   requirement, invariant to diagonal rescaling of gradients, and is well suited
   for problems that are large in terms of data/parameters'.
 
+  Note, amsgrad is currently not supported and the argument can only be False.
+
   # References
     See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
     ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+    For AMSGrad see [Reddi et al., 2018]
+    (https://openreview.net/pdf?id=ryQu7f-RZ)
   """
 
   def __init__(self,
@@ -45,26 +49,47 @@ class Adam(optimizer_v2.OptimizerV2):
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-7,
+               amsgrad=False,
                name='Adam'):
     r"""Construct a new Adam optimizer.
 
-    Initialization:
+    If amsgrad = False:
+      Initialization:
 
     $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
     $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
     $$t := 0 \text{(Initialize timestep)}$$
 
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section 2 of the paper:
 
     $$t := t + 1$$
     $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
 
     $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
     $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
     $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
-    The default value of 1e-8 for epsilon might not be a good default in
+    If amsgrad = True:
+      Initialization:
+
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
+
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section 2 of the paper:
+
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$v_hat_t := max(v_hat_{t-1}, v_t)$$
+      $$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
+
+    The default value of 1e-7 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
     current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
     formulation just before Section 2.1 of the Kingma and Ba paper rather than
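As a reading aid for the AMSGrad update rules documented above (not part of the change itself): a minimal NumPy sketch of one update step. The names `amsgrad_step`, `m`, `v`, and `v_hat` are chosen here for illustration only; this is not the TensorFlow op the later TODO refers to.

import numpy as np

def amsgrad_step(var, grad, m, v, v_hat, t,
                 learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
  """One AMSGrad update, following the docstring formulas above."""
  t += 1
  lr_t = learning_rate * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  m = beta_1 * m + (1 - beta_1) * grad          # 1st moment estimate
  v = beta_2 * v + (1 - beta_2) * grad * grad   # 2nd moment estimate
  v_hat = np.maximum(v_hat, v)                  # AMSGrad: keep the running max
  var = var - lr_t * m / (np.sqrt(v_hat) + epsilon)
  return var, m, v, v_hat, t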
@@ -89,6 +114,8 @@ class Adam(optimizer_v2.OptimizerV2):
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
+        the paper "On the Convergence of Adam and beyond".
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam". @compatibility(eager) When eager execution is
         enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
@@ -102,6 +129,10 @@ class Adam(optimizer_v2.OptimizerV2):
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self._set_hyper('epsilon', epsilon)
+    # TODO(tanzheny): create op for resource_apply_adam_with_amsgrad
+    if amsgrad:
+      raise ValueError('Amsgrad is currently not supported.')
+    self._amsgrad = amsgrad
 
   def _create_slots(self, var_list):
     # Create slots for the first and second moments.
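In practice the new guard means the argument is accepted but only with its default value. A small usage sketch, assuming the `adam` module import used in the test file below (the import path is an assumption, not part of the diff):

from tensorflow.python.keras.optimizer_v2 import adam  # assumed module path

opt = adam.Adam(learning_rate=0.001, amsgrad=False)  # constructs as before

try:
  adam.Adam(learning_rate=0.001, amsgrad=True)
except ValueError as err:
  print(err)  # Amsgrad is currently not supported.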
@@ -173,5 +204,6 @@ class Adam(optimizer_v2.OptimizerV2):
         'beta_1': self._serialize_hyperparameter('beta_1'),
         'beta_2': self._serialize_hyperparameter('beta_2'),
         'epsilon': self._serialize_hyperparameter('epsilon'),
+        'amsgrad': self._amsgrad,
     })
     return config
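With `amsgrad` added to `get_config`, the flag survives serialization. A hedged round-trip sketch, assuming the same `adam` module path as above and that `OptimizerV2` exposes the usual Keras `from_config` classmethod:

from tensorflow.python.keras.optimizer_v2 import adam  # assumed module path

opt = adam.Adam(learning_rate=0.001, amsgrad=False)
config = opt.get_config()
assert config['amsgrad'] is False

# Rebuild an equivalent optimizer from the serialized config
# (from_config is assumed to be available on OptimizerV2).
restored = adam.Adam.from_config(config)
assert restored.get_config()['amsgrad'] is False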
@@ -298,6 +298,11 @@ class AdamOptimizerTest(test.TestCase):
       # variables for v1 and v2 respectively.
       self.assertEqual(9, len(set(opt.variables())))
 
+  def testAmsgradWithError(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Amsgrad is currently not supported"):
+      adam.Adam(learning_rate=1., beta_1=0.9, beta_2=0.99, amsgrad=True)
+
 
 if __name__ == "__main__":
   test.main()
@@ -89,7 +89,13 @@ class Adamax(adam.Adam):
         Defaults to "Adamax".
     """
     # pylint: disable=useless-super-delegation
-    super(Adamax, self).__init__(learning_rate, beta_1, beta_2, epsilon, name)
+    super(Adamax, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon,
+        amsgrad=False,
+        name=name)
     # pylint: enable=useless-super-delegation
 
   def _resource_apply_dense(self, grad, var):