diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 962680fad68..fd5918dbfad 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -35,9 +35,13 @@ class Adam(optimizer_v2.OptimizerV2):
   requirement, invariant to diagonal rescaling of gradients, and is well suited
   for problems that are large in terms of data/parameters'.
 
+  Note: amsgrad is currently not supported and the argument can only be False.
+
   # References
     See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
     ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+    For AMSGrad see [Reddi et al., 2018]
+    (https://openreview.net/pdf?id=ryQu7f-RZ)
   """
 
   def __init__(self,
@@ -45,26 +49,47 @@ class Adam(optimizer_v2.OptimizerV2):
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-7,
+               amsgrad=False,
                name='Adam'):
     r"""Construct a new Adam optimizer.
 
-    Initialization:
+    If amsgrad = False:
+      Initialization:
 
-    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
-    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
-    $$t := 0 \text{(Initialize timestep)}$$
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
 
-    The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section2 of the paper:
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section 2 of the paper:
 
-    $$t := t + 1$$
-    $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
 
-    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
-    The default value of 1e-8 for epsilon might not be a good default in
+    If amsgrad = True:
+      Initialization:
+
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
+
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section 2 of the paper:
+
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$v_hat_t := max(v_hat_{t-1}, v_t)$$
+      $$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
+
+    The default value of 1e-7 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
     current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
     formulation just before Section 2.1 of the Kingma and Ba paper rather than
@@ -89,6 +114,8 @@ class Adam(optimizer_v2.OptimizerV2):
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
+        from the paper "On the Convergence of Adam and Beyond".
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam". @compatibility(eager) When eager execution is
         enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
@@ -102,6 +129,10 @@ class Adam(optimizer_v2.OptimizerV2):
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self._set_hyper('epsilon', epsilon)
+    # TODO(tanzheny): create op for resource_apply_adam_with_amsgrad
+    if amsgrad:
+      raise ValueError('Amsgrad is currently not supported.')
+    self._amsgrad = amsgrad
 
   def _create_slots(self, var_list):
     # Create slots for the first and second moments.
@@ -173,5 +204,6 @@ class Adam(optimizer_v2.OptimizerV2):
         'beta_1': self._serialize_hyperparameter('beta_1'),
         'beta_2': self._serialize_hyperparameter('beta_2'),
         'epsilon': self._serialize_hyperparameter('epsilon'),
+        'amsgrad': self._amsgrad,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
index 46a45af224b..20780ead9cd 100644
--- a/tensorflow/python/keras/optimizer_v2/adam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -298,6 +298,11 @@ class AdamOptimizerTest(test.TestCase):
       # variables for v1 and v2 respectively.
       self.assertEqual(9, len(set(opt.variables())))
 
+  def testAmsgradWithError(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Amsgrad is currently not supported"):
+      adam.Adam(learning_rate=1., beta_1=0.9, beta_2=0.99, amsgrad=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
index 7530e629d16..67b678f8628 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -89,7 +89,13 @@ class Adamax(adam.Adam):
         Defaults to "Adamax".
     """
     # pylint: disable=useless-super-delegation
-    super(Adamax, self).__init__(learning_rate, beta_1, beta_2, epsilon, name)
+    super(Adamax, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon,
+        amsgrad=False,
+        name=name)
     # pylint: enable=useless-super-delegation
 
   def _resource_apply_dense(self, grad, var):
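For reference, the update rule documented in the new docstring can be sketched in plain Python/NumPy. The sketch below is purely illustrative: the function name `amsgrad_step` and its signature do not exist in TensorFlow, and it assumes all state values are dense NumPy arrays rather than resource variables.

```python
# Illustrative only -- not part of this patch and not a TensorFlow API.
# A minimal NumPy sketch of one update step following the docstring formulas.
import numpy as np


def amsgrad_step(var, g, m, v, v_hat, t,
                 learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-7, amsgrad=True):
  """Returns updated (var, m, v, v_hat, t) for a single gradient `g`."""
  t += 1
  lr_t = learning_rate * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  m = beta_1 * m + (1 - beta_1) * g          # 1st moment estimate
  v = beta_2 * v + (1 - beta_2) * g * g      # 2nd moment estimate
  if amsgrad:
    v_hat = np.maximum(v_hat, v)             # AMSGrad: running max of v_t
    denom = np.sqrt(v_hat) + epsilon
  else:
    denom = np.sqrt(v) + epsilon
  var = var - lr_t * m / denom
  return var, m, v, v_hat, t
```

Note that, as of this change, constructing `adam.Adam(..., amsgrad=True)` raises `ValueError('Amsgrad is currently not supported.')`; the sketch only shows what the documented AMSGrad update would compute once the `resource_apply_adam_with_amsgrad` op mentioned in the TODO exists.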