Expose amsgrad argument for Adam, but error out if it is set to True.

PiperOrigin-RevId: 222424267
Zhenyu Tan 2018-11-21 10:03:25 -08:00 committed by TensorFlower Gardener
parent 1799b11347
commit c59719cf1f
3 changed files with 56 additions and 13 deletions

View File

@@ -35,9 +35,13 @@ class Adam(optimizer_v2.OptimizerV2):
requirement, invariant to diagonal rescaling of gradients, and is well suited
for problems that are large in terms of data/parameters'.
Note: amsgrad is currently not supported and the argument can only be False.
# References
See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
For AMSGrad see [Reddi et al., 2018]
(https://openreview.net/pdf?id=ryQu7f-RZ)
"""
def __init__(self,
@@ -45,26 +49,47 @@ class Adam(optimizer_v2.OptimizerV2):
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
name='Adam'):
r"""Construct a new Adam optimizer.
If amsgrad = False:
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
If amsgrad = True:
Initialization:
$$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
$$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$v_hat_t := \max(v_hat_{t-1}, v_t)$$
$$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
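For reference, a minimal NumPy sketch (not part of this commit) of the two update rules written above; the function and variable names mirror the formulas and are illustrative only:

```python
import numpy as np

def adam_step(var, g, m, v, v_hat, t, learning_rate=0.001,
              beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False):
  """One Adam/AMSGrad step, following the formulas in the docstring above."""
  t += 1
  lr_t = learning_rate * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  m = beta_1 * m + (1 - beta_1) * g
  v = beta_2 * v + (1 - beta_2) * g * g
  if amsgrad:
    v_hat = np.maximum(v_hat, v)  # running max of the 2nd moment estimate
    var = var - lr_t * m / (np.sqrt(v_hat) + epsilon)
  else:
    var = var - lr_t * m / (np.sqrt(v) + epsilon)
  return var, m, v, v_hat, t
```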
The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
@@ -89,6 +114,8 @@ class Adam(optimizer_v2.OptimizerV2):
epsilon: A small constant for numerical stability. This epsilon is
"epsilon hat" in the Kingma and Ba paper (in the formula just before
Section 2.1), not the epsilon in Algorithm 1 of the paper.
amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and Beyond".
name: Optional name for the operations created when applying gradients.
Defaults to "Adam". @compatibility(eager) When eager execution is
enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
@@ -102,6 +129,10 @@ class Adam(optimizer_v2.OptimizerV2):
self._set_hyper('beta_1', beta_1)
self._set_hyper('beta_2', beta_2)
self._set_hyper('epsilon', epsilon)
# TODO(tanzheny): create op for resource_apply_adam_with_amsgrad
if amsgrad:
  raise ValueError('Amsgrad is currently not supported.')
self._amsgrad = amsgrad

def _create_slots(self, var_list):
  # Create slots for the first and second moments.
@@ -173,5 +204,6 @@ class Adam(optimizer_v2.OptimizerV2):
'beta_1': self._serialize_hyperparameter('beta_1'),
'beta_2': self._serialize_hyperparameter('beta_2'),
'epsilon': self._serialize_hyperparameter('epsilon'),
'amsgrad': self._amsgrad,
})
return config
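As a quick illustration of the behavior this change introduces (a hypothetical usage sketch, not taken from the diff; the import path is assumed from the test file's use of `adam.Adam`):

```python
from tensorflow.python.keras.optimizer_v2 import adam  # assumed import path

# amsgrad=False is accepted and is now serialized with the rest of the config.
opt = adam.Adam(learning_rate=0.001, amsgrad=False)
print(opt.get_config()['amsgrad'])  # False

# amsgrad=True is rejected until the AMSGrad apply op exists (see the TODO above).
try:
  adam.Adam(learning_rate=0.001, amsgrad=True)
except ValueError as e:
  print(e)  # Amsgrad is currently not supported.
```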

View File

@@ -298,6 +298,11 @@ class AdamOptimizerTest(test.TestCase):
# variables for v1 and v2 respectively.
self.assertEqual(9, len(set(opt.variables())))
def testAmsgradWithError(self):
  with self.assertRaisesRegexp(ValueError,
                               "Amsgrad is currently not supported"):
    adam.Adam(learning_rate=1., beta_1=0.9, beta_2=0.99, amsgrad=True)

if __name__ == "__main__":
  test.main()

View File

@@ -89,7 +89,13 @@ class Adamax(adam.Adam):
Defaults to "Adamax".
"""
# pylint: disable=useless-super-delegation
super(Adamax, self).__init__(learning_rate, beta_1, beta_2, epsilon, name)
super(Adamax, self).__init__(
    learning_rate=learning_rate,
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
    amsgrad=False,
    name=name)
# pylint: enable=useless-super-delegation
def _resource_apply_dense(self, grad, var):