Clean up optimizer docstrings.

PiperOrigin-RevId: 304544799
Change-Id: Ic83206c54cbf8437a4d6ff693f139412b5bdcee8
Francois Chollet 2020-04-02 22:15:23 -07:00 committed by TensorFlower Gardener
parent 8a370a0077
commit f6302e4ec7
9 changed files with 251 additions and 377 deletions

tensorflow/python/keras/optimizer_v2/adadelta.py

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adadelta for TensorFlow."""
+"""Adadelta optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -34,23 +34,9 @@ class Adadelta(optimizer_v2.OptimizerV2):
   Adadelta optimization is a stochastic gradient descent method that is based on
   adaptive learning rate per dimension to address two drawbacks:

-  1) the continual decay of learning rates throughout training
-  2) the need for a manually selected global learning rate
-
-  Two accumulation steps are required:
-  1) the accumulation of gradients squared,
-  2) the accumulation of updates squared.
-
-  Initialization:
-  $$E[g^2]_0 := 0 \text{ (initialize gradient 2nd order moment vector)}$$
-  $$E[\Delta x^2]_0 := 0 \text{ (initialize 2nd order variable update)}$$
-  $$t := t + 1$$
-  $$E[g^2]_t := \rho * E[g^2]_{t-1} + (1 - \rho) * g^2$$
-  $$\Delta x_t = -RMS[\Delta x]_{t-1} * g_t / RMS[g]_t$$
-  $$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$
-  $$x_t := x_{t-1} + \Delta x_t$$
+  - The continual decay of learning rates throughout training
+  - The need for a manually selected global learning rate

   Adadelta is a more robust extension of Adagrad that adapts learning rates
   based on a moving window of gradient updates, instead of accumulating all
@@ -59,16 +45,22 @@ class Adadelta(optimizer_v2.OptimizerV2):
   don't have to set an initial learning rate. In this version, initial
   learning rate can be set, as in most other Keras optimizers.

-  @compatibility(eager)
-  When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
-  each be a callable that takes no arguments and returns the actual value to
-  use. This can be useful for changing these values across different
-  invocations of optimizer functions.
-  @end_compatibility
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+      To match the exact form in the original paper, use 1.0.
+    rho: A `Tensor` or a floating point value. The decay rate.
+    epsilon: A `Tensor` or a floating point value. A constant epsilon used
+      to better condition the grad update.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"Adadelta"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.

-  References
-    See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
-    ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+  Reference:
+    - [Zeiler, 2012](http://arxiv.org/abs/1212.5701)
   """

   _HAS_AGGREGATE_GRAD = True
@@ -79,23 +71,6 @@ class Adadelta(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Adadelta',
                **kwargs):
-    """Construct a new Adadelta optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-        To match the exact form in the original paper use 1.0.
-      rho: A `Tensor` or a floating point value. The decay rate.
-      epsilon: A `Tensor` or a floating point value. A constant epsilon used
-        to better conditioning the grad update.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "Adadelta".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(Adadelta, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
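Since the per-step math now lives only in prose, here is a minimal NumPy sketch of the Adadelta update that the removed LaTeX block described (Zeiler, 2012). The helper name and state tuple are illustrative, not part of the TensorFlow API, and the fused kernel the optimizer actually runs may differ in details:

```python
import numpy as np

def adadelta_step(w, g, state, rho=0.95, epsilon=1e-7):
    """One Adadelta update for parameter `w` with gradient `g`."""
    eg2, edx2 = state                                 # running E[g^2], E[dx^2]
    eg2 = rho * eg2 + (1.0 - rho) * g ** 2            # accumulate squared grads
    delta = -np.sqrt(edx2 + epsilon) / np.sqrt(eg2 + epsilon) * g
    edx2 = rho * edx2 + (1.0 - rho) * delta ** 2      # accumulate squared updates
    return w + delta, (eg2, edx2)
```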

tensorflow/python/keras/optimizer_v2/adagrad.py

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adagrad for TensorFlow."""
+"""Adagrad optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -39,26 +39,22 @@ class Adagrad(optimizer_v2.OptimizerV2):
   updated during training. The more updates a parameter receives,
   the smaller the updates.

-  Initialization:
-  $$accum_{g_0} := \text{initial\_accumulator\_value}$$
-
-  Update step:
-  $$t := t + 1$$
-  $$accum_{g_t} := accum_{g_{t-1}} + g^2$$
-  $$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$
-
-  @compatibility(eager)
-  When eager execution is enabled, `learning_rate` can be a callable that
-  takes no arguments and returns the actual value to use. This can be useful
-  for changing these values across different invocations of optimizer
-  functions.
-  @end_compatibility
-
-  References:
-    * [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
-    * [Introduction](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+    initial_accumulator_value: A floating point value. Starting value for the
+      accumulators; must be non-negative.
+    epsilon: A small floating point value to avoid a zero denominator.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"Adagrad"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.

+  Reference:
+    - [Duchi et al., 2011](
+      http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
   """

   _HAS_AGGREGATE_GRAD = True
@@ -69,25 +65,6 @@ class Adagrad(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Adagrad',
                **kwargs):
-    """Construct a new Adagrad optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-      initial_accumulator_value: A floating point value.
-        Starting value for the accumulators, must be non-negative.
-      epsilon: A small floating point value to avoid zero denominator.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "Adagrad".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-
-    Raises:
-      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
-    """
     if initial_accumulator_value < 0.0:
       raise ValueError('initial_accumulator_value must be non-negative: %s' %
                        initial_accumulator_value)
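The deleted update-step equations for Adagrad reduce to two lines. A hedged NumPy sketch follows (hypothetical helper, not the TF kernel, which may fold `epsilon` in differently):

```python
import numpy as np

def adagrad_step(w, g, accum, lr=0.001, epsilon=1e-7):
    """One Adagrad update: step sizes shrink as squared gradients accumulate."""
    accum = accum + g ** 2
    return w - lr * g / (np.sqrt(accum) + epsilon), accum
```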

tensorflow/python/keras/optimizer_v2/adam.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adam for TensorFlow."""
+"""Adam optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -35,50 +36,58 @@ class Adam(optimizer_v2.OptimizerV2):
   Adam optimization is a stochastic gradient descent method that is based on
   adaptive estimation of first-order and second-order moments.

-  According to the paper
-  [Adam: A Method for Stochastic Optimization. Kingma et al.,
-  2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
+  According to
+  [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
+  the method is "*computationally
   efficient, has little memory requirement, invariant to diagonal rescaling of
   gradients, and is well suited for problems that are large in terms of
   data/parameters*".

-  For AMSGrad see [On The Convergence Of Adam And Beyond.
-  Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ).
-
-  **If amsgrad = False**:
-
-  initialize $m_0$ as 1st moment vector
-  initialize $v_0$ as 2nd moment vector
-
-  The update rule for $\theta$ with gradient $g$ uses an optimization
-  described at the end of section 2 of the paper:
-
-  $$lr_t = \mathrm{learning\_rate} * \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
-  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-
-  **If amsgrad = True**:
-
-  initialize $m_0$ as 1st moment vector
-  initialize $v_0$ as 2nd moment vector
-  initialize $\hat{v}_0$ as 2nd moment vector
-
-  The update rule for $\theta$ with gradient $g$ uses an optimization
-  described at the end of section 2 of the paper:
-
-  $$lr_t = \mathrm{learning\_rate} * \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
-  $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
-  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+      that takes no arguments and returns the actual value to use. The
+      learning rate. Defaults to 0.001.
+    beta_1: A float value or a constant float tensor, or a callable
+      that takes no arguments and returns the actual value to use. The
+      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
+    beta_2: A float value or a constant float tensor, or a callable
+      that takes no arguments and returns the actual value to use. The
+      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
+    epsilon: A small constant for numerical stability. This epsilon is
+      "epsilon hat" in the Kingma and Ba paper (in the formula just before
+      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
+      1e-7.
+    amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
+      from the paper "On the Convergence of Adam and Beyond". Defaults to
+      `False`.
+    name: Optional name for the operations created when applying gradients.
+      Defaults to `"Adam"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
+
+  Usage:
+
+  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
+  >>> var1 = tf.Variable(10.0)
+  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
+  >>> step_count = opt.minimize(loss, [var1]).numpy()
+  >>> # The first step is `-learning_rate*sign(grad)`
+  >>> var1.numpy()
+  9.9
+
+  Reference:
+    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+    - [Reddi et al., 2018](
+      https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
+
+  Notes:

   The default value of 1e-7 for epsilon might not be a good default in
   general. For example, when training an Inception network on ImageNet a
-  current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+  current good choice is 1.0 or 0.1. Note that since Adam uses the
   formulation just before Section 2.1 of the Kingma and Ba paper rather than
   the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
   hat" in the paper.
@@ -91,16 +100,6 @@ class Adam(optimizer_v2.OptimizerV2):
   accumulator. This means that the sparse behavior is equivalent to the dense
   behavior (in contrast to some momentum implementations which ignore momentum
   unless a variable slice was actually used).
-
-  Usage:
-
-  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
-  >>> var1 = tf.Variable(10.0)
-  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
-  >>> step_count = opt.minimize(loss, [var1]).numpy()
-  >>> # The first step is `-learning_rate*sign(grad)`
-  >>> var1.numpy()
-  9.9
   """

   _HAS_AGGREGATE_GRAD = True
@@ -113,34 +112,6 @@ class Adam(optimizer_v2.OptimizerV2):
                amsgrad=False,
                name='Adam',
                **kwargs):
-    """Construct a new Adam optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-        that takes no arguments and returns the actual value to use, The
-        learning rate. Defaults to 0.001.
-      beta_1: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 1st moment estimates. Defaults to 0.9.
-      beta_2: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use, The
-        exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
-      epsilon: A small constant for numerical stability. This epsilon is
-        "epsilon hat" in the Kingma and Ba paper (in the formula just before
-        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-        1e-7.
-      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
-        the paper "On the Convergence of Adam and beyond". Defaults to `False`.
-      name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(Adam, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
@@ -329,7 +300,7 @@ class NonFusedAdam(optimizer_v2.OptimizerV2):
   The default value of 1e-7 for epsilon might not be a good default in
   general. For example, when training an Inception network on ImageNet a
-  current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+  current good choice is 1.0 or 0.1. Note that since Adam uses the
   formulation just before Section 2.1 of the Kingma and Ba paper rather than
   the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
   hat" in the paper.

tensorflow/python/keras/optimizer_v2/adamax.py

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adamax for TensorFlow."""
+"""Adamax optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -39,27 +39,27 @@ class Adamax(optimizer_v2.OptimizerV2):
   Initialization:

-  ```
-  m_0 <- 0 (Initialize initial 1st moment vector)
-  v_0 <- 0 (Initialize the exponentially weighted infinity norm)
-  t <- 0 (Initialize timestep)
+  ```python
+  m = 0  # Initialize initial 1st moment vector
+  v = 0  # Initialize the exponentially weighted infinity norm
+  t = 0  # Initialize timestep
   ```

-  The update rule for `variable` with gradient `g` uses an optimization
+  The update rule for parameter `w` with gradient `g` is
   described at the end of section 7.1 of the paper:

-  ```
-  t <- t + 1
-
-  m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-  v_t <- max(beta2 * v_{t-1}, abs(g))
-  variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+  ```python
+  t += 1
+  m = beta1 * m + (1 - beta1) * g
+  v = max(beta2 * v, abs(g))
+  current_lr = learning_rate / (1 - beta1 ** t)
+  w = w - current_lr * m / (v + epsilon)
   ```

-  Similar to AdamOptimizer, the epsilon is added for numerical stability
-  (especially to get rid of division by zero when v_t = 0).
+  Similarly to `Adam`, the epsilon is added for numerical stability
+  (especially to get rid of division by zero when `v_t == 0`).

-  Contrast to AdamOptimizer, the sparse implementation of this algorithm
+  In contrast to `Adam`, the sparse implementation of this algorithm
   (used when the gradient is an IndexedSlices object, typically because of
   `tf.gather` or an embedding lookup in the forward pass) only updates
   variable slices and corresponding `m_t`, `v_t` terms when that part of
@@ -68,9 +68,23 @@ class Adamax(optimizer_v2.OptimizerV2):
   implementations which ignore momentum unless a variable slice was actually
   used).

-  References
-    see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
-    ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+    beta_1: A float value or a constant float tensor. The exponential decay
+      rate for the 1st moment estimates.
+    beta_2: A float value or a constant float tensor. The exponential decay
+      rate for the exponentially weighted infinity norm.
+    epsilon: A small constant for numerical stability.
+    name: Optional name for the operations created when applying gradients.
+      Defaults to `"Adamax"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
+
+  Reference:
+    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
   """

   _HAS_AGGREGATE_GRAD = True
@@ -82,24 +96,6 @@ class Adamax(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Adamax',
                **kwargs):
-    """Construct a new Adamax optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-      beta_1: A float value or a constant float tensor. The exponential decay
-        rate for the 1st moment estimates.
-      beta_2: A float value or a constant float tensor. The exponential decay
-        rate for the exponentially weighted infinity norm.
-      epsilon: A small constant for numerical stability.
-      name: Optional name for the operations created when applying gradients.
-        Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(Adamax, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
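The pseudocode in the new docstring translates almost line for line into NumPy; a sketch under the same notation (hypothetical helper, not the fused TF op):

```python
import numpy as np

def adamax_step(w, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                epsilon=1e-7):
    """One Adamax step, following section 7.1 of Kingma et al., 2014."""
    t += 1
    m = beta_1 * m + (1.0 - beta_1) * g       # 1st moment estimate
    v = np.maximum(beta_2 * v, np.abs(g))     # exponentially weighted inf-norm
    current_lr = lr / (1.0 - beta_1 ** t)     # bias correction for m
    return w - current_lr * m / (v + epsilon), m, v, t
```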

tensorflow/python/keras/optimizer_v2/ftrl.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Ftrl-proximal for TensorFlow."""
+"""Ftrl-proximal optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -35,26 +36,32 @@ class Ftrl(optimizer_v2.OptimizerV2):
   above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
   loss function).

-  Initialization:
-  $$t = 0$$
-  $$n_{0} = 0$$
-  $$\sigma_{0} = 0$$
-  $$z_{0} = 0$$
-
-  Update ($$i$$ is variable index):
-  $$t = t + 1$$
-  $$n_{t,i} = n_{t-1,i} + g_{t,i}^{2}$$
-  $$\sigma_{t,i} = (\sqrt{n_{t,i}} - \sqrt{n_{t-1,i}}) / \alpha$$
-  $$z_{t,i} = z_{t-1,i} + g_{t,i} - \sigma_{t,i} * w_{t,i}$$
-  $$w_{t,i} = -((\beta + \sqrt{n_{t,i}}) / \alpha + \lambda_{2})^{-1} *
-              (z_{t,i} - \mathrm{sgn}(z_{t,i}) * \lambda_{1})
-              \text{ if } |z_{t,i}| > \lambda_{1} \text{ else } 0$$
-
-  Check the documentation for the l2_shrinkage_regularization_strength
-  parameter for more details when shrinkage is enabled, where gradient is
-  replaced with gradient_with_shrinkage.
-
-  References: See
-  [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+    learning_rate_power: A float value; must be less than or equal to zero.
+      Controls how the learning rate decreases during training. Use zero for
+      a fixed learning rate.
+    initial_accumulator_value: The starting value for accumulators.
+      Only zero or positive values are allowed.
+    l1_regularization_strength: A float value; must be greater than or
+      equal to zero.
+    l2_regularization_strength: A float value; must be greater than or
+      equal to zero.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"Ftrl"`.
+    l2_shrinkage_regularization_strength: A float value; must be greater than
+      or equal to zero. This differs from L2 above in that the L2 above is a
+      stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
+      When input is sparse, shrinkage will only happen on the active weights.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.

+  Reference:
+    - [paper](
+      https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
   """
   def __init__(self,
@@ -66,44 +73,6 @@ class Ftrl(optimizer_v2.OptimizerV2):
                name='Ftrl',
                l2_shrinkage_regularization_strength=0.0,
                **kwargs):
-    r"""Construct a new FTRL optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-      learning_rate_power: A float value, must be less or equal to zero.
-        Controls how the learning rate decreases during training. Use zero for
-        a fixed learning rate.
-      initial_accumulator_value: The starting value for accumulators.
-        Only zero or positive values are allowed.
-      l1_regularization_strength: A float value, must be greater than or
-        equal to zero.
-      l2_regularization_strength: A float value, must be greater than or
-        equal to zero.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "Ftrl".
-      l2_shrinkage_regularization_strength: A float value, must be greater than
-        or equal to zero. This differs from L2 above in that the L2 above is a
-        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
-        The FTRL formulation can be written as:
-        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
-        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
-        function w.r.t. the weights w.
-        Specifically, in the absence of L1 regularization, it is equivalent to
-        the following update rule:
-        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
-                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
-        where lr_t is the learning rate at t.
-        When input is sparse shrinkage will only happen on the active weights.
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
     super(Ftrl, self).__init__(name, **kwargs)

     if initial_accumulator_value < 0.0:
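The FTRL-proximal update that the removed equations spelled out has a per-coordinate closed form. A NumPy sketch following the McMahan et al. paper cited above (names hypothetical; the TF kernel also handles the shrinkage and `learning_rate_power` options omitted here):

```python
import numpy as np

def ftrl_step(w, g, n, z, alpha=0.001, beta=1.0, l1=0.0, l2=0.0):
    """One FTRL-proximal step with fixed learning_rate_power = -0.5."""
    n_new = n + g ** 2
    sigma = (np.sqrt(n_new) - np.sqrt(n)) / alpha  # per-coordinate rate change
    z = z + g - sigma * w
    n = n_new
    # Closed-form solution of the per-coordinate proximal problem:
    w = np.where(np.abs(z) > l1,
                 -(z - np.sign(z) * l1) / ((beta + np.sqrt(n)) / alpha + l2),
                 0.0)
    return w, n, z
```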

tensorflow/python/keras/optimizer_v2/gradient_descent.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Momentum for TensorFlow."""
+"""SGD optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,17 +28,45 @@ from tensorflow.python.util.tf_export import keras_export

 @keras_export("keras.optimizers.SGD")
 class SGD(optimizer_v2.OptimizerV2):
-  r"""Stochastic gradient descent and momentum optimizer.
+  r"""Gradient descent (with momentum) optimizer.

-  The update rule for $\theta$ with gradient $g$ when `momentum` is 0.0:
-
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * g_t$$
-
-  The update rule when `momentum` is larger than 0.0:
-
-  $$v_t = \mathrm{momentum} * v_{t-1} - \mathrm{learning\_rate} * g_t$$
-  $$\theta_t = \theta_{t-1} + v_t$$
-
-  if `nesterov` is False, gradient is evaluated at $\theta_t$.
-  if `nesterov` is True, gradient is evaluated at $\theta_t + momentum * v_t$,
-  and the variables always store $\theta + m v$ instead of $\theta$
+  Update rule for parameter `w` with gradient `g` when `momentum` is 0:
+
+  ```python
+  w = w - learning_rate * g
+  ```
+
+  Update rule when `momentum` is larger than 0:
+
+  ```python
+  velocity = momentum * velocity - learning_rate * g
+  w = w + velocity
+  ```
+
+  When `nesterov=True`, this rule becomes:
+
+  ```python
+  velocity = momentum * velocity - learning_rate * g
+  w = w + momentum * velocity - learning_rate * g
+  ```
+
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+      that takes no arguments and returns the actual value to use. The
+      learning rate. Defaults to 0.01.
+    momentum: A float hyperparameter >= 0 that accelerates gradient descent
+      in the relevant direction and dampens oscillations. Defaults to 0,
+      i.e., vanilla gradient descent.
+    nesterov: Boolean. Whether to apply Nesterov momentum.
+      Defaults to `False`.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"SGD"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.

   Usage:
@@ -45,7 +74,7 @@ class SGD(optimizer_v2.OptimizerV2):
   >>> var = tf.Variable(1.0)
   >>> loss = lambda: (var ** 2)/2.0  # d(loss)/d(var1) = var1
   >>> step_count = opt.minimize(loss, [var]).numpy()
-  >>> # Step is `-learning_rate*grad`
+  >>> # Step is `- learning_rate * grad`
   >>> var.numpy()
   0.9
@@ -53,7 +82,7 @@ class SGD(optimizer_v2.OptimizerV2):
   >>> var = tf.Variable(1.0)
   >>> val0 = var.value()
   >>> loss = lambda: (var ** 2)/2.0  # d(loss)/d(var1) = var1
-  >>> # First step is `-learning_rate*grad`
+  >>> # First step is `- learning_rate * grad`
   >>> step_count = opt.minimize(loss, [var]).numpy()
   >>> val1 = var.value()
   >>> (val0 - val1).numpy()
@@ -64,13 +93,8 @@ class SGD(optimizer_v2.OptimizerV2):
   >>> (val1 - val2).numpy()
   0.18

-  Some of the args below are hyperparameters, where a hyperparameter is
-  defined as a scalar Tensor, a regular Python value, or a callable (which
-  will be evaluated when `apply_gradients` is called) returning a scalar
-  Tensor or a Python value.
-
-  # References
-    nesterov = True, See [Sutskever et al., 2013](
+  Reference:
+    - For `nesterov=True`, see [Sutskever et al., 2013](
       http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
   """
@@ -82,25 +106,6 @@ class SGD(optimizer_v2.OptimizerV2):
                nesterov=False,
                name="SGD",
                **kwargs):
-    """Construct a new Stochastic Gradient Descent or Momentum optimizer.
-
-    Arguments:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-        that takes no arguments and returns the actual value to use. The
-        learning rate. Defaults to 0.01.
-      momentum: float hyperparameter >= 0 that accelerates SGD in the relevant
-        direction and dampens oscillations. Defaults to 0.0, i.e., SGD.
-      nesterov: boolean. Whether to apply Nesterov momentum.
-        Defaults to `False`.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to 'SGD'.
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(SGD, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)

tensorflow/python/keras/optimizer_v2/nadam.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Nadam for TensorFlow."""
+"""Nadam optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -36,29 +37,22 @@ class Nadam(optimizer_v2.OptimizerV2):
   Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
   Nesterov momentum.

-  Initialization:
-
-  $$m_0 := 0 \text{ (initialize 1st moment vector)}$$
-  $$v_0 := 0 \text{ (initialize 2nd moment vector)}$$
-  $$\mu_0 := 1$$
-  $$t := 0 \text{ (initialize timestep)}$$
-
-  Computes:
-
-  $$t := t + 1$$
-  $$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
-  $$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
-  $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-  $$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
-  $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
-  $$v' := v_t / (1 - \beta_2^t)$$
-  $$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
-  $$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$
-
-  gradient is evaluated at theta(t) + momentum * v(t), and the variables always
-  store theta + beta_1 * m / sqrt(v) instead of theta.
-
-  References
-    See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  Args:
+    learning_rate: A `Tensor` or a floating point value. The learning rate.
+    beta_1: A float value or a constant float tensor. The exponential decay
+      rate for the 1st moment estimates.
+    beta_2: A float value or a constant float tensor. The exponential decay
+      rate for the 2nd moment estimates.
+    epsilon: A small constant for numerical stability.
+    name: Optional name for the operations created when applying gradients.
+      Defaults to `"Nadam"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.

+  Reference:
+    - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
   """

   _HAS_AGGREGATE_GRAD = True
@@ -70,24 +64,6 @@ class Nadam(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Nadam',
                **kwargs):
-    """Construct a new Nadam optimizer.
-
-    Args:
-      learning_rate: A Tensor or a floating point value. The learning rate.
-      beta_1: A float value or a constant float tensor. The exponential decay
-        rate for the 1st moment estimates.
-      beta_2: A float value or a constant float tensor. The exponential decay
-        rate for the exponentially weighted infinity norm.
-      epsilon: A small constant for numerical stability.
-      name: Optional name for the operations created when applying gradients.
-        Defaults to "Nadam".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     # Backwards compatibility with keras NAdam optimizer.
     kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
     learning_rate = kwargs.get('lr', learning_rate)
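Unlike Adam and SGD, Nadam did not gain a doctest in this pass. A minimal usage sketch against the public API (no output values shown, since the exact numbers depend on Nadam's momentum schedule):

```python
import tensorflow as tf

opt = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9,
                                beta_2=0.999, epsilon=1e-7)
var = tf.Variable(10.0)
loss = lambda: (var ** 2) / 2.0   # d(loss)/d(var) == var
for _ in range(3):
    opt.minimize(loss, [var])     # each call applies one Nadam update
print(var.numpy())                # smaller than 10.0 after three steps
```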

tensorflow/python/keras/optimizer_v2/optimizer_v2.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Version 2 of class Optimizer."""
 # pylint: disable=g-bad-name
@@ -79,11 +78,10 @@ def _deduplicate_indexed_slices(values, indices):
 @six.add_metaclass(abc.ABCMeta)
 @keras_export("keras.optimizers.Optimizer")
 class OptimizerV2(trackable.Trackable):
-  """Updated base class for optimizers.
+  """Base class for Keras optimizers.

-  This class defines the API to add Ops to train a model. You never use this
-  class directly, but instead instantiate one of its subclasses such as
-  `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`.
+  You should not use this class directly, but instead instantiate one of its
+  subclasses such as `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`, etc.

   ### Usage
@@ -101,7 +99,7 @@ class OptimizerV2(trackable.Trackable):
   opt.minimize(loss, var_list=[var1, var2])
   ```

-  ### Custom training loop with Keras models
+  ### Usage in custom training loops

   In Keras models, sometimes variables are created when the model is first
   called, instead of construction time. Examples include 1) sequential models
@@ -109,6 +107,7 @@ class OptimizerV2(trackable.Trackable):
   callable in these cases.

   Example:
+
   ```python
   opt = tf.keras.optimizers.SGD(learning_rate=0.1)
   model = tf.keras.Sequential()
@@ -120,7 +119,7 @@ class OptimizerV2(trackable.Trackable):
   opt.minimize(loss_fn, var_list_fn)
   ```

-  ### Processing gradients before applying them.
+  ### Processing gradients before applying them

   Calling `minimize()` takes care of both computing the gradients and
   applying them to the variables. If you want to process the gradients
@@ -150,7 +149,7 @@ class OptimizerV2(trackable.Trackable):
   opt.apply_gradients(zip(processed_grads, var_list))
   ```

-  ### Use with `tf.distribute.Strategy`.
+  ### Use with `tf.distribute.Strategy`

   This optimizer class is `tf.distribute.Strategy` aware, which means it
   automatically sums gradients across all replicas. To average gradients,
@@ -172,7 +171,7 @@ class OptimizerV2(trackable.Trackable):
   step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
   resulting in gradients that can be many times too big.

-  ### Variable Constraint
+  ### Variable Constraints

   All Keras optimizers respect variable constraints. If constraint function is
   passed to any variable, the constraint will be applied to the variable after
@@ -195,7 +194,7 @@ class OptimizerV2(trackable.Trackable):
   This can be useful if you want to log debug a training algorithm, report stats
   about the slots, etc.

-  ### Hyper parameters
+  ### Hyperparameters

   These are arguments passed to the optimizer subclass constructor
   (the `__init__` method), and then passed to `self._set_hyper()`.
@@ -203,7 +202,7 @@ class OptimizerV2(trackable.Trackable):
   callables. If they are callable, the callable will be called during
   `apply_gradients()` to get the value for the hyper parameter.

-  Hyper parameters can be overwritten through user code:
+  Hyperparameters can be overwritten through user code:

   Example:
@@ -220,7 +219,8 @@ class OptimizerV2(trackable.Trackable):
   opt.minimize(loss, var_list=[var1, var2])
   ```

-  ### Callable learning rate.
+  ### Callable learning rate
+
   Optimizer accepts a callable learning rate in two ways. The first way is
   through built-in or customized
   `tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will be
@@ -250,14 +250,17 @@ class OptimizerV2(trackable.Trackable):
   >>> opt.minimize(loss, var_list=[var])
   <tf.Variable...

-  ### Write a customized optimizer.
+  ### Creating a custom optimizer
+
   If you intend to create your own optimization algorithm, simply inherit from
   this class and override the following methods:

-    - _resource_apply_dense (update variable given gradient tensor is dense)
-    - _resource_apply_sparse (update variable given gradient tensor is sparse)
-    - _create_slots (if your optimizer algorithm requires additional variables)
-    - get_config (serialization of the optimizer, include all hyper parameters)
+    - `_resource_apply_dense` (update variable given gradient tensor is dense)
+    - `_resource_apply_sparse` (update variable given gradient tensor is sparse)
+    - `_create_slots`
+      (if your optimizer algorithm requires additional variables)
+    - `get_config`
+      (serialization of the optimizer; include all hyperparameters)
   """

 # Subclasses should set this to True unless they override `apply_gradients`
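To make the four override points concrete, here is a toy subclass sketch against the TF 2.x `OptimizerV2` API documented above. Internal helpers such as `_resource_scatter_add` and `_serialize_hyperparameter` are private and could change, so treat this as illustrative rather than canonical:

```python
import tensorflow as tf

class SimpleSGD(tf.keras.optimizers.Optimizer):
    """Toy optimizer exercising the override points listed above."""

    def __init__(self, learning_rate=0.01, name="SimpleSGD", **kwargs):
        super(SimpleSGD, self).__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))

    def _create_slots(self, var_list):
        pass  # plain SGD keeps no extra per-variable state

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr = tf.cast(self._get_hyper("learning_rate"), var.dtype)
        return var.assign_sub(lr * grad)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr = tf.cast(self._get_hyper("learning_rate"), var.dtype)
        return self._resource_scatter_add(var, indices, -lr * grad)

    def get_config(self):
        config = super(SimpleSGD, self).get_config()
        config.update({
            "learning_rate": self._serialize_hyperparameter("learning_rate")})
        return config
```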

tensorflow/python/keras/optimizer_v2/rmsprop.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""RMSprop for TensorFlow."""
+"""RMSprop optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -34,36 +35,37 @@ from tensorflow.python.util.tf_export import keras_export
 class RMSprop(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the RMSprop algorithm.

-  A detailed description of rmsprop.
-
-  - maintain a moving (discounted) average of the square of gradients
-  - divide gradient by the root of this average
-
-  The default settings does not use momentum:
-
-  $$rms_t = \rho * rms_{t-1} + (1 - \rho) * g_t^2$$
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} *
-               g_t / \sqrt{rms_t + \epsilon}$$
-
-  Since $x / \sqrt{x^2} = \mathrm{sign}(x)$, this is a smoothed approximation of:
-
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * \mathrm{sign}(g_t)$$
-
-  With momentum the update is:
-
-  $$rms_t = \rho * rms_{t-1} + (1 - \rho) * g_t^2$$
-  $$mom_t = \mathrm{momentum} * mom_{t-1} + g_t / \sqrt{rms_t + \epsilon}$$
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * mom_t$$
+  The gist of RMSprop is to:
+
+  - Maintain a moving (discounted) average of the square of gradients
+  - Divide the gradient by the root of this average

   This implementation of RMSprop uses plain momentum, not Nesterov momentum.

   The centered version additionally maintains a moving average of the
-  gradients, and uses that average to estimate the variance:
-
-  $$mg_t = \rho * mg_{t-1} + (1 - \rho) * g_t$$
-  $$rms_t = \rho * rms_{t-1} + (1 - \rho) * g_t^2$$
-  $$mom_t = \mathrm{momentum} * mom_{t-1} +
-            \mathrm{learning\_rate} * g_t / \sqrt{rms_t - mg_t^2 + \epsilon}$$
-  $$\theta_t = \theta_{t-1} - mom_t$$
+  gradients, and uses that average to estimate the variance.
+
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+      that takes no arguments and returns the actual value to use. The
+      learning rate. Defaults to 0.001.
+    rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
+    momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+    epsilon: A small constant for numerical stability. This epsilon is
+      "epsilon hat" in the Kingma and Ba paper (in the formula just before
+      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
+      1e-7.
+    centered: Boolean. If `True`, gradients are normalized by the estimated
+      variance of the gradient; if `False`, by the uncentered second moment.
+      Setting this to `True` may help with training, but is slightly more
+      expensive in terms of computation and memory. Defaults to `False`.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"RMSprop"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.

   Note that in the dense implementation of this algorithm, variables and their
   corresponding accumulators (momentum, gradient moving average, square
@@ -81,14 +83,14 @@ class RMSprop(optimizer_v2.OptimizerV2):
   >>> opt = tf.keras.optimizers.RMSprop(learning_rate=0.1)
   >>> var1 = tf.Variable(10.0)
-  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) = var1
+  >>> loss = lambda: (var1 ** 2) / 2.0  # d(loss) / d(var1) = var1
   >>> step_count = opt.minimize(loss, [var1]).numpy()
   >>> var1.numpy()
   9.683772

-  References
-    See ([pdf]
-    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  Reference:
+    - [Hinton, 2012](
+      http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
   """

   _HAS_AGGREGATE_GRAD = True
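A NumPy sketch covering the plain, momentum, and centered variants that the removed equations described (hypothetical helper; in this sketch, as in the removed formulas, epsilon sits inside the square root):

```python
import numpy as np

def rmsprop_step(w, g, rms, mom, mg, lr=0.001, rho=0.9, momentum=0.0,
                 epsilon=1e-7, centered=False):
    """One RMSprop step; `mg` is only used when `centered=True`."""
    rms = rho * rms + (1.0 - rho) * g ** 2   # moving average of squared grads
    if centered:
        mg = rho * mg + (1.0 - rho) * g      # moving average of grads
        denom = rms - mg ** 2                # estimated variance
    else:
        denom = rms
    mom = momentum * mom + lr * g / np.sqrt(denom + epsilon)
    return w - mom, rms, mom, mg
```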