Clean up optimizer docstrings.

PiperOrigin-RevId: 304544799
Change-Id: Ic83206c54cbf8437a4d6ff693f139412b5bdcee8

parent 8a370a0077
commit f6302e4ec7
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adadelta for TensorFlow."""
"""Adadelta optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -34,23 +34,9 @@ class Adadelta(optimizer_v2.OptimizerV2):

Adadelta optimization is a stochastic gradient descent method that is based on
adaptive learning rate per dimension to address two drawbacks:
1) the continual decay of learning rates throughout training
2) the need for a manually selected global learning rate

Two accumulation steps are required:
1) the accumulation of gradients squared,
2) the accumulation of updates squared.

Initialization:

$$E[g^2]_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
$$E[\Delta x^2]_0 := 0 \text{(Initialize 2nd order variable update)}$$

$$t := t + 1$$
$$E[g^2]_t := \rho * E[g^2]_{t-1} + (1 - \rho) * g^2$$
$$\Delta x_t = -RMS[\Delta x]_{t-1} * g_t / RMS[g]_t$$
$$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$
$$x_t := x_{t-1} + \Delta x_{t}$$
- The continual decay of learning rates throughout training
- The need for a manually selected global learning rate

Adadelta is a more robust extension of Adagrad that adapts learning rates
based on a moving window of gradient updates, instead of accumulating all

@@ -59,16 +45,22 @@ class Adadelta(optimizer_v2.OptimizerV2):
don't have to set an initial learning rate. In this version, initial
learning rate can be set, as in most other Keras optimizers.

@compatibility(eager)
When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
each be a callable that takes no arguments and returns the actual value to
use. This can be useful for changing these values across different
invocations of optimizer functions.
@end_compatibility
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    To match the exact form in the original paper use 1.0.
  rho: A `Tensor` or a floating point value. The decay rate.
  epsilon: A `Tensor` or a floating point value. A constant epsilon used
    to better condition the grad update.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"Adadelta"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

References
  See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
  ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
Reference:
  - [Zeiler, 2012](http://arxiv.org/abs/1212.5701)
"""

_HAS_AGGREGATE_GRAD = True

@@ -79,23 +71,6 @@ class Adadelta(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Adadelta',
             **kwargs):
  """Construct a new Adadelta optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
      To match the exact form in the original paper use 1.0.
    rho: A `Tensor` or a floating point value. The decay rate.
    epsilon: A `Tensor` or a floating point value. A constant epsilon used
      to better condition the grad update.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to "Adadelta".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """
  super(Adadelta, self).__init__(name, **kwargs)
  self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
  self._set_hyper('decay', self._initial_decay)

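The equations removed above still describe what this optimizer computes. For reference, here is a minimal NumPy sketch of that update; the function and variable names are illustrative only, not part of the TensorFlow API:

```python
import numpy as np

def adadelta_step(x, g, acc_grad, acc_update, rho=0.95, epsilon=1e-7):
    """One Adadelta step; returns the updated variable and accumulators."""
    acc_grad = rho * acc_grad + (1 - rho) * g ** 2           # E[g^2]_t
    rms_g = np.sqrt(acc_grad + epsilon)                      # RMS[g]_t
    rms_dx = np.sqrt(acc_update + epsilon)                   # RMS[dx]_{t-1}
    delta = -rms_dx * g / rms_g                              # dx_t
    acc_update = rho * acc_update + (1 - rho) * delta ** 2   # E[dx^2]_t
    return x + delta, acc_grad, acc_update

x = np.array([1.0, -2.0])
acc_g = np.zeros_like(x)
acc_u = np.zeros_like(x)
for _ in range(5):
    grad = x  # gradient of the toy loss 0.5 * x^2
    x, acc_g, acc_u = adadelta_step(x, grad, acc_g, acc_u)
```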
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adagrad for TensorFlow."""
"""Adagrad optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -39,26 +39,22 @@ class Adagrad(optimizer_v2.OptimizerV2):
updated during training. The more updates a parameter receives,
the smaller the updates.

Initialization:
$$accum_{g_0} := \text{initial_accumulator_value}$$
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
  initial_accumulator_value: A floating point value.
    Starting value for the accumulators, must be non-negative.
  epsilon: A small floating point value to avoid zero denominator.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"Adagrad"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Update step:
$$t := t + 1$$
$$accum_{g_t} := accum_{g_{t-1}} + g^2$$
$$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$

@compatibility(eager)
When eager execution is enabled, `learning_rate` can be a callable that
takes no arguments and returns the actual value to use. This can be useful
for changing these values across different invocations of optimizer
functions.
@end_compatibility

References:

* [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
* [Introduction]
  (https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
Reference:
  - [Duchi et al., 2011](
    http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
"""

_HAS_AGGREGATE_GRAD = True

@@ -69,25 +65,6 @@ class Adagrad(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Adagrad',
             **kwargs):
  """Construct a new Adagrad optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    initial_accumulator_value: A floating point value.
      Starting value for the accumulators, must be non-negative.
    epsilon: A small floating point value to avoid zero denominator.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to "Adagrad".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.

    Raises:
      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
  """
  if initial_accumulator_value < 0.0:
    raise ValueError('initial_accumulator_value must be non-negative: %s' %
                     initial_accumulator_value)

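For reference, the accumulator-based update step in the docstring corresponds to this minimal NumPy sketch (illustrative names only, not the TensorFlow implementation):

```python
import numpy as np

def adagrad_step(theta, g, accum, lr=0.001, epsilon=1e-7):
    """One Adagrad step: accumulate g^2, then scale the update per dimension."""
    accum = accum + g ** 2                                # accum_g_t
    theta = theta - lr * g / (np.sqrt(accum) + epsilon)   # theta_t
    return theta, accum

theta = np.array([1.0, -2.0])
accum = np.full_like(theta, 0.1)  # initial_accumulator_value
for _ in range(5):
    grad = theta  # gradient of the toy loss 0.5 * theta^2
    theta, accum = adagrad_step(theta, grad, accum)
```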
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
"""Adam optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -35,86 +36,14 @@ class Adam(optimizer_v2.OptimizerV2):

Adam optimization is a stochastic gradient descent method that is based on
adaptive estimation of first-order and second-order moments.
According to the paper
[Adam: A Method for Stochastic Optimization. Kingma et al.,
2014](http://arxiv.org/abs/1412.6980), the method is "*computationally

According to
[Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
the method is "*computationally
efficient, has little memory requirement, invariant to diagonal rescaling of
gradients, and is well suited for problems that are large in terms of
data/parameters*".

For AMSGrad see [On The Convergence Of Adam And Beyond.
Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ).

**If amsgrad = False**:

initialize $m_0$ as 1st moment vector
initialize $v_0$ as 2nd moment vector

The update rule for $\theta$ with gradient $g$ uses an optimization
described at the end of section 2 of the paper:

$$lr_t = \mathrm{learning\_rate} *
  \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
$$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
$$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
$$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

**If amsgrad = True**:

initialize $m_0$ as 1st moment vector
initialize $v_0$ as 2nd moment vector
initialize $\hat{v}_0$ as 2nd moment vector

The update rule for $\theta$ with gradient $g$ uses an optimization
described at the end of section 2 of the paper:

$$lr_t = \mathrm{learning\_rate} *
  \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

$$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
$$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
$$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
$$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$

The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.

The sparse implementation of this algorithm (used when the gradient is an
IndexedSlices object, typically because of `tf.gather` or an embedding
lookup in the forward pass) does apply momentum to variable slices even if
they were not used in the forward pass (meaning they have a gradient equal
to zero). Momentum decay (beta1) is also applied to the entire momentum
accumulator. This means that the sparse behavior is equivalent to the dense
behavior (in contrast to some momentum implementations which ignore momentum
unless a variable slice was actually used).

Usage:

>>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
>>> var1 = tf.Variable(10.0)
>>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
>>> step_count = opt.minimize(loss, [var1]).numpy()
>>> # The first step is `-learning_rate*sign(grad)`
>>> var1.numpy()
9.9
"""

_HAS_AGGREGATE_GRAD = True

def __init__(self,
             learning_rate=0.001,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=1e-7,
             amsgrad=False,
             name='Adam',
             **kwargs):
  """Construct a new Adam optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable

@@ -133,14 +62,56 @@ class Adam(optimizer_v2.OptimizerV2):
    amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
      the paper "On the Convergence of Adam and beyond". Defaults to `False`.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Adam".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
      Defaults to `"Adam"`.
    **kwargs: Keyword arguments. Allowed to be one of
      `"clipnorm"` or `"clipvalue"`.
      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
      gradients by value.

Usage:

>>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
>>> var1 = tf.Variable(10.0)
>>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
>>> step_count = opt.minimize(loss, [var1]).numpy()
>>> # The first step is `-learning_rate*sign(grad)`
>>> var1.numpy()
9.9

Reference:
  - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
  - [Reddi et al., 2018](
    https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

Notes:

The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since Adam uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.

The sparse implementation of this algorithm (used when the gradient is an
IndexedSlices object, typically because of `tf.gather` or an embedding
lookup in the forward pass) does apply momentum to variable slices even if
they were not used in the forward pass (meaning they have a gradient equal
to zero). Momentum decay (beta1) is also applied to the entire momentum
accumulator. This means that the sparse behavior is equivalent to the dense
behavior (in contrast to some momentum implementations which ignore momentum
unless a variable slice was actually used).
"""

_HAS_AGGREGATE_GRAD = True

def __init__(self,
             learning_rate=0.001,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=1e-7,
             amsgrad=False,
             name='Adam',
             **kwargs):
  super(Adam, self).__init__(name, **kwargs)
  self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
  self._set_hyper('decay', self._initial_decay)

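The update that the removed equations describe, including the `amsgrad` variant, can be sketched in NumPy as follows. This is an illustration of the section 2 formulation, not the TensorFlow kernel implementation:

```python
import numpy as np

def adam_step(theta, g, m, v, vhat, t, lr=0.001, beta_1=0.9, beta_2=0.999,
              epsilon=1e-7, amsgrad=False):
    """One Adam step (t starts at 1)."""
    lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    m = beta_1 * m + (1 - beta_1) * g         # 1st moment estimate
    v = beta_2 * v + (1 - beta_2) * g ** 2    # 2nd moment estimate
    if amsgrad:
        vhat = np.maximum(vhat, v)            # running max of all v_t
        theta = theta - lr_t * m / (np.sqrt(vhat) + epsilon)
    else:
        theta = theta - lr_t * m / (np.sqrt(v) + epsilon)
    return theta, m, v, vhat
```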
@@ -329,7 +300,7 @@ class NonFusedAdam(optimizer_v2.OptimizerV2):

The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
current good choice is 1.0 or 0.1. Note that since Adam uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.

@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adamax for TensorFlow."""
"""Adamax optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -39,27 +39,27 @@ class Adamax(optimizer_v2.OptimizerV2):

Initialization:

```
m_0 <- 0 (Initialize initial 1st moment vector)
v_0 <- 0 (Initialize the exponentially weighted infinity norm)
t <- 0 (Initialize timestep)
```
```python
m = 0  # Initialize initial 1st moment vector
v = 0  # Initialize the exponentially weighted infinity norm
t = 0  # Initialize timestep
```

The update rule for `variable` with gradient `g` uses an optimization
The update rule for parameter `w` with gradient `g` is
described at the end of section 7.1 of the paper:

```
t <- t + 1

m_t <- beta1 * m_{t-1} + (1 - beta1) * g
v_t <- max(beta2 * v_{t-1}, abs(g))
variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
```
```python
t += 1
m = beta1 * m + (1 - beta1) * g
v = max(beta2 * v, abs(g))
current_lr = learning_rate / (1 - beta1 ** t)
w = w - current_lr * m / (v + epsilon)
```

Similar to AdamOptimizer, the epsilon is added for numerical stability
(especially to get rid of division by zero when v_t = 0).
Similarly to `Adam`, the epsilon is added for numerical stability
(especially to get rid of division by zero when `v_t == 0`).

Contrast to AdamOptimizer, the sparse implementation of this algorithm
In contrast to `Adam`, the sparse implementation of this algorithm
(used when the gradient is an IndexedSlices object, typically because of
`tf.gather` or an embedding lookup in the forward pass) only updates
variable slices and corresponding `m_t`, `v_t` terms when that part of

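A runnable NumPy version of the pseudocode above, for reference; the function name and the convention that `t` starts at 1 are illustrative, not TensorFlow API:

```python
import numpy as np

def adamax_step(w, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                epsilon=1e-7):
    """One Adamax step, mirroring the docstring pseudocode."""
    m = beta_1 * m + (1 - beta_1) * g        # 1st moment estimate
    v = np.maximum(beta_2 * v, np.abs(g))    # exponentially weighted inf-norm
    current_lr = lr / (1 - beta_1 ** t)
    w = w - current_lr * m / (v + epsilon)
    return w, m, v
```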
@@ -68,9 +68,23 @@ class Adamax(optimizer_v2.OptimizerV2):
implementations which ignore momentum unless a variable slice was actually
used).

References
  see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
  beta_1: A float value or a constant float tensor. The exponential decay
    rate for the 1st moment estimates.
  beta_2: A float value or a constant float tensor. The exponential decay
    rate for the exponentially weighted infinity norm.
  epsilon: A small constant for numerical stability.
  name: Optional name for the operations created when applying gradients.
    Defaults to `"Adamax"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Reference:
  - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
"""

_HAS_AGGREGATE_GRAD = True

@@ -82,24 +96,6 @@ class Adamax(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Adamax',
             **kwargs):
  """Construct a new Adamax optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    beta_1: A float value or a constant float tensor. The exponential decay
      rate for the 1st moment estimates.
    beta_2: A float value or a constant float tensor. The exponential decay
      rate for the exponentially weighted infinity norm.
    epsilon: A small constant for numerical stability.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Adamax".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """
  super(Adamax, self).__init__(name, **kwargs)
  self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
  self._set_hyper('decay', self._initial_decay)

@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Ftrl-proximal for TensorFlow."""
"""Ftrl-proximal optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -35,39 +36,6 @@ class Ftrl(optimizer_v2.OptimizerV2):
above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
loss function).

Initialization:
$$t = 0$$
$$n_{0} = 0$$
$$\sigma_{0} = 0$$
$$z_{0} = 0$$

Update ($$i$$ is variable index):
$$t = t + 1$$
$$n_{t,i} = n_{t-1,i} + g_{t,i}^{2}$$
$$\sigma_{t,i} = (\sqrt{n_{t,i}} - \sqrt{n_{t-1,i}}) / \alpha$$
$$z_{t,i} = z_{t-1,i} + g_{t,i} - \sigma_{t,i} * w_{t,i}$$
$$w_{t,i} = - ((\beta+\sqrt{n_{t,i}}) / \alpha + \lambda_{2})^{-1} * (z_{i} -
sgn(z_{i}) * \lambda_{1}) \text{ if } |z_{i}| > \lambda_{1} \text{ else } 0$$

Check the documentation for the l2_shrinkage_regularization_strength
parameter for more details when shrinkage is enabled, where gradient is
replaced with gradient_with_shrinkage.

References: See
[paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
"""

def __init__(self,
             learning_rate=0.001,
             learning_rate_power=-0.5,
             initial_accumulator_value=0.1,
             l1_regularization_strength=0.0,
             l2_regularization_strength=0.0,
             name='Ftrl',
             l2_shrinkage_regularization_strength=0.0,
             **kwargs):
  r"""Construct a new FTRL optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.

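For reference, the per-coordinate equations above translate to roughly the following NumPy sketch, applied to all coordinates at once. This is illustrative only; `alpha` is the learning rate and `beta`, `lambda_1`, `lambda_2` follow the notation of the formulas, with `lambda_2` entering the denominator exactly as written there:

```python
import numpy as np

def ftrl_step(w, g, n, z, alpha=0.001, beta=1.0, lambda_1=0.0, lambda_2=0.0):
    """One FTRL-proximal step over all coordinates at once."""
    n_new = n + g ** 2                                # n_t
    sigma = (np.sqrt(n_new) - np.sqrt(n)) / alpha     # sigma_t
    z = z + g - sigma * w                             # z_t
    w = np.where(
        np.abs(z) > lambda_1,
        -(z - np.sign(z) * lambda_1)
        / ((beta + np.sqrt(n_new)) / alpha + lambda_2),
        0.0)                                          # w_t
    return w, n_new, z
```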
@@ -81,29 +49,30 @@ class Ftrl(optimizer_v2.OptimizerV2):
  l2_regularization_strength: A float value, must be greater than or
    equal to zero.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to "Ftrl".
    gradients. Defaults to `"Ftrl"`.
  l2_shrinkage_regularization_strength: A float value, must be greater than
    or equal to zero. This differs from L2 above in that the L2 above is a
    stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
    The FTRL formulation can be written as:
    w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
    \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
    function w.r.t. the weights w.
    Specifically, in the absence of L1 regularization, it is equivalent to
    the following update rule:
    w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
              2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
    where lr_t is the learning rate at t.
    When input is sparse shrinkage will only happen on the active weights.\
  **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
    `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
    gradients by value, `decay` is included for backward compatibility to
    allow time inverse decay of learning rate. `lr` is included for backward
    compatibility, recommended to use `learning_rate` instead.
    When input is sparse shrinkage will only happen on the active weights.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Raises:
  ValueError: If one of the arguments is invalid.
Reference:
  - [paper](
    https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
"""

def __init__(self,
             learning_rate=0.001,
             learning_rate_power=-0.5,
             initial_accumulator_value=0.1,
             l1_regularization_strength=0.0,
             l2_regularization_strength=0.0,
             name='Ftrl',
             l2_shrinkage_regularization_strength=0.0,
             **kwargs):
  super(Ftrl, self).__init__(name, **kwargs)

  if initial_accumulator_value < 0.0:

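The L2-shrinkage update rule quoted in the docstring, transcribed directly into a small NumPy helper (illustrative only; this is the dense, L1-free special case):

```python
import numpy as np

def shrinkage_update(w, g, lr, l2, l2_shrinkage):
    """w_{t+1} = w - lr/(1+2*L2*lr) * g - 2*L2_shrinkage*lr/(1+2*L2*lr) * w."""
    denom = 1.0 + 2.0 * l2 * lr
    return w - lr / denom * g - 2.0 * l2_shrinkage * lr / denom * w
```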
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Momentum for TensorFlow."""
"""SGD optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -27,17 +28,45 @@ from tensorflow.python.util.tf_export import keras_export

@keras_export("keras.optimizers.SGD")
class SGD(optimizer_v2.OptimizerV2):
  r"""Stochastic gradient descent and momentum optimizer.
  r"""Gradient descent (with momentum) optimizer.

The update rule for $\theta$ with gradient $g$ when `momentum` is 0.0:
$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * g_t$$
Update rule for parameter `w` with gradient `g` when `momentum` is 0:

The update rule when `momentum` is larger than 0.0:
$$v_t = \mathrm{momentum} * v_{t-1} - \mathrm{learning\_rate} * g_t$$
$$\theta_t = \theta_{t-1} + v_t$$
if `nesterov` is False, gradient is evaluated at $\theta_t$.
if `nesterov` is True, gradient is evaluated at $\theta_t + momentum * v_t$,
and the variables always store $\theta + m v$ instead of $\theta$
```python
w = w - learning_rate * g
```

Update rule when `momentum` is larger than 0:

```python
velocity = momentum * velocity - learning_rate * g
w = w + velocity
```

When `nesterov=True`, this rule becomes:

```python
velocity = momentum * velocity - learning_rate * g
w = w + momentum * velocity - learning_rate * g
```

Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
    learning rate. Defaults to 0.01.
  momentum: float hyperparameter >= 0 that accelerates gradient descent
    in the relevant direction and dampens oscillations. Defaults to 0,
    i.e., vanilla gradient descent.
  nesterov: boolean. Whether to apply Nesterov momentum.
    Defaults to `False`.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"SGD"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Usage:

@@ -64,13 +93,8 @@ class SGD(optimizer_v2.OptimizerV2):
>>> (val1 - val2).numpy()
0.18

Some of the args below are hyperparameters, where a hyperparameter is
defined as a scalar Tensor, a regular Python value, or a callable (which
will be evaluated when `apply_gradients` is called) returning a scalar
Tensor or a Python value.

# References
  nesterov = True, See [Sutskever et al., 2013](
Reference:
  - For `nesterov=True`, See [Sutskever et al., 2013](
    http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
"""

@@ -82,25 +106,6 @@ class SGD(optimizer_v2.OptimizerV2):
             nesterov=False,
             name="SGD",
             **kwargs):
  """Construct a new Stochastic Gradient Descent or Momentum optimizer.

  Arguments:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
      that takes no arguments and returns the actual value to use. The
      learning rate. Defaults to 0.01.
    momentum: float hyperparameter >= 0 that accelerates SGD in the relevant
      direction and dampens oscillations. Defaults to 0.0, i.e., SGD.
    nesterov: boolean. Whether to apply Nesterov momentum.
      Defaults to `False`.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to 'SGD'.
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """
  super(SGD, self).__init__(name, **kwargs)
  self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
  self._set_hyper("decay", self._initial_decay)

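The three update rules quoted in the docstring, collected into one illustrative NumPy step function (not the TensorFlow implementation):

```python
import numpy as np

def sgd_step(w, g, velocity, lr=0.01, momentum=0.0, nesterov=False):
    """One SGD step covering plain, momentum, and Nesterov variants."""
    if momentum == 0.0:
        return w - lr * g, velocity
    velocity = momentum * velocity - lr * g
    if nesterov:
        return w + momentum * velocity - lr * g, velocity
    return w + velocity, velocity
```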
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Nadam for TensorFlow."""
"""Nadam optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -36,29 +37,22 @@ class Nadam(optimizer_v2.OptimizerV2):
Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
Nesterov momentum.

Initialization:
Args:
  learning_rate: A Tensor or a floating point value. The learning rate.
  beta_1: A float value or a constant float tensor. The exponential decay
    rate for the 1st moment estimates.
  beta_2: A float value or a constant float tensor. The exponential decay
    rate for the exponentially weighted infinity norm.
  epsilon: A small constant for numerical stability.
  name: Optional name for the operations created when applying gradients.
    Defaults to `"Nadam"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

$$m_0 := 0 \text{(Initialize 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize 2nd moment vector)}$$
$$mu_0 := 1$$
$$t := 0 \text{(Initialize timestep)}$$

Computes:
$$t := t + 1$$
$$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
$$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
$$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
$$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
$$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
$$v' := v_t / (1 - \beta_2^t)$$
$$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
$$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$

gradient is evaluated at theta(t) + momentum * v(t), and the variables always
store theta + beta_1 * m / sqrt(v) instead of theta.

References
  See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
Reference:
  - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
"""

_HAS_AGGREGATE_GRAD = True

@@ -70,24 +64,6 @@ class Nadam(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Nadam',
             **kwargs):
  """Construct a new Nadam optimizer.

  Args:
    learning_rate: A Tensor or a floating point value. The learning rate.
    beta_1: A float value or a constant float tensor. The exponential decay
      rate for the 1st moment estimates.
    beta_2: A float value or a constant float tensor. The exponential decay
      rate for the exponentially weighted infinity norm.
    epsilon: A small constant for numerical stability.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Nadam".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """

  # Backwards compatibility with keras NAdam optimizer.
  kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
  learning_rate = kwargs.get('lr', learning_rate)

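For reference, a NumPy sketch of the equations above. The `mu_prod` argument carries the running product of the momentum schedule, which the formulas write as a product over `i`; everything here is illustrative, not TensorFlow API:

```python
import numpy as np

def nadam_step(theta, g, m, v, mu_prod, t, lr=0.001, beta_1=0.9,
               beta_2=0.999, epsilon=1e-7):
    """One Nadam step (t starts at 1); mu_prod holds prod(mu_1..mu_{t-1})."""
    mu_t = beta_1 * (1 - 0.5 * 0.96 ** (0.004 * t))
    mu_t1 = beta_1 * (1 - 0.5 * 0.96 ** (0.004 * (t + 1)))
    mu_prod = mu_prod * mu_t                        # prod(mu_1..mu_t)
    g_prime = g / (1 - mu_prod)
    m = beta_1 * m + (1 - beta_1) * g
    m_prime = m / (1 - mu_prod * mu_t1)
    v = beta_2 * v + (1 - beta_2) * g * g
    v_prime = v / (1 - beta_2 ** t)
    m_bar = (1 - mu_t) * g_prime + mu_t1 * m_prime
    theta = theta - lr * m_bar / (np.sqrt(v_prime) + epsilon)
    return theta, m, v, mu_prod
```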
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Version 2 of class Optimizer."""
# pylint: disable=g-bad-name

@@ -79,11 +78,10 @@ def _deduplicate_indexed_slices(values, indices):
@six.add_metaclass(abc.ABCMeta)
@keras_export("keras.optimizers.Optimizer")
class OptimizerV2(trackable.Trackable):
  """Updated base class for optimizers.
  """Base class for Keras optimizers.

This class defines the API to add Ops to train a model. You never use this
class directly, but instead instantiate one of its subclasses such as
`tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`.
You should not use this class directly, but instead instantiate one of its
subclasses such as `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`, etc.

### Usage

@@ -101,7 +99,7 @@ class OptimizerV2(trackable.Trackable):
opt.minimize(loss, var_list=[var1, var2])
```

### Custom training loop with Keras models
### Usage in custom training loops

In Keras models, sometimes variables are created when the model is first
called, instead of construction time. Examples include 1) sequential models

@@ -109,6 +107,7 @@ class OptimizerV2(trackable.Trackable):
callable in these cases.

Example:

```python
opt = tf.keras.optimizers.SGD(learning_rate=0.1)
model = tf.keras.Sequential()

@@ -120,7 +119,7 @@ class OptimizerV2(trackable.Trackable):
opt.minimize(loss_fn, var_list_fn)
```

### Processing gradients before applying them.
### Processing gradients before applying them

Calling `minimize()` takes care of both computing the gradients and
applying them to the variables. If you want to process the gradients

@@ -150,7 +149,7 @@ class OptimizerV2(trackable.Trackable):
opt.apply_gradients(zip(processed_grads, var_list))
```

### Use with `tf.distribute.Strategy`.
### Use with `tf.distribute.Strategy`

This optimizer class is `tf.distribute.Strategy` aware, which means it
automatically sums gradients across all replicas. To average gradients,

@@ -172,7 +171,7 @@ class OptimizerV2(trackable.Trackable):
step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
resulting in gradients that can be many times too big.

### Variable Constraint
### Variable Constraints

All Keras optimizers respect variable constraints. If a constraint function is
passed to any variable, the constraint will be applied to the variable after

@@ -220,7 +219,8 @@ class OptimizerV2(trackable.Trackable):
opt.minimize(loss, var_list=[var1, var2])
```

### Callable learning rate.
### Callable learning rate

Optimizer accepts a callable learning rate in two ways. The first way is
through built-in or customized
`tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will be

@@ -250,14 +250,17 @@ class OptimizerV2(trackable.Trackable):
>>> opt.minimize(loss, var_list=[var])
<tf.Variable...

### Write a customized optimizer.
### Creating a custom optimizer

If you intend to create your own optimization algorithm, simply inherit from
this class and override the following methods:

- _resource_apply_dense (update variable given gradient tensor is dense)
- _resource_apply_sparse (update variable given gradient tensor is sparse)
- _create_slots (if your optimizer algorithm requires additional variables)
- get_config (serialization of the optimizer, include all hyper parameters)
- `_resource_apply_dense` (update variable given gradient tensor is dense)
- `_resource_apply_sparse` (update variable given gradient tensor is sparse)
- `_create_slots`
  (if your optimizer algorithm requires additional variables)
- `get_config`
  (serialization of the optimizer, include all hyper parameters)
"""

# Subclasses should set this to True unless they override `apply_gradients`

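A minimal sketch of the subclassing pattern described above, assuming the internal helpers (`_set_hyper`, `_get_hyper`, `_resource_scatter_add`, `_serialize_hyperparameter`) that this file's optimizers use; `MySGD` is a hypothetical example, not part of Keras:

```python
import tensorflow as tf

class MySGD(tf.keras.optimizers.Optimizer):
    """A toy constant-step optimizer built on the override points above."""

    def __init__(self, learning_rate=0.01, name="MySGD", **kwargs):
        super(MySGD, self).__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Dense gradient: subtract lr * grad from the variable.
        lr = tf.cast(self._get_hyper("learning_rate"), var.dtype)
        return var.assign_sub(lr * grad)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        # Sparse gradient (IndexedSlices): update only the touched slices.
        lr = tf.cast(self._get_hyper("learning_rate"), var.dtype)
        return self._resource_scatter_add(var, indices, -lr * grad)

    def get_config(self):
        config = super(MySGD, self).get_config()
        config.update(
            {"learning_rate": self._serialize_hyperparameter("learning_rate")})
        return config

var = tf.Variable(1.0)
opt = MySGD(learning_rate=0.1)
opt.minimize(lambda: (var ** 2) / 2.0, var_list=[var])
```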
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""RMSprop for TensorFlow."""
"""RMSprop optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -34,36 +35,37 @@ from tensorflow.python.util.tf_export import keras_export
class RMSprop(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the RMSprop algorithm.

A detailed description of rmsprop.
- maintain a moving (discounted) average of the square of gradients
- divide gradient by the root of this average
The gist of RMSprop is to:

The default settings do not use momentum:

$$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} *
  g_t / \sqrt{rms_t + \epsilon}$$

Since $x/\sqrt{x^2} = sign(x)$, this is a smoothed approximation of:

$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * sign(g_t)$$

With momentum the update is:

$$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
$$mom_t = \mathrm{momentum} * mom_{t-1} + g_t / \sqrt{rms_t + \epsilon}$$
$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * mom_t$$
- Maintain a moving (discounted) average of the square of gradients
- Divide the gradient by the root of this average

This implementation of RMSprop uses plain momentum, not Nesterov momentum.

The centered version additionally maintains a moving average of the
gradients, and uses that average to estimate the variance:
gradients, and uses that average to estimate the variance.

$$mg_t = \rho * mg_{t-1} + (1-\rho) * g_t$$
$$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
$$mom_t = \mathrm{momentum} * mom_{t-1} +
  \mathrm{learning\_rate} * g_t / \sqrt{rms_t - mg_t^2 + \epsilon}$$
$$\theta_t = \theta_{t-1} - mom_t$$
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
    learning rate. Defaults to 0.001.
  rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
  momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
  epsilon: A small constant for numerical stability. This epsilon is
    "epsilon hat" in the Kingma and Ba paper (in the formula just before
    Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
    1e-7.
  centered: Boolean. If `True`, gradients are normalized by the estimated
    variance of the gradient; if False, by the uncentered second moment.
    Setting this to `True` may help with training, but is slightly more
    expensive in terms of computation and memory. Defaults to `False`.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"RMSprop"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Note that in the dense implementation of this algorithm, variables and their
corresponding accumulators (momentum, gradient moving average, square

@@ -86,9 +88,9 @@ class RMSprop(optimizer_v2.OptimizerV2):
>>> var1.numpy()
9.683772

References
  See ([pdf]
  http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
Reference:
  - [Hinton, 2012](
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
"""

_HAS_AGGREGATE_GRAD = True
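For reference, the centered and uncentered updates described above in one illustrative NumPy step; with a constant learning rate, the two momentum formulations in the docstring coincide, and this sketch follows the centered one:

```python
import numpy as np

def rmsprop_step(theta, g, rms, mom, mg, lr=0.001, rho=0.9, momentum=0.0,
                 epsilon=1e-7, centered=False):
    """One RMSprop step; `centered` subtracts the squared gradient mean."""
    rms = rho * rms + (1 - rho) * g ** 2
    if centered:
        mg = rho * mg + (1 - rho) * g
        denom = rms - mg ** 2 + epsilon
    else:
        denom = rms + epsilon
    mom = momentum * mom + lr * g / np.sqrt(denom)
    theta = theta - mom
    return theta, rms, mom, mg
```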