diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index 815d17c29d6..dfed74d8ab8 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -52,10 +52,23 @@ class Adadelta(optimizer_v2.OptimizerV2):
   $$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$
   $$x_t := x_{t-1} + \Delta x_{t}$$
 
+  Adadelta is a more robust extension of Adagrad that adapts learning rates
+  based on a moving window of gradient updates, instead of accumulating all
+  past gradients. This way, Adadelta continues learning even when many updates
+  have been done. Compared to Adagrad, in the original version of Adadelta you
+  don't have to set an initial learning rate. In this version, initial
+  learning rate can be set, as in most other Keras optimizers.
+
+  @compatibility(eager)
+  When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
+  each be a callable that takes no arguments and returns the actual value to
+  use. This can be useful for changing these values across different
+  invocations of optimizer functions.
+  @end_compatibility
+
   References
     See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
       ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
-
   """
 
   _HAS_ALL_REDUCE_SUM_GRAD = True
@@ -68,13 +81,6 @@ class Adadelta(optimizer_v2.OptimizerV2):
                **kwargs):
     """Construct a new Adadelta optimizer.
 
-    Adadelta is a more robust extension of Adagrad that adapts learning rates
-    based on a moving window of gradient updates, instead of accumulating all
-    past gradients. This way, Adadelta continues learning even when many updates
-    have been done. Compared to Adagrad, in the original version of Adadelta you
-    don't have to set an initial learning rate. In this version, initial
-    learning rate can be set, as in most other Keras optimizers.
-
     Args:
       learning_rate: A `Tensor`, floating point value, or a schedule that is a
         `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
@@ -89,13 +95,6 @@ class Adadelta(optimizer_v2.OptimizerV2):
         gradients by value, `decay` is included for backward compatibility to
         allow time inverse decay of learning rate. `lr` is included for backward
         compatibility, recommended to use `learning_rate` instead.
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
-    each be a callable that takes no arguments and returns the actual value to
-    use. This can be useful for changing these values across different
-    invocations of optimizer functions.
-    @end_compatibility
     """
     super(Adadelta, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
index f251ef027b4..ca1cbd6d3fb 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -47,6 +47,13 @@ class Adagrad(optimizer_v2.OptimizerV2):
   $$accum_{g_t} := accum_{g_{t-1}} + g^2$$
   $$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$
 
+  @compatibility(eager)
+  When eager execution is enabled, `learning_rate` can be a callable that
+  takes no arguments and returns the actual value to use. This can be useful
+  for changing these values across different invocations of optimizer
+  functions.
+  @end_compatibility
+
   References:
 
   * [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
@@ -80,13 +87,6 @@ class Adagrad(optimizer_v2.OptimizerV2):
 
     Raises:
       ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate` can be a callable that
-    takes no arguments and returns the actual value to use. This can be useful
-    for changing these values across different invocations of optimizer
-    functions.
-    @end_compatibility
     """
     if initial_accumulator_value < 0.0:
       raise ValueError('initial_accumulator_value must be non-negative: %s' %
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index dcf11b6d0c3..6783d9324f6 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -30,7 +30,7 @@ from tensorflow.python.util.tf_export import keras_export
 
 @keras_export('keras.optimizers.Adam')
 class Adam(optimizer_v2.OptimizerV2):
-  """Optimizer that implements the Adam algorithm.
+  r"""Optimizer that implements the Adam algorithm.
 
   Adam optimization is a stochastic gradient descent method that is based on
   adaptive estimation of first-order and second-order moments.
@@ -43,6 +43,63 @@ class Adam(optimizer_v2.OptimizerV2):
 
   For AMSGrad see [On The Convergence Of Adam And Beyond.
   Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ).
+
+  **If amsgrad = False**:
+
+  initialize $m_0$ as 1st moment vector
+  initialize $v_0$ as 2nd moment vector
+
+  The update rule for $\theta$ with gradient $g$ uses an optimization
+  described at the end of section 2 of the paper:
+
+  $$lr_t = \mathrm{learning\_rate} *
+    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
+  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
+  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
+  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+  **If amsgrad = True**:
+
+  initialize $m_0$ as 1st moment vector
+  initialize $v_0$ as 2nd moment vector
+  initialize $\hat{v}_0$ as 2nd moment vector
+
+  The update rule for $\theta$ with gradient $g$ uses an optimization
+  described at the end of section 2 of the paper:
+
+  $$lr_t = \mathrm{learning\_rate} *
+    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
+
+  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
+  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
+  $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
+  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
+
+  The default value of 1e-7 for epsilon might not be a good default in
+  general. For example, when training an Inception network on ImageNet a
+  current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+  formulation just before Section 2.1 of the Kingma and Ba paper rather than
+  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+  hat" in the paper.
+
+  The sparse implementation of this algorithm (used when the gradient is an
+  IndexedSlices object, typically because of `tf.gather` or an embedding
+  lookup in the forward pass) does apply momentum to variable slices even if
+  they were not used in the forward pass (meaning they have a gradient equal
+  to zero). Momentum decay (beta1) is also applied to the entire momentum
+  accumulator. This means that the sparse behavior is equivalent to the dense
+  behavior (in contrast to some momentum implementations which ignore momentum
+  unless a variable slice was actually used).
+
+  Usage:
+
+  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
+  >>> var1 = tf.Variable(10.0)
+  >>> loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
+  >>> step_count = opt.minimize(loss, [var1]).numpy()
+  >>> # The first step is `-learning_rate*sign(grad)`
+  >>> var1.numpy()
+  9.9
   """
 
   _HAS_ALL_REDUCE_SUM_GRAD = True
@@ -55,64 +112,7 @@ class Adam(optimizer_v2.OptimizerV2):
                amsgrad=False,
                name='Adam',
                **kwargs):
-    r"""Construct a new Adam optimizer.
-
-    If amsgrad = False:
-
-      initialize $m_0$ as 1st moment vector
-      initialize $v_0$ as 2nd moment vector
-
-      The update rule for $\theta$ with gradient $g$ uses an optimization
-      described at the end of section 2 of the paper:
-
-      $$lr_t = \mathrm{learning\_rate} *
-        \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-      $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-      $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
-      $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-
-    If amsgrad = True:
-
-      initialize $m_0$ as 1st moment vector
-      initialize $v_0$ as 2nd moment vector
-      initialize $\hat{v}_0$ as 2nd moment vector
-
-      The update rule for $\theta$ with gradient $g$ uses an optimization
-      described at the end of section 2 of the paper:
-
-      $$lr_t = \mathrm{learning\_rate} *
-        \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-
-      $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-      $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
-      $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
-      $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
-
-    The default value of 1e-7 for epsilon might not be a good default in
-    general. For example, when training an Inception network on ImageNet a
-    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
-    formulation just before Section 2.1 of the Kingma and Ba paper rather than
-    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
-    hat" in the paper.
-
-    The sparse implementation of this algorithm (used when the gradient is an
-    IndexedSlices object, typically because of `tf.gather` or an embedding
-    lookup in the forward pass) does apply momentum to variable slices even if
-    they were not used in the forward pass (meaning they have a gradient equal
-    to zero). Momentum decay (beta1) is also applied to the entire momentum
-    accumulator. This means that the sparse behavior is equivalent to the dense
-    behavior (in contrast to some momentum implementations which ignore momentum
-    unless a variable slice was actually used).
-
-    Usage:
-
-    >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
-    >>> var1 = tf.Variable(10.0)
-    >>> loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
-    >>> step_count = opt.minimize(loss, [var1]).numpy()
-    >>> # The first step is `-learning_rate*sign(grad)`
-    >>> var1.numpy()
-    9.9
+    """Construct a new Adam optimizer.
 
     Args:
       learning_rate: A `Tensor`, floating point value, or a schedule that is a
@@ -138,7 +138,6 @@ class Adam(optimizer_v2.OptimizerV2):
         gradients by value, `decay` is included for backward compatibility to
         allow time inverse decay of learning rate. `lr` is included for backward
         compatibility, recommended to use `learning_rate` instead.
-
     """
 
     super(Adam, self).__init__(name, **kwargs)
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
index da2d9bb48d1..9166f637c1e 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -37,6 +37,37 @@ class Adamax(optimizer_v2.OptimizerV2):
   Default parameters follow those provided in the paper.
   Adamax is sometimes superior to adam, specially in models with embeddings.
 
+  Initialization:
+
+  ```
+  m_0 <- 0 (Initialize initial 1st moment vector)
+  v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+  t <- 0 (Initialize timestep)
+  ```
+
+  The update rule for `variable` with gradient `g` uses an optimization
+  described at the end of section 7.1 of the paper:
+
+  ```
+  t <- t + 1
+
+  m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+  v_t <- max(beta2 * v_{t-1}, abs(g))
+  variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+  ```
+
+  Similar to AdamOptimizer, the epsilon is added for numerical stability
+  (especially to get rid of division by zero when v_t = 0).
+
+  Contrast to AdamOptimizer, the sparse implementation of this algorithm
+  (used when the gradient is an IndexedSlices object, typically because of
+  `tf.gather` or an embedding lookup in the forward pass) only updates
+  variable slices and corresponding `m_t`, `v_t` terms when that part of
+  the variable was used in the forward pass. This means that the sparse
+  behavior is contrast to the dense behavior (similar to some momentum
+  implementations which ignore momentum unless a variable slice was actually
+  used).
+
   References
     see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
       ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
@@ -53,37 +84,6 @@ class Adamax(optimizer_v2.OptimizerV2):
                **kwargs):
     """Construct a new Adamax optimizer.
 
-    Initialization:
-
-    ```
-    m_0 <- 0 (Initialize initial 1st moment vector)
-    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
-    t <- 0 (Initialize timestep)
-    ```
-
-    The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section 7.1 of the paper:
-
-    ```
-    t <- t + 1
-
-    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    v_t <- max(beta2 * v_{t-1}, abs(g))
-    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
-    ```
-
-    Similar to AdamOptimizer, the epsilon is added for numerical stability
-    (especially to get rid of division by zero when v_t = 0).
-
-    Contrast to AdamOptimizer, the sparse implementation of this algorithm
-    (used when the gradient is an IndexedSlices object, typically because of
-    `tf.gather` or an embedding lookup in the forward pass) only updates
-    variable slices and corresponding `m_t`, `v_t` terms when that part of
-    the variable was used in the forward pass. This means that the sparse
-    behavior is contrast to the dense behavior (similar to some momentum
-    implementations which ignore momentum unless a variable slice was actually
-    used).
-
     Args:
       learning_rate: A `Tensor`, floating point value, or a schedule that is a
         `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
index b893271f805..17484395044 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -52,6 +52,9 @@ class Ftrl(optimizer_v2.OptimizerV2):
   Check the documentation for the l2_shrinkage_regularization_strength
   parameter for more details when shrinkage is enabled, where gradient is
   replaced with gradient_with_shrinkage.
+
+  References: See
+  [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
   """
 
   def __init__(self,
@@ -100,10 +103,6 @@ class Ftrl(optimizer_v2.OptimizerV2):
 
     Raises:
       ValueError: If one of the arguments is invalid.
-
-    References
-      See [paper]
-        (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
     """
     super(Ftrl, self).__init__(name, **kwargs)
 
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
index a735d8287d6..9efda8faa5d 100644
--- a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
@@ -62,7 +62,55 @@ class LearningRateSchedule(object):
 
 @keras_export("keras.optimizers.schedules.ExponentialDecay")
 class ExponentialDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses an exponential decay schedule."""
+  """A LearningRateSchedule that uses an exponential decay schedule.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses. This schedule applies an exponential decay function
+  to an optimizer step, given a provided initial learning rate.
+
+  The schedule a 1-arg callable that produces a decayed learning
+  rate when passed the current optimizer step. This can be useful for changing
+  the learning rate value across different invocations of optimizer functions.
+  It is computed as:
+
+  ```python
+  def decayed_learning_rate(step):
+    return initial_learning_rate * decay_rate ^ (step / decay_steps)
+  ```
+
+  If the argument `staircase` is `True`, then `step / decay_steps` is
+  an integer division and the decayed learning rate follows a
+  staircase function.
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate.
+  Example: When fitting a Keras model, decay every 100000 steps with a base
+  of 0.96:
+
+  ```python
+  initial_learning_rate = 0.1
+  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
+      initial_learning_rate,
+      decay_steps=100000,
+      decay_rate=0.96,
+      staircase=True)
+
+  model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
+                loss='sparse_categorical_crossentropy',
+                metrics=['accuracy'])
+
+  model.fit(data, labels, epochs=5)
+  ```
+
+  The learning rate schedule is also serializable and deserializable using
+  `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -73,48 +121,6 @@ class ExponentialDecay(LearningRateSchedule):
       name=None):
     """Applies exponential decay to the learning rate.
 
-    When training a model, it is often recommended to lower the learning rate as
-    the training progresses. This schedule applies an exponential decay function
-    to an optimizer step, given a provided initial learning rate.
-
-    The schedule a 1-arg callable that produces a decayed learning
-    rate when passed the current optimizer step. This can be useful for changing
-    the learning rate value across different invocations of optimizer functions.
-    It is computed as:
-
-    ```python
-    def decayed_learning_rate(step):
-      return initial_learning_rate * decay_rate ^ (step / decay_steps)
-    ```
-
-    If the argument `staircase` is `True`, then `step / decay_steps` is
-    an integer division and the decayed learning rate follows a
-    staircase function.
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate.
-    Example: When fitting a Keras model, decay every 100000 steps with a base
-    of 0.96:
-
-    ```python
-    initial_learning_rate = 0.1
-    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
-        initial_learning_rate,
-        decay_steps=100000,
-        decay_rate=0.96,
-        staircase=True)
-
-    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
-                  loss='sparse_categorical_crossentropy',
-                  metrics=['accuracy'])
-
-    model.fit(data, labels, epochs=5)
-    ```
-
-    The learning rate schedule is also serializable and deserializable using
-    `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
         Python number.  The initial learning rate.
@@ -126,11 +132,6 @@ class ExponentialDecay(LearningRateSchedule):
         intervals
       name: String.  Optional name of the operation.  Defaults to
         'ExponentialDecay'.
-
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(ExponentialDecay, self).__init__()
     self.initial_learning_rate = initial_learning_rate
@@ -166,7 +167,41 @@ class ExponentialDecay(LearningRateSchedule):
 
 @keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
 class PiecewiseConstantDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses a piecewise constant decay schedule."""
+  """A LearningRateSchedule that uses a piecewise constant decay schedule.
+
+  The function returns a 1-arg callable to compute the piecewise constant
+  when passed the current optimizer step. This can be useful for changing the
+  learning rate value across different invocations of optimizer functions.
+
+  Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
+    for the next 10000 steps, and 0.1 for any additional steps.
+
+  ```python
+  step = tf.Variable(0, trainable=False)
+  boundaries = [100000, 110000]
+  values = [1.0, 0.5, 0.1]
+  learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
+      boundaries, values)
+
+  # Later, whenever we perform an optimization step, we pass in the step.
+  learning_rate = learning_rate_fn(step)
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate. The learning rate schedule is also serializable and
+  deserializable using `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as the boundary tensors.
+
+    The output of the 1-arg function that takes the `step`
+    is `values[0]` when `step <= boundaries[0]`,
+    `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
+    and values[-1] when `step > boundaries[-1]`.
+  """
 
   def __init__(
       self,
@@ -175,29 +210,6 @@ class PiecewiseConstantDecay(LearningRateSchedule):
       name=None):
     """Piecewise constant from boundaries and interval values.
 
-    The function returns a 1-arg callable to compute the piecewise constant
-    when passed the current optimizer step. This can be useful for changing the
-    learning rate value across different invocations of optimizer functions.
-
-    Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
-      for the next 10000 steps, and 0.1 for any additional steps.
-
-    ```python
-    step = tf.Variable(0, trainable=False)
-    boundaries = [100000, 110000]
-    values = [1.0, 0.5, 0.1]
-    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
-        boundaries, values)
-
-    # Later, whenever we perform an optimization step, we pass in the step.
-    learning_rate = learning_rate_fn(step)
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate. The learning rate schedule is also serializable and
-    deserializable using `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
         increasing entries, and with all elements having the same type as the
@@ -209,16 +221,6 @@ class PiecewiseConstantDecay(LearningRateSchedule):
       name: A string. Optional name of the operation. Defaults to
         'PiecewiseConstant'.
 
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as the boundary tensors.
-
-      The output of the 1-arg function that takes the `step`
-      is `values[0]` when `step <= boundaries[0]`,
-      `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
-      and values[-1] when `step > boundaries[-1]`.
-
     Raises:
       ValueError: if the number of elements in the lists do not match.
     """
@@ -265,7 +267,75 @@ class PiecewiseConstantDecay(LearningRateSchedule):
 
 @keras_export("keras.optimizers.schedules.PolynomialDecay")
 class PolynomialDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses a polynomial decay schedule."""
+  """A LearningRateSchedule that uses a polynomial decay schedule.
+
+  It is commonly observed that a monotonically decreasing learning rate, whose
+  degree of change is carefully chosen, results in a better performing model.
+  This schedule applies a polynomial decay function to an optimizer step,
+  given a provided `initial_learning_rate`, to reach an `end_learning_rate`
+  in the given `decay_steps`.
+
+  It requires a `step` value to compute the decayed learning rate. You
+  can just pass a TensorFlow variable that you increment at each training
+  step.
+
+  The schedule is a 1-arg callable that produces a decayed learning rate
+  when passed the current optimizer step. This can be useful for changing the
+  learning rate value across different invocations of optimizer functions.
+  It is computed as:
+
+  ```python
+  def decayed_learning_rate(step):
+    step = min(step, decay_steps)
+    return ((initial_learning_rate - end_learning_rate) *
+            (1 - step / decay_steps) ^ (power)
+           ) + end_learning_rate
+  ```
+
+  If `cycle` is True then a multiple of `decay_steps` is used, the first one
+  that is bigger than `step`.
+
+  ```python
+  def decayed_learning_rate(step):
+    decay_steps = decay_steps * ceil(step / decay_steps)
+    return ((initial_learning_rate - end_learning_rate) *
+            (1 - step / decay_steps) ^ (power)
+           ) + end_learning_rate
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate.
+  Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
+  sqrt (i.e. power=0.5):
+
+  ```python
+  ...
+  starter_learning_rate = 0.1
+  end_learning_rate = 0.01
+  decay_steps = 10000
+  learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+      starter_learning_rate,
+      decay_steps,
+      end_learning_rate,
+      power=0.5)
+
+  model.compile(optimizer=tf.keras.optimizers.SGD(
+                    learning_rate=learning_rate_fn),
+                loss='sparse_categorical_crossentropy',
+                metrics=['accuracy'])
+
+  model.fit(data, labels, epochs=5)
+  ```
+
+  The learning rate schedule is also serializable and deserializable using
+  `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -277,68 +347,6 @@ class PolynomialDecay(LearningRateSchedule):
       name=None):
     """Applies a polynomial decay to the learning rate.
 
-    It is commonly observed that a monotonically decreasing learning rate, whose
-    degree of change is carefully chosen, results in a better performing model.
-    This schedule applies a polynomial decay function to an optimizer step,
-    given a provided `initial_learning_rate`, to reach an `end_learning_rate`
-    in the given `decay_steps`.
-
-    It requires a `step` value to compute the decayed learning rate. You
-    can just pass a TensorFlow variable that you increment at each training
-    step.
-
-    The schedule is a 1-arg callable that produces a decayed learning rate
-    when passed the current optimizer step. This can be useful for changing the
-    learning rate value across different invocations of optimizer functions.
-    It is computed as:
-
-    ```python
-    def decayed_learning_rate(step):
-      step = min(step, decay_steps)
-      return ((initial_learning_rate - end_learning_rate) *
-              (1 - step / decay_steps) ^ (power)
-             ) + end_learning_rate
-    ```
-
-    If `cycle` is True then a multiple of `decay_steps` is used, the first one
-    that is bigger than `step`.
-
-    ```python
-    def decayed_learning_rate(step):
-      decay_steps = decay_steps * ceil(step / decay_steps)
-      return ((initial_learning_rate - end_learning_rate) *
-              (1 - step / decay_steps) ^ (power)
-             ) + end_learning_rate
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate.
-    Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
-    sqrt (i.e. power=0.5):
-
-    ```python
-    ...
-    starter_learning_rate = 0.1
-    end_learning_rate = 0.01
-    decay_steps = 10000
-    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
-        starter_learning_rate,
-        decay_steps,
-        end_learning_rate,
-        power=0.5)
-
-    model.compile(optimizer=tf.keras.optimizers.SGD(
-                      learning_rate=learning_rate_fn),
-                  loss='sparse_categorical_crossentropy',
-                  metrics=['accuracy'])
-
-    model.fit(data, labels, epochs=5)
-    ```
-
-    The learning rate schedule is also serializable and deserializable using
-    `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
         Python number.  The initial learning rate.
@@ -351,11 +359,6 @@ class PolynomialDecay(LearningRateSchedule):
       cycle: A boolean, whether or not it should cycle beyond decay_steps.
       name: String.  Optional name of the operation. Defaults to
         'PolynomialDecay'.
-
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(PolynomialDecay, self).__init__()
 
@@ -408,7 +411,56 @@ class PolynomialDecay(LearningRateSchedule):
 
 @keras_export("keras.optimizers.schedules.InverseTimeDecay")
 class InverseTimeDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses an inverse time decay schedule."""
+  """A LearningRateSchedule that uses an inverse time decay schedule.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses. This schedule applies the inverse decay function
+  to an optimizer step, given a provided initial learning rate.
+  It requires a `step` value to compute the decayed learning rate. You can
+  just pass a TensorFlow variable that you increment at each training step.
+
+  The schedule a 1-arg callable that produces a decayed learning
+  rate when passed the current optimizer step. This can be useful for changing
+  the learning rate value across different invocations of optimizer functions.
+  It is computed as:
+
+  ```python
+  def decayed_learning_rate(step):
+    return initial_learning_rate / (1 + decay_rate * step / decay_step)
+  ```
+
+  or, if `staircase` is `True`, as:
+
+  ```python
+  def decayed_learning_rate(step):
+    return initial_learning_rate / (1 + decay_rate * floor(step / decay_step))
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate.
+  Example: Fit a Keras model when decaying 1/t with a rate of 0.5:
+
+  ```python
+  ...
+  initial_learning_rate = 0.1
+  decay_steps = 1.0
+  decay_rate = 0.5
+  learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
+    initial_learning_rate, decay_steps, decay_rate)
+
+  model.compile(optimizer=tf.keras.optimizers.SGD(
+                    learning_rate=learning_rate_fn),
+                loss='sparse_categorical_crossentropy',
+                metrics=['accuracy'])
+
+  model.fit(data, labels, epochs=5)
+  ```
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -419,49 +471,6 @@ class InverseTimeDecay(LearningRateSchedule):
       name=None):
     """Applies inverse time decay to the initial learning rate.
 
-    When training a model, it is often recommended to lower the learning rate as
-    the training progresses. This schedule applies the inverse decay function
-    to an optimizer step, given a provided initial learning rate.
-    It requires a `step` value to compute the decayed learning rate. You can
-    just pass a TensorFlow variable that you increment at each training step.
-
-    The schedule a 1-arg callable that produces a decayed learning
-    rate when passed the current optimizer step. This can be useful for changing
-    the learning rate value across different invocations of optimizer functions.
-    It is computed as:
-
-    ```python
-    def decayed_learning_rate(step):
-      return initial_learning_rate / (1 + decay_rate * step / decay_step)
-    ```
-
-    or, if `staircase` is `True`, as:
-
-    ```python
-    def decayed_learning_rate(step):
-      return initial_learning_rate / (1 + decay_rate * floor(step / decay_step))
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate.
-    Example: Fit a Keras model when decaying 1/t with a rate of 0.5:
-
-    ```python
-    ...
-    initial_learning_rate = 0.1
-    decay_steps = 1.0
-    decay_rate = 0.5
-    learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
-      initial_learning_rate, decay_steps, decay_rate)
-
-    model.compile(optimizer=tf.keras.optimizers.SGD(
-                      learning_rate=learning_rate_fn),
-                  loss='sparse_categorical_crossentropy',
-                  metrics=['accuracy'])
-
-    model.fit(data, labels, epochs=5)
-    ```
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
         Python number.  The initial learning rate.
@@ -471,11 +480,6 @@ class InverseTimeDecay(LearningRateSchedule):
         continuous, fashion.
       name: String.  Optional name of the operation.  Defaults to
         'InverseTimeDecay'.
-
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(InverseTimeDecay, self).__init__()
 
@@ -513,7 +517,47 @@ class InverseTimeDecay(LearningRateSchedule):
 
 @keras_export("keras.experimental.CosineDecay")
 class CosineDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses a cosine decay schedule."""
+  """A LearningRateSchedule that uses a cosine decay schedule.
+
+  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses. This schedule applies a cosine decay function
+  to an optimizer step, given a provided initial learning rate.
+  It requires a `step` value to compute the decayed learning rate. You can
+  just pass a TensorFlow variable that you increment at each training step.
+
+  The schedule a 1-arg callable that produces a decayed learning
+  rate when passed the current optimizer step. This can be useful for changing
+  the learning rate value across different invocations of optimizer functions.
+  It is computed as:
+
+  ```python
+  def decayed_learning_rate(step):
+    step = min(step, decay_steps)
+    cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
+    decayed = (1 - alpha) * cosine_decay + alpha
+    return initial_learning_rate * decayed
+  ```
+
+  Example usage:
+  ```python
+  decay_steps = 1000
+  lr_decayed_fn = tf.keras.experimental.CosineDecay(
+      initial_learning_rate, decay_steps)
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate. The learning rate schedule is also serializable and
+  deserializable using `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -523,40 +567,6 @@ class CosineDecay(LearningRateSchedule):
       name=None):
     """Applies cosine decay to the learning rate.
 
-    See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-    with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-    When training a model, it is often recommended to lower the learning rate as
-    the training progresses. This schedule applies a cosine decay function
-    to an optimizer step, given a provided initial learning rate.
-    It requires a `step` value to compute the decayed learning rate. You can
-    just pass a TensorFlow variable that you increment at each training step.
-
-    The schedule a 1-arg callable that produces a decayed learning
-    rate when passed the current optimizer step. This can be useful for changing
-    the learning rate value across different invocations of optimizer functions.
-    It is computed as:
-
-    ```python
-    def decayed_learning_rate(step):
-      step = min(step, decay_steps)
-      cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
-      decayed = (1 - alpha) * cosine_decay + alpha
-      return initial_learning_rate * decayed
-    ```
-
-    Example usage:
-    ```python
-    decay_steps = 1000
-    lr_decayed_fn = tf.keras.experimental.CosineDecay(
-        initial_learning_rate, decay_steps)
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate. The learning rate schedule is also serializable and
-    deserializable using `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` Tensor or a
         Python number. The initial learning rate.
@@ -565,10 +575,6 @@ class CosineDecay(LearningRateSchedule):
       alpha: A scalar `float32` or `float64` Tensor or a Python number.
         Minimum learning rate value as a fraction of initial_learning_rate.
       name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(CosineDecay, self).__init__()
 
@@ -604,7 +610,45 @@ class CosineDecay(LearningRateSchedule):
 
 @keras_export("keras.experimental.CosineDecayRestarts")
 class CosineDecayRestarts(LearningRateSchedule):
-  """A LearningRateSchedule that uses a cosine decay schedule with restarts."""
+  """A LearningRateSchedule that uses a cosine decay schedule with restarts.
+
+  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses. This schedule applies a cosine decay function with
+  restarts to an optimizer step, given a provided initial learning rate.
+  It requires a `step` value to compute the decayed learning rate. You can
+  just pass a TensorFlow variable that you increment at each training step.
+
+  The schedule a 1-arg callable that produces a decayed learning
+  rate when passed the current optimizer step. This can be useful for changing
+  the learning rate value across different invocations of optimizer functions.
+
+  The learning rate multiplier first decays
+  from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
+  restart is performed. Each new warm restart runs for `t_mul` times more
+  steps and with `m_mul` times smaller initial learning rate.
+
+  Example usage:
+  ```python
+  first_decay_steps = 1000
+  lr_decayed_fn = (
+    tf.keras.experimental.CosineDecayRestarts(
+        initial_learning_rate,
+        first_decay_steps))
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate. The learning rate schedule is also serializable and
+  deserializable using `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -616,38 +660,6 @@ class CosineDecayRestarts(LearningRateSchedule):
       name=None):
     """Applies cosine decay with restarts to the learning rate.
 
-    See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-    with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-    When training a model, it is often recommended to lower the learning rate as
-    the training progresses. This schedule applies a cosine decay function with
-    restarts to an optimizer step, given a provided initial learning rate.
-    It requires a `step` value to compute the decayed learning rate. You can
-    just pass a TensorFlow variable that you increment at each training step.
-
-    The schedule a 1-arg callable that produces a decayed learning
-    rate when passed the current optimizer step. This can be useful for changing
-    the learning rate value across different invocations of optimizer functions.
-
-    The learning rate multiplier first decays
-    from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
-    restart is performed. Each new warm restart runs for `t_mul` times more
-    steps and with `m_mul` times smaller initial learning rate.
-
-    Example usage:
-    ```python
-    first_decay_steps = 1000
-    lr_decayed_fn = (
-      tf.keras.experimental.CosineDecayRestarts(
-          initial_learning_rate,
-          first_decay_steps))
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate. The learning rate schedule is also serializable and
-    deserializable using `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
         number. The initial learning rate.
@@ -660,10 +672,6 @@ class CosineDecayRestarts(LearningRateSchedule):
       alpha: A scalar `float32` or `float64` Tensor or a Python number.
         Minimum learning rate value as a fraction of the initial_learning_rate.
       name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(CosineDecayRestarts, self).__init__()
 
@@ -728,7 +736,57 @@ class CosineDecayRestarts(LearningRateSchedule):
 
 @keras_export("keras.experimental.LinearCosineDecay")
 class LinearCosineDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses a linear cosine decay schedule."""
+  """A LearningRateSchedule that uses a linear cosine decay schedule.
+
+  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+  https://arxiv.org/abs/1709.07417
+
+  For the idea of warm starts here controlled by `num_periods`,
+  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+  Note that linear cosine decay is more aggressive than cosine decay and
+  larger initial learning rates can typically be used.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses. This schedule applies a linear cosine decay
+  function to an optimizer step, given a provided initial learning rate.
+  It requires a `step` value to compute the decayed learning rate. You can
+  just pass a TensorFlow variable that you increment at each training step.
+
+  The schedule a 1-arg callable that produces a decayed learning
+  rate when passed the current optimizer step. This can be useful for changing
+  the learning rate value across different invocations of optimizer functions.
+  It is computed as:
+
+  ```python
+  def decayed_learning_rate(step):
+    step = min(step, decay_steps)
+    linear_decay = (decay_steps - step) / decay_steps
+    cosine_decay = 0.5 * (
+        1 + cos(pi * 2 * num_periods * step / decay_steps))
+    decayed = (alpha + linear_decay) * cosine_decay + beta
+    return initial_learning_rate * decayed
+  ```
+
+  Example usage:
+  ```python
+  decay_steps = 1000
+  lr_decayed_fn = (
+    tf.keras.experimental.LinearCosineDecay(
+      initial_learning_rate, decay_steps))
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate. The learning rate schedule is also serializable and
+  deserializable using `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -740,50 +798,6 @@ class LinearCosineDecay(LearningRateSchedule):
       name=None):
     """Applies linear cosine decay to the learning rate.
 
-    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-    https://arxiv.org/abs/1709.07417
-
-    For the idea of warm starts here controlled by `num_periods`,
-    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-    with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-    Note that linear cosine decay is more aggressive than cosine decay and
-    larger initial learning rates can typically be used.
-
-    When training a model, it is often recommended to lower the learning rate as
-    the training progresses. This schedule applies a linear cosine decay
-    function to an optimizer step, given a provided initial learning rate.
-    It requires a `step` value to compute the decayed learning rate. You can
-    just pass a TensorFlow variable that you increment at each training step.
-
-    The schedule a 1-arg callable that produces a decayed learning
-    rate when passed the current optimizer step. This can be useful for changing
-    the learning rate value across different invocations of optimizer functions.
-    It is computed as:
-
-    ```python
-    def decayed_learning_rate(step):
-      step = min(step, decay_steps)
-      linear_decay = (decay_steps - step) / decay_steps
-      cosine_decay = 0.5 * (
-          1 + cos(pi * 2 * num_periods * step / decay_steps))
-      decayed = (alpha + linear_decay) * cosine_decay + beta
-      return initial_learning_rate * decayed
-    ```
-
-    Example usage:
-    ```python
-    decay_steps = 1000
-    lr_decayed_fn = (
-      tf.keras.experimental.LinearCosineDecay(
-        initial_learning_rate, decay_steps))
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate. The learning rate schedule is also serializable and
-    deserializable using `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
         number. The initial learning rate.
@@ -795,10 +809,6 @@ class LinearCosineDecay(LearningRateSchedule):
       beta: See computation above.
       name: String.  Optional name of the operation.  Defaults to
         'LinearCosineDecay'.
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(LinearCosineDecay, self).__init__()
 
@@ -844,7 +854,59 @@ class LinearCosineDecay(LearningRateSchedule):
 
 @keras_export("keras.experimental.NoisyLinearCosineDecay")
 class NoisyLinearCosineDecay(LearningRateSchedule):
-  """A LearningRateSchedule that uses a noisy linear cosine decay schedule."""
+  """A LearningRateSchedule that uses a noisy linear cosine decay schedule.
+
+  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+  https://arxiv.org/abs/1709.07417
+
+  For the idea of warm starts here controlled by `num_periods`,
+  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+  Note that linear cosine decay is more aggressive than cosine decay and
+  larger initial learning rates can typically be used.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses. This schedule applies a noisy linear cosine decay
+  function to an optimizer step, given a provided initial learning rate.
+  It requires a `step` value to compute the decayed learning rate. You can
+  just pass a TensorFlow variable that you increment at each training step.
+
+  The schedule a 1-arg callable that produces a decayed learning
+  rate when passed the current optimizer step. This can be useful for changing
+  the learning rate value across different invocations of optimizer functions.
+  It is computed as:
+
+  ```python
+  def decayed_learning_rate(step):
+    step = min(step, decay_steps)
+    linear_decay = (decay_steps - step) / decay_steps)
+    cosine_decay = 0.5 * (
+        1 + cos(pi * 2 * num_periods * step / decay_steps))
+    decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
+    return initial_learning_rate * decayed
+  ```
+  where eps_t is 0-centered gaussian noise with variance
+  initial_variance / (1 + global_step) ** variance_decay
+
+  Example usage:
+  ```python
+  decay_steps = 1000
+  lr_decayed_fn = (
+    tf.keras.experimental.NoisyLinearCosineDecay(
+      initial_learning_rate, decay_steps))
+  ```
+
+  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+  as the learning rate. The learning rate schedule is also serializable and
+  deserializable using `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+
+  Returns:
+    A 1-arg callable learning rate schedule that takes the current optimizer
+    step and outputs the decayed learning rate, a scalar `Tensor` of the same
+    type as `initial_learning_rate`.
+  """
 
   def __init__(
       self,
@@ -858,52 +920,6 @@ class NoisyLinearCosineDecay(LearningRateSchedule):
       name=None):
     """Applies noisy linear cosine decay to the learning rate.
 
-    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-    https://arxiv.org/abs/1709.07417
-
-    For the idea of warm starts here controlled by `num_periods`,
-    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-    with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-    Note that linear cosine decay is more aggressive than cosine decay and
-    larger initial learning rates can typically be used.
-
-    When training a model, it is often recommended to lower the learning rate as
-    the training progresses. This schedule applies a noisy linear cosine decay
-    function to an optimizer step, given a provided initial learning rate.
-    It requires a `step` value to compute the decayed learning rate. You can
-    just pass a TensorFlow variable that you increment at each training step.
-
-    The schedule a 1-arg callable that produces a decayed learning
-    rate when passed the current optimizer step. This can be useful for changing
-    the learning rate value across different invocations of optimizer functions.
-    It is computed as:
-
-    ```python
-    def decayed_learning_rate(step):
-      step = min(step, decay_steps)
-      linear_decay = (decay_steps - step) / decay_steps)
-      cosine_decay = 0.5 * (
-          1 + cos(pi * 2 * num_periods * step / decay_steps))
-      decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
-      return initial_learning_rate * decayed
-    ```
-    where eps_t is 0-centered gaussian noise with variance
-    initial_variance / (1 + global_step) ** variance_decay
-
-    Example usage:
-    ```python
-    decay_steps = 1000
-    lr_decayed_fn = (
-      tf.keras.experimental.NoisyLinearCosineDecay(
-        initial_learning_rate, decay_steps))
-    ```
-
-    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
-    as the learning rate. The learning rate schedule is also serializable and
-    deserializable using `tf.keras.optimizers.schedules.serialize` and
-    `tf.keras.optimizers.schedules.deserialize`.
-
     Args:
       initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
         number. The initial learning rate.
@@ -917,10 +933,6 @@ class NoisyLinearCosineDecay(LearningRateSchedule):
       beta: See computation above.
       name: String.  Optional name of the operation.  Defaults to
         'NoisyLinearCosineDecay'.
-    Returns:
-      A 1-arg callable learning rate schedule that takes the current optimizer
-      step and outputs the decayed learning rate, a scalar `Tensor` of the same
-      type as `initial_learning_rate`.
     """
     super(NoisyLinearCosineDecay, self).__init__()
 
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
index 64942f911ec..bf08870bee5 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -65,6 +65,18 @@ class RMSprop(optimizer_v2.OptimizerV2):
       \mathrm{learning\_rate} * g_t / sqrt(rms_t - mg_t^2 + \epsilon)$$
   $$\theta_t = \theta_{t-1} - mom_t$$
 
+  Note that in the dense implementation of this algorithm, variables and their
+  corresponding accumulators (momentum, gradient moving average, square
+  gradient moving average) will be updated even if the gradient is zero
+  (i.e. accumulators will decay, momentum will be applied). The sparse
+  implementation (used when the gradient is an `IndexedSlices` object,
+  typically because of `tf.gather` or an embedding lookup in the forward pass)
+  will not update variable slices or their accumulators unless those slices
+  were used in the forward pass (nor is there an "eventual" correction to
+  account for these omitted updates). This leads to more efficient updates for
+  large embedding lookup tables (where most of the slices are not accessed in
+  a particular graph execution), but differs from the published algorithm.
+
   Usage:
 
   >>> opt = tf.keras.optimizers.RMSprop(learning_rate=0.1)
@@ -91,18 +103,6 @@ class RMSprop(optimizer_v2.OptimizerV2):
                **kwargs):
     """Construct a new RMSprop optimizer.
 
-    Note that in the dense implementation of this algorithm, variables and their
-    corresponding accumulators (momentum, gradient moving average, square
-    gradient moving average) will be updated even if the gradient is zero
-    (i.e. accumulators will decay, momentum will be applied). The sparse
-    implementation (used when the gradient is an `IndexedSlices` object,
-    typically because of `tf.gather` or an embedding lookup in the forward pass)
-    will not update variable slices or their accumulators unless those slices
-    were used in the forward pass (nor is there an "eventual" correction to
-    account for these omitted updates). This leads to more efficient updates for
-    large embedding lookup tables (where most of the slices are not accessed in
-    a particular graph execution), but differs from the published algorithm.
-
     Args:
       learning_rate: A `Tensor`, floating point value, or a schedule that is a
         `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable