From 8d8ea6b0bdf1806b33445e3d7d42cdb5e4ec1ea9 Mon Sep 17 00:00:00 2001
From: mrTsjolder
Date: Wed, 28 Nov 2018 23:43:36 +0100
Subject: [PATCH] first steps in cleaning doc citations

---
 .../python/training/learning_rate_decay.py    | 46 +++++++++++--------
 tensorflow/python/training/momentum.py        | 16 ++++---
 tensorflow/python/training/moving_averages.py | 17 +++++--
 .../python/training/proximal_adagrad.py       |  8 +++-
 .../training/proximal_gradient_descent.py     |  5 +-
 tensorflow/python/training/rmsprop.py         |  8 ++--
 6 files changed, 65 insertions(+), 35 deletions(-)

diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index ab9d923bedc..b092286176e 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -429,9 +429,6 @@ def inverse_time_decay(learning_rate,
 def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   """Applies cosine decay to the learning rate.
 
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   When training a model, it is often recommended to lower the learning rate as
   the training progresses.  This function applies a cosine decay function
   to a provided initial learning rate.  It requires a `global_step` value to
@@ -468,6 +465,11 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
@@ -494,9 +496,6 @@ def cosine_decay_restarts(learning_rate,
                           name=None):
   """Applies cosine decay with restarts to the learning rate.
 
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   When training a model, it is often recommended to lower the learning rate as
   the training progresses.  This function applies a cosine decay function with
   restarts to a provided initial learning rate.  It requires a `global_step`
@@ -536,6 +535,11 @@ def cosine_decay_restarts(learning_rate,
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
@@ -567,13 +571,6 @@ def linear_cosine_decay(learning_rate,
                         name=None):
   """Applies linear cosine decay to the learning rate.
 
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   Note that linear cosine decay is more aggressive than cosine decay and
   larger initial learning rates can typically be used.
 
@@ -618,6 +615,14 @@ def linear_cosine_decay(learning_rate,
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Neural Optimizer Search with Reinforcement Learning:
+      [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
+      ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
@@ -651,13 +656,6 @@ def noisy_linear_cosine_decay(learning_rate,
                               name=None):
   """Applies noisy linear cosine decay to the learning rate.
 
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   Note that linear cosine decay is more aggressive than cosine decay and
   larger initial learning rates can typically be used.
 
@@ -708,6 +706,14 @@ def noisy_linear_cosine_decay(learning_rate,
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Neural Optimizer Search with Reinforcement Learning:
+      [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
+      ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index f3bc83bbfa1..ca1104496f2 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -54,17 +54,21 @@ class MomentumOptimizer(optimizer.Optimizer):
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Momentum".
       use_nesterov: If `True` use Nesterov Momentum.
-        See [Sutskever et al., 2013](
-        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+        See (Sutskever et al., 2013).
         This implementation always computes gradients at the value of the
         variable(s) passed to the optimizer. Using Nesterov Momentum makes the
         variable(s) track the values called `theta_t + mu*v_t` in the paper.
-        This implementation is an approximation of the original formula, valid
-        for high values of momentum. It will compute the "adjusted gradient"
-        in NAG by assuming that the new gradient will be estimated by the
-        current average gradient plus the product of momentum and the change
+        This implementation is an approximation of the original formula, valid
+        for high values of momentum. It will compute the "adjusted gradient"
+        in NAG by assuming that the new gradient will be estimated by the
+        current average gradient plus the product of momentum and the change
         in the average gradient.
 
+    References:
+      On the importance of initialization and momentum in deep learning:
+        [Sutskever et al., 2013](http://proceedings.mlr.press/v28/sutskever13.html)
+        ([pdf](http://proceedings.mlr.press/v28/sutskever13.pdf))
+
     @compatibility(eager)
     When eager execution is enabled, `learning_rate` and `momentum` can each be
     a callable that takes no arguments and returns the actual value to use. This
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index cc58a952689..e1fa7ee2994 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -46,8 +46,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
   `zero_debias` optionally enables scaling by the mathematically correct
   debiasing factor of
     1 - decay ** num_updates
-  See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
-  (https://arxiv.org/abs/1412.6980).
+  See Section 3 of (Kingma et al., 2015) for more details.
 
   The names of the debias shadow variables, by default, include both the scope
   they were created in and the scope of the variables they debias. They are also
@@ -72,12 +71,17 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     value: A tensor with the same shape as 'variable'.
     decay: A float Tensor or float value.  The moving average decay.
     zero_debias: A python bool. If true, assume the variable is 0-initialized
-      and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
+      and unbias it, as in (Kingma et al., 2015). See docstring in
       `_zero_debias` for more details.
     name: Optional name of the returned operation.
 
   Returns:
     A tensor which if evaluated will compute and return the new moving average.
+
+  References:
+    A Method for Stochastic Optimization:
+      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
+      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))
   """
   def update_fn(v, value, decay=decay):
     decay = ops.convert_to_tensor(1.0 - decay, name="decay")
@@ -176,7 +180,7 @@ def _zero_debias(unbiased_var, value, decay):
   All exponential moving averages initialized with Tensors are initialized to 0,
   and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
   similarly biased. This function creates the debias updated amount according to
-  a scale factor, as in https://arxiv.org/abs/1412.6980.
+  a scale factor, as in (Kingma et al., 2015).
 
   To demonstrate the bias the results from 0-initialization, take an EMA that
   was initialized to `0` with decay `b`. After `t` timesteps of seeing the
@@ -201,6 +205,11 @@ def _zero_debias(unbiased_var, value, decay):
   Returns:
     The amount that the unbiased variable should be updated. Computing this
     tensor will also update the shadow variables appropriately.
+
+  References:
+    A Method for Stochastic Optimization:
+      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
+      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))
   """
   with variable_scope.variable_scope(
       unbiased_var.name[:-len(":0")], values=[unbiased_var,
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
index 2ea628a56b4..857ba227d0f 100644
--- a/tensorflow/python/training/proximal_adagrad.py
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -31,7 +31,13 @@ class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
 
-  See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
+  References:
+    Adaptive Subgradient Methods for Online Learning and Stochastic Optimization:
+      [Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html)
+      ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
+    Efficient Learning using Forward-Backward Splitting:
+      [Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting)
+      ([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf))
   """
 
   def __init__(self, learning_rate, initial_accumulator_value=0.1,
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 6eca0e6cb5f..194e28f73b3 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -32,7 +32,10 @@ class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
 
-  See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
+  References:
+    Efficient Learning using Forward-Backward Splitting:
+      [Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting)
+      ([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf))
   """
 
   def __init__(self, learning_rate, l1_regularization_strength=0.0,
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index fb53b5883f5..a0e79656c39 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -52,10 +52,12 @@ from tensorflow.python.util.tf_export import tf_export
 
 @tf_export(v1=["train.RMSPropOptimizer"])
 class RMSPropOptimizer(optimizer.Optimizer):
-  """Optimizer that implements the RMSProp algorithm.
+  """Optimizer that implements the RMSProp algorithm (Tieleman et al., 2012).
 
-  See the
-  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  References:
+    Coursera slide 29:
+      Tieleman & Hinton, 2012
+      ([pdf](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf))
   """
 
   def __init__(self,
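
The `cosine_decay` and `cosine_decay_restarts` docstrings touched above describe the Loshchilov et al. schedule only in prose. As a rough illustration (a plain-Python sketch of the no-restart schedule, not the TensorFlow implementation), assuming the standard form `decayed = (1 - alpha) * 0.5 * (1 + cos(pi * step / decay_steps)) + alpha`:

```python
import math

def cosine_decay_value(learning_rate, global_step, decay_steps, alpha=0.0):
    """Plain-Python sketch of the cosine decay schedule (no restarts)."""
    # Clamp the step so the schedule stays at its floor once decay_steps is reached.
    step = min(global_step, decay_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
    # alpha sets the final rate as a fraction of the initial learning rate.
    decayed = (1.0 - alpha) * cosine + alpha
    return learning_rate * decayed

# The rate starts at 0.1, follows half a cosine wave, and bottoms out at
# alpha * learning_rate once global_step reaches decay_steps.
for step in (0, 250, 500, 1000, 2000):
    print(step, round(cosine_decay_value(0.1, step, decay_steps=1000, alpha=0.01), 5))
```

The warm-restart variant resets the step counter at the start of each period and stretches the period length (by `t_mul` in the TensorFlow signature), which is what `cosine_decay_restarts` adds on top of this.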
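The `MomentumOptimizer` docstring above explains the high-momentum approximation of Nesterov momentum in words. The sketch below writes that idea out as an update rule under the usual formulation (a gradient accumulator, with the "adjusted gradient" taken as the current gradient plus momentum times the updated accumulator); it is illustrative only, not the C++ kernel TensorFlow dispatches to.

```python
def momentum_step(var, grad, accum, lr=0.01, momentum=0.9, use_nesterov=False):
    """One momentum update on a scalar (or NumPy array) parameter."""
    # Running accumulator of gradients, the optimizer's "momentum" slot.
    accum = momentum * accum + grad
    if use_nesterov:
        # NAG approximation from the docstring: the adjusted gradient is the
        # current gradient plus momentum times the updated accumulator.
        var = var - lr * (grad + momentum * accum)
    else:
        var = var - lr * accum
    return var, accum

# Example: two steps on f(x) = x^2 (gradient 2x), starting from x = 1.0.
x, m = 1.0, 0.0
for _ in range(2):
    x, m = momentum_step(x, grad=2.0 * x, accum=m, use_nesterov=True)
print(x)
```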
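For `assign_moving_average` and `_zero_debias`, the debiasing factor `1 - decay ** num_updates` from Section 3 of Kingma et al. is easiest to see numerically. A small sketch with assumed helper names (not the TensorFlow API):

```python
def debiased_moving_average(values, decay=0.9):
    """Yield (biased, debiased) EMA pairs; dividing by 1 - decay**t removes
    the bias introduced by the zero initialization."""
    ema = 0.0  # biased toward the zero initialization
    for t, value in enumerate(values, start=1):
        ema = decay * ema + (1.0 - decay) * value
        debiased = ema / (1.0 - decay ** t)
        yield ema, debiased

# With constant inputs the biased EMA crawls up from 0, while the debiased
# estimate recovers the true value of 5.0 immediately.
for biased, debiased in debiased_moving_average([5.0] * 5, decay=0.9):
    print(round(biased, 4), round(debiased, 4))
```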
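Finally, the RMSProp rule from the Tieleman & Hinton Coursera slides cited by `RMSPropOptimizer` keeps a decaying mean of squared gradients and divides the gradient by its root. A minimal sketch of that plain form (no momentum or centering, both of which the TensorFlow class also supports):

```python
import numpy as np

def rmsprop_step(var, grad, ms, lr=0.001, decay=0.9, epsilon=1e-10):
    """One plain RMSProp step: scale the gradient by the root of a running
    mean of squared gradients."""
    ms = decay * ms + (1.0 - decay) * np.square(grad)  # mean squared gradient
    var = var - lr * grad / np.sqrt(ms + epsilon)      # epsilon avoids division by zero
    return var, ms

# Example: a few steps on f(x) = x^2 (gradient 2x), starting from x = 1.0.
x, ms = 1.0, 0.0
for _ in range(3):
    x, ms = rmsprop_step(x, grad=2.0 * x, ms=ms, lr=0.01)
print(x)
```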