first steps in cleaning doc citations

This commit is contained in:
mrTsjolder 2018-11-28 23:43:36 +01:00
parent a7575051f1
commit 8d8ea6b0bd
6 changed files with 65 additions and 35 deletions

View File

@ -429,9 +429,6 @@ def inverse_time_decay(learning_rate,
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
"""Applies cosine decay to the learning rate.
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function
to a provided initial learning rate. It requires a `global_step` value to
@ -468,6 +465,11 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
Raises:
ValueError: if `global_step` is not supplied.
References:
SGDR: Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
@ -494,9 +496,6 @@ def cosine_decay_restarts(learning_rate,
name=None):
"""Applies cosine decay with restarts to the learning rate.
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function with
restarts to a provided initial learning rate. It requires a `global_step`
@ -536,6 +535,11 @@ def cosine_decay_restarts(learning_rate,
Raises:
ValueError: if `global_step` is not supplied.
References:
SGDR: Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
@ -567,13 +571,6 @@ def linear_cosine_decay(learning_rate,
name=None):
"""Applies linear cosine decay to the learning rate.
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
https://arxiv.org/abs/1709.07417
For the idea of warm starts here controlled by `num_periods`,
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.
@ -618,6 +615,14 @@ def linear_cosine_decay(learning_rate,
Raises:
ValueError: if `global_step` is not supplied.
References:
Neural Optimizer Search with Reinforcement Learning:
[Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
SGDR: Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
@ -651,13 +656,6 @@ def noisy_linear_cosine_decay(learning_rate,
name=None):
"""Applies noisy linear cosine decay to the learning rate.
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
https://arxiv.org/abs/1709.07417
For the idea of warm starts here controlled by `num_periods`,
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.
@ -708,6 +706,14 @@ def noisy_linear_cosine_decay(learning_rate,
Raises:
ValueError: if `global_step` is not supplied.
References:
Neural Optimizer Search with Reinforcement Learning:
[Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
SGDR: Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing

View File

@ -54,17 +54,21 @@ class MomentumOptimizer(optimizer.Optimizer):
name: Optional name prefix for the operations created when applying
gradients. Defaults to "Momentum".
use_nesterov: If `True` use Nesterov Momentum.
See [Sutskever et al., 2013](
http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
See (Sutskever et al., 2013).
This implementation always computes gradients at the value of the
variable(s) passed to the optimizer. Using Nesterov Momentum makes the
variable(s) track the values called `theta_t + mu*v_t` in the paper.
This implementation is an approximation of the original formula, valid
for high values of momentum. It will compute the "adjusted gradient"
in NAG by assuming that the new gradient will be estimated by the
current average gradient plus the product of momentum and the change
This implementation is an approximation of the original formula, valid
for high values of momentum. It will compute the "adjusted gradient"
in NAG by assuming that the new gradient will be estimated by the
current average gradient plus the product of momentum and the change
in the average gradient.
References:
On the importance of initialization and momentum in deep learning:
[Sutskever et al., 2013](http://proceedings.mlr.press/v28/sutskever13.html)
([pdf](http://proceedings.mlr.press/v28/sutskever13.pdf))
@compatibility(eager)
When eager execution is enabled, `learning_rate` and `momentum` can each be
a callable that takes no arguments and returns the actual value to use. This

View File

@ -46,8 +46,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
`zero_debias` optionally enables scaling by the mathematically correct
debiasing factor of
1 - decay ** num_updates
See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
(https://arxiv.org/abs/1412.6980).
See Section 3 of (Kingma et al., 2015) for more details.
The names of the debias shadow variables, by default, include both the scope
they were created in and the scope of the variables they debias. They are also
@ -72,12 +71,17 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
value: A tensor with the same shape as 'variable'.
decay: A float Tensor or float value. The moving average decay.
zero_debias: A python bool. If true, assume the variable is 0-initialized
and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
and unbias it, as in (Kingma et al., 2015). See docstring in
`_zero_debias` for more details.
name: Optional name of the returned operation.
Returns:
A tensor which if evaluated will compute and return the new moving average.
References:
Adam: A Method for Stochastic Optimization:
[Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
([pdf](https://arxiv.org/pdf/1412.6980.pdf))
"""
def update_fn(v, value, decay=decay):
decay = ops.convert_to_tensor(1.0 - decay, name="decay")
@ -176,7 +180,7 @@ def _zero_debias(unbiased_var, value, decay):
All exponential moving averages initialized with Tensors are initialized to 0,
and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
similarly biased. This function creates the debias updated amount according to
a scale factor, as in https://arxiv.org/abs/1412.6980.
a scale factor, as in (Kingma et al., 2015).
To demonstrate the bias that results from 0-initialization, take an EMA that
was initialized to `0` with decay `b`. After `t` timesteps of seeing the
@ -201,6 +205,11 @@ def _zero_debias(unbiased_var, value, decay):
Returns:
The amount that the unbiased variable should be updated. Computing this
tensor will also update the shadow variables appropriately.
References:
Adam: A Method for Stochastic Optimization:
[Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
([pdf](https://arxiv.org/pdf/1412.6980.pdf))
"""
with variable_scope.variable_scope(
unbiased_var.name[:-len(":0")], values=[unbiased_var,

View File

@ -31,7 +31,13 @@ class ProximalAdagradOptimizer(optimizer.Optimizer):
# pylint: disable=line-too-long
"""Optimizer that implements the Proximal Adagrad algorithm.
See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
References:
Adaptive Subgradient Methods for Online Learning and Stochastic Optimization:
[Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html)
([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
Efficient Learning using Forward-Backward Splitting:
[Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting)
([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf))
"""
def __init__(self, learning_rate, initial_accumulator_value=0.1,

View File

@ -32,7 +32,10 @@ class ProximalGradientDescentOptimizer(optimizer.Optimizer):
# pylint: disable=line-too-long
"""Optimizer that implements the proximal gradient descent algorithm.
See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
References:
Efficient Learning using Forward-Backward Splitting:
[Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting)
([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf))
"""
def __init__(self, learning_rate, l1_regularization_strength=0.0,

View File

@ -52,10 +52,12 @@ from tensorflow.python.util.tf_export import tf_export
@tf_export(v1=["train.RMSPropOptimizer"])
class RMSPropOptimizer(optimizer.Optimizer):
"""Optimizer that implements the RMSProp algorithm.
"""Optimizer that implements the RMSProp algorithm (Tieleman & Hinton, 2012).
See the
[paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
References:
Coursera slide 29:
Hinton, 2012
([pdf](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf))
"""
def __init__(self,