first steps in cleaning doc citations
parent a7575051f1
commit 8d8ea6b0bd
@@ -429,9 +429,6 @@ def inverse_time_decay(learning_rate,
 def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   """Applies cosine decay to the learning rate.
 
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   When training a model, it is often recommended to lower the learning rate as
   the training progresses. This function applies a cosine decay function
   to a provided initial learning rate. It requires a `global_step` value to
@@ -468,6 +465,11 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
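As a side note for readers of this diff: the schedule that `cosine_decay` documents can be written out in a few lines of plain Python. The sketch below restates the documented formula; the helper name and the printed example are made up for illustration and are not part of this change.

```python
import math

def cosine_decay_sketch(learning_rate, global_step, decay_steps, alpha=0.0):
    """Illustrative sketch of the cosine decay schedule described above.

    Not the TensorFlow op; just the documented formula in plain Python.
    """
    step = min(global_step, decay_steps)
    # Cosine ramp from 1.0 at step 0 down to 0.0 at `decay_steps`.
    cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
    # `alpha` sets a floor, expressed as a fraction of the initial rate.
    return learning_rate * ((1.0 - alpha) * cosine + alpha)

# Example: decay 0.1 over 1000 steps with a 1% floor.
print(cosine_decay_sketch(0.1, 500, 1000, alpha=0.01))  # ~0.0505
```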
@@ -494,9 +496,6 @@ def cosine_decay_restarts(learning_rate,
                           name=None):
   """Applies cosine decay with restarts to the learning rate.
 
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   When training a model, it is often recommended to lower the learning rate as
   the training progresses. This function applies a cosine decay function with
   restarts to a provided initial learning rate. It requires a `global_step`
@@ -536,6 +535,11 @@ def cosine_decay_restarts(learning_rate,
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
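The restart behaviour referenced by the SGDR citation can be sketched the same way. The helper below is a rough, assumed restatement of the warm-restart idea (each period is `t_mul` times longer and restarts at an amplitude scaled by `m_mul`); it is not the TensorFlow kernel, and the argument names are only borrowed from the public signature.

```python
import math

def sgdr_sketch(learning_rate, global_step, first_decay_steps,
                t_mul=2.0, m_mul=1.0, alpha=0.0):
    """Assumed sketch of a cosine schedule with warm restarts (SGDR)."""
    step, period, amplitude = float(global_step), float(first_decay_steps), 1.0
    # Walk past completed periods: each restart begins a longer cycle whose
    # peak learning rate is scaled down by `m_mul`.
    while step >= period:
        step -= period
        period *= t_mul
        amplitude *= m_mul
    cosine = 0.5 * (1.0 + math.cos(math.pi * step / period))
    return learning_rate * ((1.0 - alpha) * amplitude * cosine + alpha)

# The rate jumps back up ("restarts") once the first period of 10 steps ends.
print([round(sgdr_sketch(0.1, s, 10), 4) for s in (0, 9, 10, 11)])
```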
@@ -567,13 +571,6 @@ def linear_cosine_decay(learning_rate,
                         name=None):
   """Applies linear cosine decay to the learning rate.
 
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   Note that linear cosine decay is more aggressive than cosine decay and
   larger initial learning rates can typically be used.
 
@@ -618,6 +615,14 @@ def linear_cosine_decay(learning_rate,
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Neural Optimizer Search with Reinforcement Learning:
+      [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
+      ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
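For linear cosine decay, the cited Bello et al. schedule combines a linear ramp with a cosine oscillation. A hedged sketch follows, using what appear to be the documented default constants (`num_periods=0.5`, `alpha=0.0`, `beta=0.001`); treat those defaults as an assumption rather than a statement about the TensorFlow op.

```python
import math

def linear_cosine_decay_sketch(learning_rate, global_step, decay_steps,
                               num_periods=0.5, alpha=0.0, beta=0.001):
    """Illustrative sketch of the linear cosine decay schedule described above."""
    step = min(global_step, decay_steps)
    # Linear ramp from 1.0 down to 0.0 over `decay_steps`.
    linear = (decay_steps - step) / decay_steps
    # The cosine term completes `num_periods` full oscillations over `decay_steps`.
    cosine = 0.5 * (1.0 + math.cos(math.pi * 2.0 * num_periods * step / decay_steps))
    return learning_rate * ((alpha + linear) * cosine + beta)

print(linear_cosine_decay_sketch(0.1, 1000, 1000))  # ends at learning_rate * beta
```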
@@ -651,13 +656,6 @@ def noisy_linear_cosine_decay(learning_rate,
                               name=None):
   """Applies noisy linear cosine decay to the learning rate.
 
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
   Note that linear cosine decay is more aggressive than cosine decay and
   larger initial learning rates can typically be used.
 
@@ -708,6 +706,14 @@ def noisy_linear_cosine_decay(learning_rate,
   Raises:
     ValueError: if `global_step` is not supplied.
 
+  References:
+    Neural Optimizer Search with Reinforcement Learning:
+      [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
+      ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
+    Stochastic Gradient Descent with Warm Restarts:
+      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
+      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
+
   @compatibility(eager)
   When eager execution is enabled, this function returns a function which in
   turn returns the decayed learning rate Tensor. This can be useful for changing
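The noisy variant adds zero-mean noise to the linear term. The sketch below assumes Gaussian noise whose variance shrinks roughly as `initial_variance / (1 + step) ** variance_decay`, which is how the argument names read; consult the TensorFlow documentation for the authoritative formula.

```python
import math
import random

def noisy_linear_cosine_decay_sketch(learning_rate, global_step, decay_steps,
                                     initial_variance=1.0, variance_decay=0.55,
                                     num_periods=0.5, alpha=0.0, beta=0.001,
                                     rng=None):
    """Assumed sketch: linear cosine decay plus decaying Gaussian noise."""
    rng = rng or random.Random(0)
    step = min(global_step, decay_steps)
    linear = (decay_steps - step) / decay_steps
    cosine = 0.5 * (1.0 + math.cos(math.pi * 2.0 * num_periods * step / decay_steps))
    # Assumption: the noise variance shrinks as training progresses.
    variance = initial_variance / (1.0 + step) ** variance_decay
    eps = rng.gauss(0.0, math.sqrt(variance))
    return learning_rate * ((alpha + linear + eps) * cosine + beta)
```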
@@ -54,17 +54,21 @@ class MomentumOptimizer(optimizer.Optimizer):
     name: Optional name prefix for the operations created when applying
       gradients. Defaults to "Momentum".
     use_nesterov: If `True` use Nesterov Momentum.
-      See [Sutskever et al., 2013](
-      http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+      See (Sutskever et al., 2013).
       This implementation always computes gradients at the value of the
       variable(s) passed to the optimizer. Using Nesterov Momentum makes the
       variable(s) track the values called `theta_t + mu*v_t` in the paper.
-      This implementation is an approximation of the original formula, valid
-      for high values of momentum. It will compute the "adjusted gradient"
-      in NAG by assuming that the new gradient will be estimated by the
-      current average gradient plus the product of momentum and the change
+      This implementation is an approximation of the original formula, valid
+      for high values of momentum. It will compute the "adjusted gradient"
+      in NAG by assuming that the new gradient will be estimated by the
+      current average gradient plus the product of momentum and the change
       in the average gradient.
 
+  References:
+    On the importance of initialization and momentum in deep learning:
+      [Sutskever et al., 2013](http://proceedings.mlr.press/v28/sutskever13.html)
+      ([pdf](http://proceedings.mlr.press/v28/sutskever13.pdf))
+
   @compatibility(eager)
   When eager execution is enabled, `learning_rate` and `momentum` can each be
   a callable that takes no arguments and returns the actual value to use. This
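The "adjusted gradient" wording in this docstring corresponds to the accumulator form of Nesterov momentum. Below is a small NumPy sketch of that update; it is separate from the diff and is not the actual C++ training kernel.

```python
import numpy as np

def momentum_update(var, grad, accum, lr=0.01, momentum=0.9, use_nesterov=True):
    """One (Nesterov) momentum step in accumulator form, as a sketch."""
    accum = momentum * accum + grad              # running "velocity"
    if use_nesterov:
        # Adjusted gradient: current gradient plus momentum times the velocity,
        # so the variable tracks `theta_t + mu*v_t` from the paper.
        var = var - lr * (grad + momentum * accum)
    else:
        var = var - lr * accum
    return var, accum

w, v = np.array([1.0, -2.0]), np.zeros(2)
w, v = momentum_update(w, np.array([0.5, 0.5]), v)
```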
@@ -46,8 +46,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
   `zero_debias` optionally enables scaling by the mathematically correct
   debiasing factor of
     1 - decay ** num_updates
-  See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
-  (https://arxiv.org/abs/1412.6980).
+  See Section 3 of (Kingma et al., 2015) for more details.
 
   The names of the debias shadow variables, by default, include both the scope
   they were created in and the scope of the variables they debias. They are also
@@ -72,12 +71,17 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     value: A tensor with the same shape as 'variable'.
     decay: A float Tensor or float value. The moving average decay.
     zero_debias: A python bool. If true, assume the variable is 0-initialized
-      and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
+      and unbias it, as in (Kingma et al., 2015). See docstring in
       `_zero_debias` for more details.
     name: Optional name of the returned operation.
 
   Returns:
     A tensor which if evaluated will compute and return the new moving average.
+
+  References:
+    A Method for Stochastic Optimization:
+      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
+      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))
   """
   def update_fn(v, value, decay=decay):
     decay = ops.convert_to_tensor(1.0 - decay, name="decay")
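The `1 - decay ** num_updates` debiasing factor mentioned in this docstring is easy to see in isolation. A minimal sketch, assuming a zero-initialized scalar EMA; the helper name is made up for illustration.

```python
def debiased_ema(values, decay=0.99):
    """Zero-initialized EMA with the `1 - decay ** t` correction described above."""
    ema = 0.0
    for t, v in enumerate(values, start=1):
        ema = decay * ema + (1.0 - decay) * v   # biased toward the zero init
        yield ema / (1.0 - decay ** t)          # debiased estimate

# With a constant input the debiased estimate is correct from the first step.
print(list(debiased_ema([5.0, 5.0, 5.0])))  # ~[5.0, 5.0, 5.0]
```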
@@ -176,7 +180,7 @@ def _zero_debias(unbiased_var, value, decay):
   All exponential moving averages initialized with Tensors are initialized to 0,
   and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
   similarly biased. This function creates the debias updated amount according to
-  a scale factor, as in https://arxiv.org/abs/1412.6980.
+  a scale factor, as in (Kingma et al., 2015).
 
   To demonstrate the bias the results from 0-initialization, take an EMA that
   was initialized to `0` with decay `b`. After `t` timesteps of seeing the
@@ -201,6 +205,11 @@ def _zero_debias(unbiased_var, value, decay):
   Returns:
     The amount that the unbiased variable should be updated. Computing this
     tensor will also update the shadow variables appropriately.
+
+  References:
+    A Method for Stochastic Optimization:
+      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
+      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))
   """
   with variable_scope.variable_scope(
       unbiased_var.name[:-len(":0")], values=[unbiased_var,
@@ -31,7 +31,13 @@ class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
 
-  See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
+  References:
+    Adaptive Subgradient Methods for Online Learning and Stochastic Optimization:
+      [Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html)
+      ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
+    Efficient Learning using Forward-Backward Splitting:
+      [Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting)
+      ([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf))
   """
 
   def __init__(self, learning_rate, initial_accumulator_value=0.1,
@@ -32,7 +32,10 @@ class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
 
-  See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
+  References:
+    Efficient Learning using Forward-Backward Splitting:
+      [Duchi et al., 2009](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting)
+      ([pdf](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf))
   """
 
   def __init__(self, learning_rate, l1_regularization_strength=0.0,
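Forward-backward splitting, cited for both proximal optimizers above, is a gradient step followed by a proximal (shrinkage) step. Below is a NumPy sketch of that idea with L1/L2 terms; the exact TensorFlow kernels may differ in details such as learning-rate scaling.

```python
import numpy as np

def proximal_sgd_step(w, grad, lr=0.1, l1=0.01, l2=0.0):
    """Sketch of one forward-backward splitting step with L1/L2 regularization."""
    w_half = w - lr * grad                          # forward (gradient) step
    # Backward (proximal) step: soft-thresholding shrinks each coordinate
    # toward zero, and the L2 term rescales the result.
    shrink = np.maximum(np.abs(w_half) - lr * l1, 0.0)
    return np.sign(w_half) * shrink / (1.0 + lr * l2)

w = np.array([0.5, -0.001, 2.0])
print(proximal_sgd_step(w, np.zeros(3)))  # the tiny weight is clipped to exactly 0
```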
@@ -52,10 +52,12 @@ from tensorflow.python.util.tf_export import tf_export
 
 @tf_export(v1=["train.RMSPropOptimizer"])
 class RMSPropOptimizer(optimizer.Optimizer):
-  """Optimizer that implements the RMSProp algorithm.
+  """Optimizer that implements the RMSProp algorithm (Tielemans et al. 2012).
 
-  See the
-  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  References:
+    Coursera slide 29:
+    Hinton, 2012
+    ([pdf](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf))
   """
 
   def __init__(self,