Clean up optimizer docstrings.
PiperOrigin-RevId: 304544799
Change-Id: Ic83206c54cbf8437a4d6ff693f139412b5bdcee8
parent 8a370a0077
commit f6302e4ec7
tensorflow/python/keras/optimizer_v2/adadelta.py

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adadelta for TensorFlow."""
+"""Adadelta optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -34,23 +34,9 @@ class Adadelta(optimizer_v2.OptimizerV2):
 
   Adadelta optimization is a stochastic gradient descent method that is based on
   adaptive learning rate per dimension to address two drawbacks:
-  1) the continual decay of learning rates throughout training
-  2) the need for a manually selected global learning rate
 
-  Two accumulation steps are required:
-  1) the accumulation of gradients squared,
-  2) the accumulation of updates squared.
-
-  Initialization:
-
-  $$E[g^2]_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
-  $$E[\Delta x^2]_0 := 0 \text{(Initialize 2nd order variable update)}$$
-
-  $$t := t + 1$$
-  $$E[g^2]_t := \rho * E[g^2]_{t-1} + (1 - \rho) * g^2$$
-  $$\Delta x_t = -RMS[\Delta x]_{t-1} * g_t / RMS[g]_t$$
-  $$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$
-  $$x_t := x_{t-1} + \Delta x_{t}$$
+  - The continual decay of learning rates throughout training
+  - The need for a manually selected global learning rate
 
   Adadelta is a more robust extension of Adagrad that adapts learning rates
   based on a moving window of gradient updates, instead of accumulating all

@@ -59,16 +45,22 @@ class Adadelta(optimizer_v2.OptimizerV2):
   don't have to set an initial learning rate. In this version, initial
   learning rate can be set, as in most other Keras optimizers.
 
-  @compatibility(eager)
-  When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
-  each be a callable that takes no arguments and returns the actual value to
-  use. This can be useful for changing these values across different
-  invocations of optimizer functions.
-  @end_compatibility
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+      To match the exact form in the original paper use 1.0.
+    rho: A `Tensor` or a floating point value. The decay rate.
+    epsilon: A `Tensor` or a floating point value. A constant epsilon used
+      to better condition the grad update.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"Adadelta"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
-  References
-    See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
-    ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+  Reference:
+    - [Zeiler, 2012](http://arxiv.org/abs/1212.5701)
   """
 
   _HAS_AGGREGATE_GRAD = True

@@ -79,23 +71,6 @@ class Adadelta(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Adadelta',
                **kwargs):
-    """Construct a new Adadelta optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-        To match the exact form in the original paper use 1.0.
-      rho: A `Tensor` or a floating point value. The decay rate.
-      epsilon: A `Tensor` or a floating point value. A constant epsilon used
-        to better conditioning the grad update.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "Adadelta".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(Adadelta, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
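
For readers checking the new docstring against the deleted math: one Adadelta step fits in a few lines of NumPy. The helper name and state layout below are illustrative, not TensorFlow API.

```python
import numpy as np

def adadelta_step(w, g, acc_grad, acc_update, rho=0.95, epsilon=1e-7, lr=1.0):
    """One Adadelta step per the deleted docstring formulas (a sketch)."""
    acc_grad = rho * acc_grad + (1 - rho) * g**2            # E[g^2]_t
    # RMS[x] = sqrt(E[x^2] + epsilon), as in Zeiler, 2012.
    delta = -np.sqrt(acc_update + epsilon) / np.sqrt(acc_grad + epsilon) * g
    acc_update = rho * acc_update + (1 - rho) * delta**2    # E[dx^2]_t
    return w + lr * delta, acc_grad, acc_update             # lr=1.0 matches the paper
```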

tensorflow/python/keras/optimizer_v2/adagrad.py

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adagrad for TensorFlow."""
+"""Adagrad optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -39,26 +39,22 @@ class Adagrad(optimizer_v2.OptimizerV2):
   updated during training. The more updates a parameter receives,
   the smaller the updates.
 
-  Initialization:
-  $$accum_{g_0} := \text{initial_accumulator_value}$$
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+    initial_accumulator_value: A floating point value.
+      Starting value for the accumulators, must be non-negative.
+    epsilon: A small floating point value to avoid zero denominator.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"Adagrad"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
-  Update step:
-  $$t := t + 1$$
-  $$accum_{g_t} := accum_{g_{t-1}} + g^2$$
-  $$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$
-
-  @compatibility(eager)
-  When eager execution is enabled, `learning_rate` can be a callable that
-  takes no arguments and returns the actual value to use. This can be useful
-  for changing these values across different invocations of optimizer
-  functions.
-  @end_compatibility
-
-  References:
-
-  * [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
-  * [Introduction]
-    (https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+  Reference:
+    - [Duchi et al., 2011](
+      http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
   """
 
   _HAS_AGGREGATE_GRAD = True

@@ -69,25 +65,6 @@ class Adagrad(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Adagrad',
                **kwargs):
-    """Construct a new Adagrad optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-      initial_accumulator_value: A floating point value.
-        Starting value for the accumulators, must be non-negative.
-      epsilon: A small floating point value to avoid zero denominator.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "Adagrad".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-
-    Raises:
-      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
-    """
     if initial_accumulator_value < 0.0:
       raise ValueError('initial_accumulator_value must be non-negative: %s' %
                        initial_accumulator_value)
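
The removed update step is compact enough to restate as a NumPy sketch (names are illustrative, not TensorFlow API):

```python
import numpy as np

def adagrad_step(w, g, accum, lr=0.001, epsilon=1e-7):
    """One Adagrad step; accum is the running sum of squared gradients."""
    accum = accum + g**2
    return w - lr * g / (np.sqrt(accum) + epsilon), accum
```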

tensorflow/python/keras/optimizer_v2/adam.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adam for TensorFlow."""
+"""Adam optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -35,50 +36,58 @@ class Adam(optimizer_v2.OptimizerV2):
 
   Adam optimization is a stochastic gradient descent method that is based on
   adaptive estimation of first-order and second-order moments.
-  According to the paper
-  [Adam: A Method for Stochastic Optimization. Kingma et al.,
-  2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
+  According to
+  [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
+  the method is "*computationally
   efficient, has little memory requirement, invariant to diagonal rescaling of
   gradients, and is well suited for problems that are large in terms of
   data/parameters*".
 
-  For AMSGrad see [On The Convergence Of Adam And Beyond.
-  Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ).
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+      that takes no arguments and returns the actual value to use. The
+      learning rate. Defaults to 0.001.
+    beta_1: A float value or a constant float tensor, or a callable
+      that takes no arguments and returns the actual value to use. The
+      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
+    beta_2: A float value or a constant float tensor, or a callable
+      that takes no arguments and returns the actual value to use. The
+      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
+    epsilon: A small constant for numerical stability. This epsilon is
+      "epsilon hat" in the Kingma and Ba paper (in the formula just before
+      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
+      1e-7.
+    amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
+      the paper "On the Convergence of Adam and beyond". Defaults to `False`.
+    name: Optional name for the operations created when applying gradients.
+      Defaults to `"Adam"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
-  **If amsgrad = False**:
+  Usage:
 
-  initialize $m_0$ as 1st moment vector
-  initialize $v_0$ as 2nd moment vector
+  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
+  >>> var1 = tf.Variable(10.0)
+  >>> loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
+  >>> step_count = opt.minimize(loss, [var1]).numpy()
+  >>> # The first step is `-learning_rate*sign(grad)`
+  >>> var1.numpy()
+  9.9
 
-  The update rule for $\theta$ with gradient $g$ uses an optimization
-  described at the end of section 2 of the paper:
+  Reference:
+    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+    - [Reddi et al., 2018](
+        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
 
-  $$lr_t = \mathrm{learning\_rate} *
-    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
-  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-
-  **If amsgrad = True**:
-
-  initialize $m_0$ as 1st moment vector
-  initialize $v_0$ as 2nd moment vector
-  initialize $\hat{v}_0$ as 2nd moment vector
-
-  The update rule for $\theta$ with gradient $g$ uses an optimization
-  described at the end of section 2 of the paper:
-
-  $$lr_t = \mathrm{learning\_rate} *
-    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
-
-  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
-  $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
-  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
+  Notes:
 
   The default value of 1e-7 for epsilon might not be a good default in
   general. For example, when training an Inception network on ImageNet a
-  current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+  current good choice is 1.0 or 0.1. Note that since Adam uses the
   formulation just before Section 2.1 of the Kingma and Ba paper rather than
   the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
   hat" in the paper.

@@ -91,16 +100,6 @@ class Adam(optimizer_v2.OptimizerV2):
   accumulator. This means that the sparse behavior is equivalent to the dense
   behavior (in contrast to some momentum implementations which ignore momentum
   unless a variable slice was actually used).
 
-  Usage:
-
-  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
-  >>> var1 = tf.Variable(10.0)
-  >>> loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
-  >>> step_count = opt.minimize(loss, [var1]).numpy()
-  >>> # The first step is `-learning_rate*sign(grad)`
-  >>> var1.numpy()
-  9.9
   """
 
   _HAS_AGGREGATE_GRAD = True

@@ -113,34 +112,6 @@ class Adam(optimizer_v2.OptimizerV2):
                amsgrad=False,
                name='Adam',
                **kwargs):
-    """Construct a new Adam optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-        that takes no arguments and returns the actual value to use. The
-        learning rate. Defaults to 0.001.
-      beta_1: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 1st moment estimates. Defaults to 0.9.
-      beta_2: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
-      epsilon: A small constant for numerical stability. This epsilon is
-        "epsilon hat" in the Kingma and Ba paper (in the formula just before
-        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-        1e-7.
-      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
-        the paper "On the Convergence of Adam and beyond". Defaults to `False`.
-      name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
-
     super(Adam, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)

@@ -329,7 +300,7 @@ class NonFusedAdam(optimizer_v2.OptimizerV2):
 
   The default value of 1e-7 for epsilon might not be a good default in
   general. For example, when training an Inception network on ImageNet a
-  current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+  current good choice is 1.0 or 0.1. Note that since Adam uses the
   formulation just before Section 2.1 of the Kingma and Ba paper rather than
   the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
   hat" in the paper.
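
The `amsgrad=False` formulas deleted above translate directly into NumPy; this sketch (function name and state layout mine) is the bias-corrected update the old docstring described:

```python
import numpy as np

def adam_step(w, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
    """One Adam step (no AMSGrad) per the deleted docstring formulas."""
    t += 1
    lr_t = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)  # bias correction
    m = beta_1 * m + (1 - beta_1) * g
    v = beta_2 * v + (1 - beta_2) * g**2
    return w - lr_t * m / (np.sqrt(v) + epsilon), m, v, t
```

For `amsgrad=True`, the only change is keeping `v_hat = max(v_hat, v)` and dividing by `sqrt(v_hat)` instead of `sqrt(v)`.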

tensorflow/python/keras/optimizer_v2/adamax.py

@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Adamax for TensorFlow."""
+"""Adamax optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -39,27 +39,27 @@ class Adamax(optimizer_v2.OptimizerV2):
 
   Initialization:
 
-  ```
-  m_0 <- 0 (Initialize initial 1st moment vector)
-  v_0 <- 0 (Initialize the exponentially weighted infinity norm)
-  t <- 0 (Initialize timestep)
+  ```python
+  m = 0  # Initialize initial 1st moment vector
+  v = 0  # Initialize the exponentially weighted infinity norm
+  t = 0  # Initialize timestep
   ```
 
-  The update rule for `variable` with gradient `g` uses an optimization
+  The update rule for parameter `w` with gradient `g` is
   described at the end of section 7.1 of the paper:
 
-  ```
-  t <- t + 1
-  m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-  v_t <- max(beta2 * v_{t-1}, abs(g))
-  variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+  ```python
+  t += 1
+  m = beta1 * m + (1 - beta1) * g
+  v = max(beta2 * v, abs(g))
+  current_lr = learning_rate / (1 - beta1 ** t)
+  w = w - current_lr * m / (v + epsilon)
   ```
 
-  Similar to AdamOptimizer, the epsilon is added for numerical stability
-  (especially to get rid of division by zero when v_t = 0).
+  Similarly to `Adam`, the epsilon is added for numerical stability
+  (especially to get rid of division by zero when `v_t == 0`).
 
-  Contrast to AdamOptimizer, the sparse implementation of this algorithm
+  In contrast to `Adam`, the sparse implementation of this algorithm
   (used when the gradient is an IndexedSlices object, typically because of
   `tf.gather` or an embedding lookup in the forward pass) only updates
   variable slices and corresponding `m_t`, `v_t` terms when that part of

@@ -68,9 +68,23 @@ class Adamax(optimizer_v2.OptimizerV2):
   implementations which ignore momentum unless a variable slice was actually
   used).
 
-  References
-    see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
-    ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+    beta_1: A float value or a constant float tensor. The exponential decay
+      rate for the 1st moment estimates.
+    beta_2: A float value or a constant float tensor. The exponential decay
+      rate for the exponentially weighted infinity norm.
+    epsilon: A small constant for numerical stability.
+    name: Optional name for the operations created when applying gradients.
+      Defaults to `"Adamax"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
+
+  Reference:
+    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
   """
 
   _HAS_AGGREGATE_GRAD = True

@@ -82,24 +96,6 @@ class Adamax(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Adamax',
                **kwargs):
-    """Construct a new Adamax optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-      beta_1: A float value or a constant float tensor. The exponential decay
-        rate for the 1st moment estimates.
-      beta_2: A float value or a constant float tensor. The exponential decay
-        rate for the exponentially weighted infinity norm.
-      epsilon: A small constant for numerical stability.
-      name: Optional name for the operations created when applying gradients.
-        Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(Adamax, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
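
The new pseudocode block runs as-is once wrapped into a function; this NumPy sketch (names mine) makes the per-step state explicit:

```python
import numpy as np

def adamax_step(w, g, m, v, t, learning_rate=0.001, beta1=0.9, beta2=0.999,
                epsilon=1e-7):
    """One Adamax step, per section 7.1 of Kingma et al., 2014."""
    t += 1
    m = beta1 * m + (1 - beta1) * g         # 1st moment estimate
    v = np.maximum(beta2 * v, np.abs(g))    # exponentially weighted inf-norm
    current_lr = learning_rate / (1 - beta1**t)
    return w - current_lr * m / (v + epsilon), m, v, t
```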

tensorflow/python/keras/optimizer_v2/ftrl.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Ftrl-proximal for TensorFlow."""
+"""Ftrl-proximal optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -35,26 +36,32 @@ class Ftrl(optimizer_v2.OptimizerV2):
   above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
   loss function).
 
-  Initialization:
-  $$t = 0$$
-  $$n_{0} = 0$$
-  $$\sigma_{0} = 0$$
-  $$z_{0} = 0$$
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
+    learning_rate_power: A float value, must be less or equal to zero.
+      Controls how the learning rate decreases during training. Use zero for
+      a fixed learning rate.
+    initial_accumulator_value: The starting value for accumulators.
+      Only zero or positive values are allowed.
+    l1_regularization_strength: A float value, must be greater than or
+      equal to zero.
+    l2_regularization_strength: A float value, must be greater than or
+      equal to zero.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"Ftrl"`.
+    l2_shrinkage_regularization_strength: A float value, must be greater than
+      or equal to zero. This differs from L2 above in that the L2 above is a
+      stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
+      When input is sparse shrinkage will only happen on the active weights.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
-  Update ($$i$$ is variable index):
-  $$t = t + 1$$
-  $$n_{t,i} = n_{t-1,i} + g_{t,i}^{2}$$
-  $$\sigma_{t,i} = (\sqrt{n_{t,i}} - \sqrt{n_{t-1,i}}) / \alpha$$
-  $$z_{t,i} = z_{t-1,i} + g_{t,i} - \sigma_{t,i} * w_{t,i}$$
-  $$w_{t,i} = - ((\beta+\sqrt{n+{t}}) / \alpha + \lambda_{2})^{-1} * (z_{i} -
-              sgn(z_{i}) * \lambda_{1}) if \abs{z_{i}} > \lambda_{i} else 0$$
-
-  Check the documentation for the l2_shrinkage_regularization_strength
-  parameter for more details when shrinkage is enabled, where gradient is
-  replaced with gradient_with_shrinkage.
-
-  References: See
-  [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+  Reference:
+    - [paper](
+      https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
   """
 
   def __init__(self,

@@ -66,44 +73,6 @@ class Ftrl(optimizer_v2.OptimizerV2):
                name='Ftrl',
                l2_shrinkage_regularization_strength=0.0,
                **kwargs):
-    r"""Construct a new FTRL optimizer.
-
-    Args:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
-      learning_rate_power: A float value, must be less or equal to zero.
-        Controls how the learning rate decreases during training. Use zero for
-        a fixed learning rate.
-      initial_accumulator_value: The starting value for accumulators.
-        Only zero or positive values are allowed.
-      l1_regularization_strength: A float value, must be greater than or
-        equal to zero.
-      l2_regularization_strength: A float value, must be greater than or
-        equal to zero.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "Ftrl".
-      l2_shrinkage_regularization_strength: A float value, must be greater than
-        or equal to zero. This differs from L2 above in that the L2 above is a
-        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
-        The FTRL formulation can be written as:
-        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
-        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
-        function w.r.t. the weights w.
-        Specifically, in the absence of L1 regularization, it is equivalent to
-        the following update rule:
-        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
-                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
-        where lr_t is the learning rate at t.
-        When input is sparse shrinkage will only happen on the active weights.\
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
     super(Ftrl, self).__init__(name, **kwargs)
 
     if initial_accumulator_value < 0.0:
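
For orientation, the deleted per-coordinate update can be sketched as follows; the names are mine, and the deleted formula's apparent typos (`n+{t}` for `n_{t}`, `\lambda_{i}` for `\lambda_{1}`) are read in their intended form:

```python
import numpy as np

def ftrl_step(w, g, n, z, alpha=0.001, beta=1.0, l1=0.0, l2=0.0):
    """One FTRL-proximal step per the deleted docstring formulas (a sketch)."""
    n_new = n + g**2
    sigma = (np.sqrt(n_new) - np.sqrt(n)) / alpha   # alpha is the learning rate
    z = z + g - sigma * w
    # Closed-form solution: coordinates with |z| <= l1 are set exactly to 0.
    w = np.where(np.abs(z) > l1,
                 -(z - np.sign(z) * l1) / ((beta + np.sqrt(n_new)) / alpha + l2),
                 0.0)
    return w, n_new, z
```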

tensorflow/python/keras/optimizer_v2/gradient_descent.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Momentum for TensorFlow."""
+"""SGD optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -27,17 +28,45 @@ from tensorflow.python.util.tf_export import keras_export
 
 @keras_export("keras.optimizers.SGD")
 class SGD(optimizer_v2.OptimizerV2):
-  r"""Stochastic gradient descent and momentum optimizer.
+  r"""Gradient descent (with momentum) optimizer.
 
-  The update rule for $\theta$ with gradient $g$ when `momentum` is 0.0:
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * g_t$$
+  Update rule for parameter `w` with gradient `g` when `momentum` is 0:
 
-  The update rule when `momentum` is larger than 0.0:
-  $$v_t = \mathrm{momentum} * v_{t-1} - \mathrm{learning\_rate} * g_t$$
-  $$\theta_t = \theta_{t-1} + v_t$$
-  if `nesterov` is False, gradient is evaluated at $\theta_t$.
-  if `nesterov` is True, gradient is evaluated at $\theta_t + momentum * v_t$,
-  and the variables always store $\theta + m v$ instead of $theta$
+  ```python
+  w = w - learning_rate * g
+  ```
+
+  Update rule when `momentum` is larger than 0:
+
+  ```python
+  velocity = momentum * velocity - learning_rate * g
+  w = w + velocity
+  ```
+
+  When `nesterov=True`, this rule becomes:
+
+  ```python
+  velocity = momentum * velocity - learning_rate * g
+  w = w + momentum * velocity - learning_rate * g
+  ```
+
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+      that takes no arguments and returns the actual value to use. The
+      learning rate. Defaults to 0.01.
+    momentum: float hyperparameter >= 0 that accelerates gradient descent in
+      the relevant direction and dampens oscillations. Defaults to 0, i.e.,
+      vanilla gradient descent.
+    nesterov: boolean. Whether to apply Nesterov momentum.
+      Defaults to `False`.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"SGD"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
   Usage:
 

@@ -45,7 +74,7 @@ class SGD(optimizer_v2.OptimizerV2):
   >>> var = tf.Variable(1.0)
   >>> loss = lambda: (var ** 2)/2.0         # d(loss)/d(var1) = var1
   >>> step_count = opt.minimize(loss, [var]).numpy()
-  >>> # Step is `-learning_rate*grad`
+  >>> # Step is `- learning_rate * grad`
   >>> var.numpy()
   0.9
 

@@ -53,7 +82,7 @@ class SGD(optimizer_v2.OptimizerV2):
   >>> var = tf.Variable(1.0)
   >>> val0 = var.value()
   >>> loss = lambda: (var ** 2)/2.0         # d(loss)/d(var1) = var1
-  >>> # First step is `-learning_rate*grad`
+  >>> # First step is `- learning_rate * grad`
   >>> step_count = opt.minimize(loss, [var]).numpy()
   >>> val1 = var.value()
   >>> (val0 - val1).numpy()

@@ -64,13 +93,8 @@ class SGD(optimizer_v2.OptimizerV2):
   >>> (val1 - val2).numpy()
   0.18
 
-  Some of the args below are hyperparameters, where a hyperparameter is
-  defined as a scalar Tensor, a regular Python value, or a callable (which
-  will be evaluated when `apply_gradients` is called) returning a scalar
-  Tensor or a Python value.
-
-  # References
-    nesterov = True, See [Sutskever et al., 2013](
+  Reference:
+    - For `nesterov=True`, see [Sutskever et al., 2013](
       http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
   """
 

@@ -82,25 +106,6 @@ class SGD(optimizer_v2.OptimizerV2):
                nesterov=False,
                name="SGD",
                **kwargs):
-    """Construct a new Stochastic Gradient Descent or Momentum optimizer.
-
-    Arguments:
-      learning_rate: A `Tensor`, floating point value, or a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-        that takes no arguments and returns the actual value to use. The
-        learning rate. Defaults to 0.01.
-      momentum: float hyperparameter >= 0 that accelerates SGD in the relevant
-        direction and dampens oscillations. Defaults to 0.0, i.e., SGD.
-      nesterov: boolean. Whether to apply Nesterov momentum.
-        Defaults to `False`.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to 'SGD'.
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
     super(SGD, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)
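
Both momentum branches fit in a few lines; this sketch mirrors the rules above (helper name mine, plain floats or NumPy arrays both work):

```python
def sgd_step(w, g, velocity, learning_rate=0.01, momentum=0.0, nesterov=False):
    """One SGD step with optional (possibly Nesterov) momentum."""
    if momentum == 0.0:
        return w - learning_rate * g, velocity
    velocity = momentum * velocity - learning_rate * g
    if nesterov:
        # Look-ahead form: the stored variable is w + momentum * velocity.
        return w + momentum * velocity - learning_rate * g, velocity
    return w + velocity, velocity
```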

tensorflow/python/keras/optimizer_v2/nadam.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Nadam for TensorFlow."""
+"""Nadam optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -36,29 +37,22 @@ class Nadam(optimizer_v2.OptimizerV2):
   Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
   Nesterov momentum.
 
-  Initialization:
+  Args:
+    learning_rate: A Tensor or a floating point value. The learning rate.
+    beta_1: A float value or a constant float tensor. The exponential decay
+      rate for the 1st moment estimates.
+    beta_2: A float value or a constant float tensor. The exponential decay
+      rate for the exponentially weighted infinity norm.
+    epsilon: A small constant for numerical stability.
+    name: Optional name for the operations created when applying gradients.
+      Defaults to `"Nadam"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
-  $$m_0 := 0 \text{(Initialize 1st moment vector)}$$
-  $$v_0 := 0 \text{(Initialize 2nd moment vector)}$$
-  $$mu_0 := 1$$
-  $$t := 0 \text{(Initialize timestep)}$$
-
-  Computes:
-  $$t := t + 1$$
-  $$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
-  $$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
-  $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
-  $$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
-  $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
-  $$v' := v_t / (1 - \beta_2^t)$$
-  $$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
-  $$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$
-
-  gradient is evaluated at theta(t) + momentum * v(t), and the variables always
-  store theta + beta_1 * m / sqrt(v) instead of theta.
-
-  References
-    See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  Reference:
+    - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
   """
 
   _HAS_AGGREGATE_GRAD = True

@@ -70,24 +64,6 @@ class Nadam(optimizer_v2.OptimizerV2):
                epsilon=1e-7,
                name='Nadam',
                **kwargs):
-    """Construct a new Nadam optimizer.
-
-    Args:
-      learning_rate: A Tensor or a floating point value. The learning rate.
-      beta_1: A float value or a constant float tensor. The exponential decay
-        rate for the 1st moment estimates.
-      beta_2: A float value or a constant float tensor. The exponential decay
-        rate for the exponentially weighted infinity norm.
-      epsilon: A small constant for numerical stability.
-      name: Optional name for the operations created when applying gradients.
-        Defaults to "Nadam".
-      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
-        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
-        gradients by value, `decay` is included for backward compatibility to
-        allow time inverse decay of learning rate. `lr` is included for backward
-        compatibility, recommended to use `learning_rate` instead.
-    """
 
     # Backwards compatibility with keras NAdam optimizer.
     kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
     learning_rate = kwargs.get('lr', learning_rate)
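
The removed Nadam math is the trickiest of the batch, so a sketch may help. It follows the deleted formulas, carrying a running product of the momentum schedule; all names are mine and worth double-checking against Dozat, 2015:

```python
import numpy as np

def nadam_step(w, g, m, v, t, mu_prod, lr=0.001, beta_1=0.9, beta_2=0.999,
               epsilon=1e-7):
    """One Nadam step per the removed docstring formulas (a sketch)."""
    t += 1
    mu_t = beta_1 * (1 - 0.5 * 0.96**(0.004 * t))          # momentum schedule
    mu_next = beta_1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
    mu_prod = mu_prod * mu_t                               # prod of mu_1..mu_t
    g_hat = g / (1 - mu_prod)
    m = beta_1 * m + (1 - beta_1) * g
    m_hat = m / (1 - mu_prod * mu_next)
    v = beta_2 * v + (1 - beta_2) * g * g
    v_hat = v / (1 - beta_2**t)
    m_bar = (1 - mu_t) * g_hat + mu_next * m_hat
    w = w - lr * m_bar / (np.sqrt(v_hat) + epsilon)
    return w, m, v, t, mu_prod
```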

tensorflow/python/keras/optimizer_v2/optimizer_v2.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Version 2 of class Optimizer."""
 # pylint: disable=g-bad-name
 

@@ -79,11 +78,10 @@ def _deduplicate_indexed_slices(values, indices):
 @six.add_metaclass(abc.ABCMeta)
 @keras_export("keras.optimizers.Optimizer")
 class OptimizerV2(trackable.Trackable):
-  """Updated base class for optimizers.
+  """Base class for Keras optimizers.
 
-  This class defines the API to add Ops to train a model. You never use this
-  class directly, but instead instantiate one of its subclasses such as
-  `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`.
+  You should not use this class directly, but instead instantiate one of its
+  subclasses such as `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`, etc.
 
   ### Usage
 

@@ -101,7 +99,7 @@ class OptimizerV2(trackable.Trackable):
   opt.minimize(loss, var_list=[var1, var2])
   ```
 
-  ### Custom training loop with Keras models
+  ### Usage in custom training loops
 
   In Keras models, sometimes variables are created when the model is first
   called, instead of construction time. Examples include 1) sequential models

@@ -109,6 +107,7 @@ class OptimizerV2(trackable.Trackable):
   callable in these cases.
 
   Example:
 
   ```python
   opt = tf.keras.optimizers.SGD(learning_rate=0.1)
   model = tf.keras.Sequential()

@@ -120,7 +119,7 @@ class OptimizerV2(trackable.Trackable):
   opt.minimize(loss_fn, var_list_fn)
   ```
 
-  ### Processing gradients before applying them.
+  ### Processing gradients before applying them
 
   Calling `minimize()` takes care of both computing the gradients and
   applying them to the variables. If you want to process the gradients

@@ -150,7 +149,7 @@ class OptimizerV2(trackable.Trackable):
   opt.apply_gradients(zip(processed_grads, var_list))
   ```
 
-  ### Use with `tf.distribute.Strategy`.
+  ### Use with `tf.distribute.Strategy`
 
   This optimizer class is `tf.distribute.Strategy` aware, which means it
   automatically sums gradients across all replicas. To average gradients,

@@ -172,7 +171,7 @@ class OptimizerV2(trackable.Trackable):
   step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
   resulting in gradients that can be many times too big.
 
-  ### Variable Constraint
+  ### Variable Constraints
 
   All Keras optimizers respect variable constraints. If constraint function is
   passed to any variable, the constraint will be applied to the variable after

@@ -195,7 +194,7 @@ class OptimizerV2(trackable.Trackable):
   This can be useful if you want to log debug a training algorithm, report stats
   about the slots, etc.
 
-  ### Hyper parameters
+  ### Hyperparameters
 
   These are arguments passed to the optimizer subclass constructor
   (the `__init__` method), and then passed to `self._set_hyper()`.

@@ -203,7 +202,7 @@ class OptimizerV2(trackable.Trackable):
   callables. If they are callable, the callable will be called during
   `apply_gradients()` to get the value for the hyper parameter.
 
-  Hyper parameters can be overwritten through user code:
+  Hyperparameters can be overwritten through user code:
 
   Example:
 

@@ -220,7 +219,8 @@ class OptimizerV2(trackable.Trackable):
   opt.minimize(loss, var_list=[var1, var2])
   ```
 
-  ### Callable learning rate.
+  ### Callable learning rate
 
   Optimizer accepts a callable learning rate in two ways. The first way is
   through built-in or customized
   `tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will be

@@ -250,14 +250,17 @@ class OptimizerV2(trackable.Trackable):
   >>> opt.minimize(loss, var_list=[var])
   <tf.Variable...
 
-  ### Write a customized optimizer.
+  ### Creating a custom optimizer
 
   If you intend to create your own optimization algorithm, simply inherit from
   this class and override the following methods:
 
-    - _resource_apply_dense (update variable given gradient tensor is dense)
-    - _resource_apply_sparse (update variable given gradient tensor is sparse)
-    - _create_slots (if your optimizer algorithm requires additional variables)
-    - get_config (serialization of the optimizer, include all hyper parameters)
+    - `_resource_apply_dense` (update variable given gradient tensor is dense)
+    - `_resource_apply_sparse` (update variable given gradient tensor is sparse)
+    - `_create_slots`
+      (if your optimizer algorithm requires additional variables)
+    - `get_config`
+      (serialization of the optimizer, include all hyper parameters)
   """
 
   # Subclasses should set this to True unless they override `apply_gradients`
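
To make that override list concrete, here is a minimal, hypothetical subclass sketch (plain SGD needs no slot variables, so `_create_slots` is omitted). It leans on the protected `OptimizerV2` helpers and is not part of this commit:

```python
import tensorflow as tf

class MySGD(tf.keras.optimizers.Optimizer):
    """Sketch of a custom optimizer: plain SGD via the documented overrides."""

    def __init__(self, learning_rate=0.01, name="MySGD", **kwargs):
        super(MySGD, self).__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t = self._get_hyper("learning_rate", var.dtype.base_dtype)
        return var.assign_sub(lr_t * grad)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t = self._get_hyper("learning_rate", var.dtype.base_dtype)
        return self._resource_scatter_add(var, indices, -lr_t * grad)

    def get_config(self):
        config = super(MySGD, self).get_config()
        config.update(
            {"learning_rate": self._serialize_hyperparameter("learning_rate")})
        return config
```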

tensorflow/python/keras/optimizer_v2/rmsprop.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""RMSprop for TensorFlow."""
+"""RMSprop optimizer implementation."""
+# pylint: disable=g-classes-have-attributes
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

@@ -34,36 +35,37 @@ from tensorflow.python.util.tf_export import keras_export
 class RMSprop(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the RMSprop algorithm.
 
-  A detailed description of rmsprop.
-  - maintain a moving (discounted) average of the square of gradients
-  - divide gradient by the root of this average
+  The gist of RMSprop is to:
 
-  The default settings does not use momentum:
+  - Maintain a moving (discounted) average of the square of gradients
+  - Divide the gradient by the root of this average
 
-  $$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} *
-    g_t / \sqrt{rms_t + \epsilon}$$
-
-  Since $x/x^2 = sign(x)$, this is an smoothed approximation of:
-
-  $$ \theta_t = \theta_{t-1} - \mathrm{learning\_rate} * sign(g_t) $$
-
-  With momentum the update is:
-
-  $$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
-  $$mom_t = \mathrm{momentum} * mom_{t-1} + g_t / \sqrt{rms_t + \epsilon}$$
-  $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * mom_t$$
-
   This implementation of RMSprop uses plain momentum, not Nesterov momentum.
 
   The centered version additionally maintains a moving average of the
-  gradients, and uses that average to estimate the variance:
+  gradients, and uses that average to estimate the variance.
 
-  $$mg_t = \rho * mg_{t-1} + (1-\rho) * g_t$$
-  $$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
-  $$mom_t = \mathrm{momentum} * mom_{t-1} +
-    \mathrm{learning\_rate} * g_t / sqrt(rms_t - mg_t^2 + \epsilon)$$
-  $$\theta_t = \theta_{t-1} - mom_t$$
+  Args:
+    learning_rate: A `Tensor`, floating point value, or a schedule that is a
+      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+      that takes no arguments and returns the actual value to use. The
+      learning rate. Defaults to 0.001.
+    rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
+    momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+    epsilon: A small constant for numerical stability. This epsilon is
+      "epsilon hat" in the Kingma and Ba paper (in the formula just before
+      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
+      1e-7.
+    centered: Boolean. If `True`, gradients are normalized by the estimated
+      variance of the gradient; if False, by the uncentered second moment.
+      Setting this to `True` may help with training, but is slightly more
+      expensive in terms of computation and memory. Defaults to `False`.
+    name: Optional name prefix for the operations created when applying
+      gradients. Defaults to `"RMSprop"`.
+    **kwargs: Keyword arguments. Allowed to be one of
+      `"clipnorm"` or `"clipvalue"`.
+      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
+      gradients by value.
 
   Note that in the dense implementation of this algorithm, variables and their
   corresponding accumulators (momentum, gradient moving average, square

@@ -81,14 +83,14 @@ class RMSprop(optimizer_v2.OptimizerV2):
 
   >>> opt = tf.keras.optimizers.RMSprop(learning_rate=0.1)
   >>> var1 = tf.Variable(10.0)
-  >>> loss = lambda: (var1 ** 2)/2.0        # d(loss)/d(var1) = var1
+  >>> loss = lambda: (var1 ** 2) / 2.0      # d(loss) / d(var1) = var1
   >>> step_count = opt.minimize(loss, [var1]).numpy()
   >>> var1.numpy()
   9.683772
 
-  References
-    See ([pdf]
-    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  Reference:
+    - [Hinton, 2012](
+      http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
   """
 
   _HAS_AGGREGATE_GRAD = True
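
The deleted momentum form is easy to restate; this sketch follows those formulas for the non-centered case (names mine; the centered variant replaces `rms` inside the root by `rms - mg**2`, where `mg` is a moving average of `g`):

```python
import numpy as np

def rmsprop_step(w, g, rms, mom, lr=0.001, rho=0.9, momentum=0.0, epsilon=1e-7):
    """One non-centered RMSprop step per the deleted docstring formulas."""
    rms = rho * rms + (1 - rho) * g**2              # moving average of g^2
    mom = momentum * mom + g / np.sqrt(rms + epsilon)
    return w - lr * mom, rms, mom
```

With `momentum=0.0` this reduces to `w - lr * g / sqrt(rms + epsilon)`, the default setting described above.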