Clean up optimizer docstrings.

PiperOrigin-RevId: 304544799
Change-Id: Ic83206c54cbf8437a4d6ff693f139412b5bdcee8

parent 8a370a0077
commit f6302e4ec7
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adadelta for TensorFlow."""
"""Adadelta optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -34,23 +34,9 @@ class Adadelta(optimizer_v2.OptimizerV2):

Adadelta optimization is a stochastic gradient descent method that is based on
adaptive learning rate per dimension to address two drawbacks:
1) the continual decay of learning rates throughout training
2) the need for a manually selected global learning rate

Two accumulation steps are required:
1) the accumulation of gradients squared,
2) the accumulation of updates squared.

Initialization:

$$E[g^2]_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
$$E[\Delta x^2]_0 := 0 \text{(Initialize 2nd order variable update)}$$

$$t := t + 1$$
$$E[g^2]_t := \rho * E[g^2]_{t-1} + (1 - \rho) * g^2$$
$$\Delta x_t = -RMS[\Delta x]_{t-1} * g_t / RMS[g]_t$$
$$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$
$$x_t := x_{t-1} + \Delta x_{t}$$
- The continual decay of learning rates throughout training
- The need for a manually selected global learning rate

Adadelta is a more robust extension of Adagrad that adapts learning rates
based on a moving window of gradient updates, instead of accumulating all

@@ -59,16 +45,22 @@ class Adadelta(optimizer_v2.OptimizerV2):
don't have to set an initial learning rate. In this version, initial
learning rate can be set, as in most other Keras optimizers.

@compatibility(eager)
When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
each be a callable that takes no arguments and returns the actual value to
use. This can be useful for changing these values across different
invocations of optimizer functions.
@end_compatibility
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    To match the exact form in the original paper use 1.0.
  rho: A `Tensor` or a floating point value. The decay rate.
  epsilon: A `Tensor` or a floating point value. A constant epsilon used
    to better condition the grad update.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"Adadelta"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

References
  See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
  ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
Reference:
  - [Zeiler, 2012](http://arxiv.org/abs/1212.5701)
"""

_HAS_AGGREGATE_GRAD = True

@@ -79,23 +71,6 @@ class Adadelta(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Adadelta',
             **kwargs):
  """Construct a new Adadelta optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
      To match the exact form in the original paper use 1.0.
    rho: A `Tensor` or a floating point value. The decay rate.
    epsilon: A `Tensor` or a floating point value. A constant epsilon used
      to better condition the grad update.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to "Adadelta".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """
  super(Adadelta, self).__init__(name, **kwargs)
  self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
  self._set_hyper('decay', self._initial_decay)

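The equations removed above still describe what this optimizer computes. For reference, here is a minimal NumPy sketch of that update; the function and variable names are illustrative only, not part of the TensorFlow API:

```python
import numpy as np

def adadelta_step(x, g, acc_grad, acc_update, rho=0.95, epsilon=1e-7):
    """One Adadelta step; returns the updated variable and accumulators."""
    acc_grad = rho * acc_grad + (1 - rho) * g ** 2           # E[g^2]_t
    rms_g = np.sqrt(acc_grad + epsilon)                      # RMS[g]_t
    rms_dx = np.sqrt(acc_update + epsilon)                   # RMS[dx]_{t-1}
    delta = -rms_dx * g / rms_g                              # dx_t
    acc_update = rho * acc_update + (1 - rho) * delta ** 2   # E[dx^2]_t
    return x + delta, acc_grad, acc_update

x = np.array([1.0, -2.0])
acc_g = np.zeros_like(x)
acc_u = np.zeros_like(x)
for _ in range(5):
    grad = x  # gradient of the toy loss 0.5 * x^2
    x, acc_g, acc_u = adadelta_step(x, grad, acc_g, acc_u)
```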
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adagrad for TensorFlow."""
"""Adagrad optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -39,26 +39,22 @@ class Adagrad(optimizer_v2.OptimizerV2):
updated during training. The more updates a parameter receives,
the smaller the updates.

Initialization:
$$accum_{g_0} := \text{initial_accumulator_value}$$
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
  initial_accumulator_value: A floating point value.
    Starting value for the accumulators, must be non-negative.
  epsilon: A small floating point value to avoid zero denominator.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"Adagrad"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Update step:
$$t := t + 1$$
$$accum_{g_t} := accum_{g_{t-1}} + g^2$$
$$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$

@compatibility(eager)
When eager execution is enabled, `learning_rate` can be a callable that
takes no arguments and returns the actual value to use. This can be useful
for changing these values across different invocations of optimizer
functions.
@end_compatibility

References:

* [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
* [Introduction]
  (https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
Reference:
  - [Duchi et al., 2011](
    http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
"""

_HAS_AGGREGATE_GRAD = True

@@ -69,25 +65,6 @@ class Adagrad(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Adagrad',
             **kwargs):
  """Construct a new Adagrad optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    initial_accumulator_value: A floating point value.
      Starting value for the accumulators, must be non-negative.
    epsilon: A small floating point value to avoid zero denominator.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to "Adagrad".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.

    Raises:
      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
  """
  if initial_accumulator_value < 0.0:
    raise ValueError('initial_accumulator_value must be non-negative: %s' %
                     initial_accumulator_value)

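For reference, the accumulator-based update step in the docstring corresponds to this minimal NumPy sketch (illustrative names only, not the TensorFlow implementation):

```python
import numpy as np

def adagrad_step(theta, g, accum, lr=0.001, epsilon=1e-7):
    """One Adagrad step: accumulate g^2, then scale the update per dimension."""
    accum = accum + g ** 2                                # accum_g_t
    theta = theta - lr * g / (np.sqrt(accum) + epsilon)   # theta_t
    return theta, accum

theta = np.array([1.0, -2.0])
accum = np.full_like(theta, 0.1)  # initial_accumulator_value
for _ in range(5):
    grad = theta  # gradient of the toy loss 0.5 * theta^2
    theta, accum = adagrad_step(theta, grad, accum)
```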
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
"""Adam optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -35,86 +36,14 @@ class Adam(optimizer_v2.OptimizerV2):

Adam optimization is a stochastic gradient descent method that is based on
adaptive estimation of first-order and second-order moments.
According to the paper
[Adam: A Method for Stochastic Optimization. Kingma et al.,
2014](http://arxiv.org/abs/1412.6980), the method is "*computationally

According to
[Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
the method is "*computationally
efficient, has little memory requirement, invariant to diagonal rescaling of
gradients, and is well suited for problems that are large in terms of
data/parameters*".

For AMSGrad see [On The Convergence Of Adam And Beyond.
Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ).

**If amsgrad = False**:

initialize $m_0$ as 1st moment vector
initialize $v_0$ as 2nd moment vector

The update rule for $\theta$ with gradient $g$ uses an optimization
described at the end of section 2 of the paper:

$$lr_t = \mathrm{learning\_rate} *
  \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
$$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
$$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
$$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

**If amsgrad = True**:

initialize $m_0$ as 1st moment vector
initialize $v_0$ as 2nd moment vector
initialize $\hat{v}_0$ as 2nd moment vector

The update rule for $\theta$ with gradient $g$ uses an optimization
described at the end of section 2 of the paper:

$$lr_t = \mathrm{learning\_rate} *
  \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

$$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
$$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
$$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
$$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$

The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.

The sparse implementation of this algorithm (used when the gradient is an
IndexedSlices object, typically because of `tf.gather` or an embedding
lookup in the forward pass) does apply momentum to variable slices even if
they were not used in the forward pass (meaning they have a gradient equal
to zero). Momentum decay (beta1) is also applied to the entire momentum
accumulator. This means that the sparse behavior is equivalent to the dense
behavior (in contrast to some momentum implementations which ignore momentum
unless a variable slice was actually used).

Usage:

>>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
>>> var1 = tf.Variable(10.0)
>>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
>>> step_count = opt.minimize(loss, [var1]).numpy()
>>> # The first step is `-learning_rate*sign(grad)`
>>> var1.numpy()
9.9
"""

_HAS_AGGREGATE_GRAD = True

def __init__(self,
             learning_rate=0.001,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=1e-7,
             amsgrad=False,
             name='Adam',
             **kwargs):
  """Construct a new Adam optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable

@@ -133,14 +62,56 @@ class Adam(optimizer_v2.OptimizerV2):
    amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
      the paper "On the Convergence of Adam and beyond". Defaults to `False`.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Adam".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
      Defaults to `"Adam"`.
    **kwargs: Keyword arguments. Allowed to be one of
      `"clipnorm"` or `"clipvalue"`.
      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
      gradients by value.

Usage:

>>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
>>> var1 = tf.Variable(10.0)
>>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
>>> step_count = opt.minimize(loss, [var1]).numpy()
>>> # The first step is `-learning_rate*sign(grad)`
>>> var1.numpy()
9.9

Reference:
  - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
  - [Reddi et al., 2018](
    https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

Notes:

The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since Adam uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.

The sparse implementation of this algorithm (used when the gradient is an
IndexedSlices object, typically because of `tf.gather` or an embedding
lookup in the forward pass) does apply momentum to variable slices even if
they were not used in the forward pass (meaning they have a gradient equal
to zero). Momentum decay (beta1) is also applied to the entire momentum
accumulator. This means that the sparse behavior is equivalent to the dense
behavior (in contrast to some momentum implementations which ignore momentum
unless a variable slice was actually used).
"""

_HAS_AGGREGATE_GRAD = True

def __init__(self,
             learning_rate=0.001,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=1e-7,
             amsgrad=False,
             name='Adam',
             **kwargs):
  super(Adam, self).__init__(name, **kwargs)
  self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
  self._set_hyper('decay', self._initial_decay)

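The update that the removed equations describe, including the `amsgrad` variant, can be sketched in NumPy as follows. This is an illustration of the section 2 formulation, not the TensorFlow kernel implementation:

```python
import numpy as np

def adam_step(theta, g, m, v, vhat, t, lr=0.001, beta_1=0.9, beta_2=0.999,
              epsilon=1e-7, amsgrad=False):
    """One Adam step (t starts at 1)."""
    lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    m = beta_1 * m + (1 - beta_1) * g         # 1st moment estimate
    v = beta_2 * v + (1 - beta_2) * g ** 2    # 2nd moment estimate
    if amsgrad:
        vhat = np.maximum(vhat, v)            # running max of all v_t
        theta = theta - lr_t * m / (np.sqrt(vhat) + epsilon)
    else:
        theta = theta - lr_t * m / (np.sqrt(v) + epsilon)
    return theta, m, v, vhat
```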
@@ -329,7 +300,7 @@ class NonFusedAdam(optimizer_v2.OptimizerV2):

The default value of 1e-7 for epsilon might not be a good default in
general. For example, when training an Inception network on ImageNet a
current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
current good choice is 1.0 or 0.1. Note that since Adam uses the
formulation just before Section 2.1 of the Kingma and Ba paper rather than
the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
hat" in the paper.

@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adamax for TensorFlow."""
"""Adamax optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -39,27 +39,27 @@ class Adamax(optimizer_v2.OptimizerV2):

Initialization:

```
m_0 <- 0 (Initialize initial 1st moment vector)
v_0 <- 0 (Initialize the exponentially weighted infinity norm)
t <- 0 (Initialize timestep)
```
```python
m = 0  # Initialize initial 1st moment vector
v = 0  # Initialize the exponentially weighted infinity norm
t = 0  # Initialize timestep
```

The update rule for `variable` with gradient `g` uses an optimization
The update rule for parameter `w` with gradient `g` is
described at the end of section 7.1 of the paper:

```
t <- t + 1

m_t <- beta1 * m_{t-1} + (1 - beta1) * g
v_t <- max(beta2 * v_{t-1}, abs(g))
variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
```
```python
t += 1
m = beta1 * m + (1 - beta1) * g
v = max(beta2 * v, abs(g))
current_lr = learning_rate / (1 - beta1 ** t)
w = w - current_lr * m / (v + epsilon)
```

Similar to AdamOptimizer, the epsilon is added for numerical stability
(especially to get rid of division by zero when v_t = 0).
Similarly to `Adam`, the epsilon is added for numerical stability
(especially to get rid of division by zero when `v_t == 0`).

Contrast to AdamOptimizer, the sparse implementation of this algorithm
In contrast to `Adam`, the sparse implementation of this algorithm
(used when the gradient is an IndexedSlices object, typically because of
`tf.gather` or an embedding lookup in the forward pass) only updates
variable slices and corresponding `m_t`, `v_t` terms when that part of

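A runnable NumPy version of the pseudocode above, for reference; the function name and the convention that `t` starts at 1 are illustrative, not TensorFlow API:

```python
import numpy as np

def adamax_step(w, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                epsilon=1e-7):
    """One Adamax step, mirroring the docstring pseudocode."""
    m = beta_1 * m + (1 - beta_1) * g        # 1st moment estimate
    v = np.maximum(beta_2 * v, np.abs(g))    # exponentially weighted inf-norm
    current_lr = lr / (1 - beta_1 ** t)
    w = w - current_lr * m / (v + epsilon)
    return w, m, v
```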
@@ -68,9 +68,23 @@ class Adamax(optimizer_v2.OptimizerV2):
implementations which ignore momentum unless a variable slice was actually
used).

References
  see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
  beta_1: A float value or a constant float tensor. The exponential decay
    rate for the 1st moment estimates.
  beta_2: A float value or a constant float tensor. The exponential decay
    rate for the exponentially weighted infinity norm.
  epsilon: A small constant for numerical stability.
  name: Optional name for the operations created when applying gradients.
    Defaults to `"Adamax"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Reference:
  - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
"""

_HAS_AGGREGATE_GRAD = True

@@ -82,24 +96,6 @@ class Adamax(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Adamax',
             **kwargs):
  """Construct a new Adamax optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    beta_1: A float value or a constant float tensor. The exponential decay
      rate for the 1st moment estimates.
    beta_2: A float value or a constant float tensor. The exponential decay
      rate for the exponentially weighted infinity norm.
    epsilon: A small constant for numerical stability.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Adamax".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """
  super(Adamax, self).__init__(name, **kwargs)
  self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
  self._set_hyper('decay', self._initial_decay)

@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Ftrl-proximal for TensorFlow."""
"""Ftrl-proximal optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -35,39 +36,6 @@ class Ftrl(optimizer_v2.OptimizerV2):
above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
loss function).

Initialization:
$$t = 0$$
$$n_{0} = 0$$
$$\sigma_{0} = 0$$
$$z_{0} = 0$$

Update ($$i$$ is variable index):
$$t = t + 1$$
$$n_{t,i} = n_{t-1,i} + g_{t,i}^{2}$$
$$\sigma_{t,i} = (\sqrt{n_{t,i}} - \sqrt{n_{t-1,i}}) / \alpha$$
$$z_{t,i} = z_{t-1,i} + g_{t,i} - \sigma_{t,i} * w_{t,i}$$
$$w_{t,i} = - ((\beta+\sqrt{n_{t,i}}) / \alpha + \lambda_{2})^{-1} * (z_{i} -
sgn(z_{i}) * \lambda_{1}) \text{ if } |z_{i}| > \lambda_{1} \text{ else } 0$$

Check the documentation for the l2_shrinkage_regularization_strength
parameter for more details when shrinkage is enabled, where gradient is
replaced with gradient_with_shrinkage.

References: See
[paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
"""

def __init__(self,
             learning_rate=0.001,
             learning_rate_power=-0.5,
             initial_accumulator_value=0.1,
             l1_regularization_strength=0.0,
             l2_regularization_strength=0.0,
             name='Ftrl',
             l2_shrinkage_regularization_strength=0.0,
             **kwargs):
  r"""Construct a new FTRL optimizer.

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.

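For reference, the per-coordinate equations above translate to roughly the following NumPy sketch, applied to all coordinates at once. This is illustrative only; `alpha` is the learning rate and `beta`, `lambda_1`, `lambda_2` follow the notation of the formulas, with `lambda_2` entering the denominator exactly as written there:

```python
import numpy as np

def ftrl_step(w, g, n, z, alpha=0.001, beta=1.0, lambda_1=0.0, lambda_2=0.0):
    """One FTRL-proximal step over all coordinates at once."""
    n_new = n + g ** 2                                # n_t
    sigma = (np.sqrt(n_new) - np.sqrt(n)) / alpha     # sigma_t
    z = z + g - sigma * w                             # z_t
    w = np.where(
        np.abs(z) > lambda_1,
        -(z - np.sign(z) * lambda_1)
        / ((beta + np.sqrt(n_new)) / alpha + lambda_2),
        0.0)                                          # w_t
    return w, n_new, z
```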
@@ -81,29 +49,30 @@ class Ftrl(optimizer_v2.OptimizerV2):
  l2_regularization_strength: A float value, must be greater than or
    equal to zero.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to "Ftrl".
    gradients. Defaults to `"Ftrl"`.
  l2_shrinkage_regularization_strength: A float value, must be greater than
    or equal to zero. This differs from L2 above in that the L2 above is a
    stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
    The FTRL formulation can be written as:
    w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
    \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
    function w.r.t. the weights w.
    Specifically, in the absence of L1 regularization, it is equivalent to
    the following update rule:
    w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
              2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
    where lr_t is the learning rate at t.
    When input is sparse shrinkage will only happen on the active weights.\
  **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
    `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
    gradients by value, `decay` is included for backward compatibility to
    allow time inverse decay of learning rate. `lr` is included for backward
    compatibility, recommended to use `learning_rate` instead.
    When input is sparse shrinkage will only happen on the active weights.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Raises:
  ValueError: If one of the arguments is invalid.
Reference:
  - [paper](
    https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
"""

def __init__(self,
             learning_rate=0.001,
             learning_rate_power=-0.5,
             initial_accumulator_value=0.1,
             l1_regularization_strength=0.0,
             l2_regularization_strength=0.0,
             name='Ftrl',
             l2_shrinkage_regularization_strength=0.0,
             **kwargs):
  super(Ftrl, self).__init__(name, **kwargs)

  if initial_accumulator_value < 0.0:

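The L2-shrinkage update rule quoted in the docstring, transcribed directly into a small NumPy helper (illustrative only; this is the dense, L1-free special case):

```python
import numpy as np

def shrinkage_update(w, g, lr, l2, l2_shrinkage):
    """w_{t+1} = w - lr/(1+2*L2*lr) * g - 2*L2_shrinkage*lr/(1+2*L2*lr) * w."""
    denom = 1.0 + 2.0 * l2 * lr
    return w - lr / denom * g - 2.0 * l2_shrinkage * lr / denom * w
```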
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Momentum for TensorFlow."""
"""SGD optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -27,17 +28,45 @@ from tensorflow.python.util.tf_export import keras_export

@keras_export("keras.optimizers.SGD")
class SGD(optimizer_v2.OptimizerV2):
  r"""Stochastic gradient descent and momentum optimizer.
  r"""Gradient descent (with momentum) optimizer.

The update rule for $\theta$ with gradient $g$ when `momentum` is 0.0:
$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * g_t$$
Update rule for parameter `w` with gradient `g` when `momentum` is 0:

The update rule when `momentum` is larger than 0.0:
$$v_t = \mathrm{momentum} * v_{t-1} - \mathrm{learning\_rate} * g_t$$
$$\theta_t = \theta_{t-1} + v_t$$
if `nesterov` is False, gradient is evaluated at $\theta_t$.
if `nesterov` is True, gradient is evaluated at $\theta_t + momentum * v_t$,
and the variables always store $\theta + m v$ instead of $\theta$
```python
w = w - learning_rate * g
```

Update rule when `momentum` is larger than 0:

```python
velocity = momentum * velocity - learning_rate * g
w = w + velocity
```

When `nesterov=True`, this rule becomes:

```python
velocity = momentum * velocity - learning_rate * g
w = w + momentum * velocity - learning_rate * g
```

Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
    learning rate. Defaults to 0.01.
  momentum: float hyperparameter >= 0 that accelerates gradient descent
    in the relevant direction and dampens oscillations. Defaults to 0,
    i.e., vanilla gradient descent.
  nesterov: boolean. Whether to apply Nesterov momentum.
    Defaults to `False`.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"SGD"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Usage:

@@ -64,13 +93,8 @@ class SGD(optimizer_v2.OptimizerV2):
>>> (val1 - val2).numpy()
0.18

Some of the args below are hyperparameters, where a hyperparameter is
defined as a scalar Tensor, a regular Python value, or a callable (which
will be evaluated when `apply_gradients` is called) returning a scalar
Tensor or a Python value.

# References
  nesterov = True, See [Sutskever et al., 2013](
Reference:
  - For `nesterov=True`, See [Sutskever et al., 2013](
    http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
"""

@@ -82,25 +106,6 @@ class SGD(optimizer_v2.OptimizerV2):
             nesterov=False,
             name="SGD",
             **kwargs):
  """Construct a new Stochastic Gradient Descent or Momentum optimizer.

  Arguments:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
      that takes no arguments and returns the actual value to use. The
      learning rate. Defaults to 0.01.
    momentum: float hyperparameter >= 0 that accelerates SGD in the relevant
      direction and dampens oscillations. Defaults to 0.0, i.e., SGD.
    nesterov: boolean. Whether to apply Nesterov momentum.
      Defaults to `False`.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to 'SGD'.
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """
  super(SGD, self).__init__(name, **kwargs)
  self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
  self._set_hyper("decay", self._initial_decay)

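The three update rules quoted in the docstring, collected into one illustrative NumPy step function (not the TensorFlow implementation):

```python
import numpy as np

def sgd_step(w, g, velocity, lr=0.01, momentum=0.0, nesterov=False):
    """One SGD step covering plain, momentum, and Nesterov variants."""
    if momentum == 0.0:
        return w - lr * g, velocity
    velocity = momentum * velocity - lr * g
    if nesterov:
        return w + momentum * velocity - lr * g, velocity
    return w + velocity, velocity
```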
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Nadam for TensorFlow."""
"""Nadam optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -36,29 +37,22 @@ class Nadam(optimizer_v2.OptimizerV2):
Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
Nesterov momentum.

Initialization:
Args:
  learning_rate: A Tensor or a floating point value. The learning rate.
  beta_1: A float value or a constant float tensor. The exponential decay
    rate for the 1st moment estimates.
  beta_2: A float value or a constant float tensor. The exponential decay
    rate for the exponentially weighted infinity norm.
  epsilon: A small constant for numerical stability.
  name: Optional name for the operations created when applying gradients.
    Defaults to `"Nadam"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

$$m_0 := 0 \text{(Initialize 1st moment vector)}$$
$$v_0 := 0 \text{(Initialize 2nd moment vector)}$$
$$mu_0 := 1$$
$$t := 0 \text{(Initialize timestep)}$$

Computes:
$$t := t + 1$$
$$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
$$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
$$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
$$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
$$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
$$v' := v_t / (1 - \beta_2^t)$$
$$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
$$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$

gradient is evaluated at theta(t) + momentum * v(t), and the variables always
store theta + beta_1 * m / sqrt(v) instead of theta.

References
  See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
Reference:
  - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
"""

_HAS_AGGREGATE_GRAD = True

@@ -70,24 +64,6 @@ class Nadam(optimizer_v2.OptimizerV2):
             epsilon=1e-7,
             name='Nadam',
             **kwargs):
  """Construct a new Nadam optimizer.

  Args:
    learning_rate: A Tensor or a floating point value. The learning rate.
    beta_1: A float value or a constant float tensor. The exponential decay
      rate for the 1st moment estimates.
    beta_2: A float value or a constant float tensor. The exponential decay
      rate for the exponentially weighted infinity norm.
    epsilon: A small constant for numerical stability.
    name: Optional name for the operations created when applying gradients.
      Defaults to "Nadam".
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead.
  """

  # Backwards compatibility with keras NAdam optimizer.
  kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
  learning_rate = kwargs.get('lr', learning_rate)

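For reference, a NumPy sketch of the equations above. The `mu_prod` argument carries the running product of the momentum schedule, which the formulas write as a product over `i`; everything here is illustrative, not TensorFlow API:

```python
import numpy as np

def nadam_step(theta, g, m, v, mu_prod, t, lr=0.001, beta_1=0.9,
               beta_2=0.999, epsilon=1e-7):
    """One Nadam step (t starts at 1); mu_prod holds prod(mu_1..mu_{t-1})."""
    mu_t = beta_1 * (1 - 0.5 * 0.96 ** (0.004 * t))
    mu_t1 = beta_1 * (1 - 0.5 * 0.96 ** (0.004 * (t + 1)))
    mu_prod = mu_prod * mu_t                        # prod(mu_1..mu_t)
    g_prime = g / (1 - mu_prod)
    m = beta_1 * m + (1 - beta_1) * g
    m_prime = m / (1 - mu_prod * mu_t1)
    v = beta_2 * v + (1 - beta_2) * g * g
    v_prime = v / (1 - beta_2 ** t)
    m_bar = (1 - mu_t) * g_prime + mu_t1 * m_prime
    theta = theta - lr * m_bar / (np.sqrt(v_prime) + epsilon)
    return theta, m, v, mu_prod
```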
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Version 2 of class Optimizer."""
# pylint: disable=g-bad-name

@@ -79,11 +78,10 @@ def _deduplicate_indexed_slices(values, indices):
@six.add_metaclass(abc.ABCMeta)
@keras_export("keras.optimizers.Optimizer")
class OptimizerV2(trackable.Trackable):
  """Updated base class for optimizers.
  """Base class for Keras optimizers.

This class defines the API to add Ops to train a model. You never use this
class directly, but instead instantiate one of its subclasses such as
`tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`.
You should not use this class directly, but instead instantiate one of its
subclasses such as `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`, etc.

### Usage

@@ -101,7 +99,7 @@ class OptimizerV2(trackable.Trackable):
opt.minimize(loss, var_list=[var1, var2])
```

### Custom training loop with Keras models
### Usage in custom training loops

In Keras models, sometimes variables are created when the model is first
called, instead of construction time. Examples include 1) sequential models

@@ -109,6 +107,7 @@ class OptimizerV2(trackable.Trackable):
callable in these cases.

Example:

```python
opt = tf.keras.optimizers.SGD(learning_rate=0.1)
model = tf.keras.Sequential()

@@ -120,7 +119,7 @@ class OptimizerV2(trackable.Trackable):
opt.minimize(loss_fn, var_list_fn)
```

### Processing gradients before applying them.
### Processing gradients before applying them

Calling `minimize()` takes care of both computing the gradients and
applying them to the variables. If you want to process the gradients

@@ -150,7 +149,7 @@ class OptimizerV2(trackable.Trackable):
opt.apply_gradients(zip(processed_grads, var_list))
```

### Use with `tf.distribute.Strategy`.
### Use with `tf.distribute.Strategy`

This optimizer class is `tf.distribute.Strategy` aware, which means it
automatically sums gradients across all replicas. To average gradients,

@@ -172,7 +171,7 @@ class OptimizerV2(trackable.Trackable):
step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
resulting in gradients that can be many times too big.

### Variable Constraint
### Variable Constraints

All Keras optimizers respect variable constraints. If a constraint function is
passed to any variable, the constraint will be applied to the variable after

@@ -220,7 +219,8 @@ class OptimizerV2(trackable.Trackable):
opt.minimize(loss, var_list=[var1, var2])
```

### Callable learning rate.
### Callable learning rate

Optimizer accepts a callable learning rate in two ways. The first way is
through built-in or customized
`tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will be

@@ -250,14 +250,17 @@ class OptimizerV2(trackable.Trackable):
>>> opt.minimize(loss, var_list=[var])
<tf.Variable...

### Write a customized optimizer.
### Creating a custom optimizer

If you intend to create your own optimization algorithm, simply inherit from
this class and override the following methods:

- _resource_apply_dense (update variable given gradient tensor is dense)
- _resource_apply_sparse (update variable given gradient tensor is sparse)
- _create_slots (if your optimizer algorithm requires additional variables)
- get_config (serialization of the optimizer, include all hyper parameters)
- `_resource_apply_dense` (update variable given gradient tensor is dense)
- `_resource_apply_sparse` (update variable given gradient tensor is sparse)
- `_create_slots`
  (if your optimizer algorithm requires additional variables)
- `get_config`
  (serialization of the optimizer, include all hyper parameters)
"""

# Subclasses should set this to True unless they override `apply_gradients`

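A minimal sketch of the subclassing pattern described above, assuming the internal helpers (`_set_hyper`, `_get_hyper`, `_resource_scatter_add`, `_serialize_hyperparameter`) that this file's optimizers use; `MySGD` is a hypothetical example, not part of Keras:

```python
import tensorflow as tf

class MySGD(tf.keras.optimizers.Optimizer):
    """A toy constant-step optimizer built on the override points above."""

    def __init__(self, learning_rate=0.01, name="MySGD", **kwargs):
        super(MySGD, self).__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Dense gradient: subtract lr * grad from the variable.
        lr = tf.cast(self._get_hyper("learning_rate"), var.dtype)
        return var.assign_sub(lr * grad)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        # Sparse gradient (IndexedSlices): update only the touched slices.
        lr = tf.cast(self._get_hyper("learning_rate"), var.dtype)
        return self._resource_scatter_add(var, indices, -lr * grad)

    def get_config(self):
        config = super(MySGD, self).get_config()
        config.update(
            {"learning_rate": self._serialize_hyperparameter("learning_rate")})
        return config

var = tf.Variable(1.0)
opt = MySGD(learning_rate=0.1)
opt.minimize(lambda: (var ** 2) / 2.0, var_list=[var])
```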
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""RMSprop for TensorFlow."""
"""RMSprop optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

@@ -34,36 +35,37 @@ from tensorflow.python.util.tf_export import keras_export
class RMSprop(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the RMSprop algorithm.

A detailed description of rmsprop.
- maintain a moving (discounted) average of the square of gradients
- divide gradient by the root of this average
The gist of RMSprop is to:

The default settings do not use momentum:

$$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} *
  g_t / \sqrt{rms_t + \epsilon}$$

Since $x/\sqrt{x^2} = sign(x)$, this is a smoothed approximation of:

$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * sign(g_t)$$

With momentum the update is:

$$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
$$mom_t = \mathrm{momentum} * mom_{t-1} + g_t / \sqrt{rms_t + \epsilon}$$
$$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * mom_t$$
- Maintain a moving (discounted) average of the square of gradients
- Divide the gradient by the root of this average

This implementation of RMSprop uses plain momentum, not Nesterov momentum.

The centered version additionally maintains a moving average of the
gradients, and uses that average to estimate the variance:
gradients, and uses that average to estimate the variance.

$$mg_t = \rho * mg_{t-1} + (1-\rho) * g_t$$
$$rms_t = \rho * rms_{t-1} + (1-\rho) * g_t^2$$
$$mom_t = \mathrm{momentum} * mom_{t-1} +
  \mathrm{learning\_rate} * g_t / \sqrt{rms_t - mg_t^2 + \epsilon}$$
$$\theta_t = \theta_{t-1} - mom_t$$
Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
    learning rate. Defaults to 0.001.
  rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
  momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
  epsilon: A small constant for numerical stability. This epsilon is
    "epsilon hat" in the Kingma and Ba paper (in the formula just before
    Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
    1e-7.
  centered: Boolean. If `True`, gradients are normalized by the estimated
    variance of the gradient; if False, by the uncentered second moment.
    Setting this to `True` may help with training, but is slightly more
    expensive in terms of computation and memory. Defaults to `False`.
  name: Optional name prefix for the operations created when applying
    gradients. Defaults to `"RMSprop"`.
  **kwargs: Keyword arguments. Allowed to be one of
    `"clipnorm"` or `"clipvalue"`.
    `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
    gradients by value.

Note that in the dense implementation of this algorithm, variables and their
corresponding accumulators (momentum, gradient moving average, square

@@ -86,9 +88,9 @@ class RMSprop(optimizer_v2.OptimizerV2):
>>> var1.numpy()
9.683772

References
  See ([pdf]
  http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
Reference:
  - [Hinton, 2012](
    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
"""

_HAS_AGGREGATE_GRAD = True
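For reference, the centered and uncentered updates described above in one illustrative NumPy step; with a constant learning rate, the two momentum formulations in the docstring coincide, and this sketch follows the centered one:

```python
import numpy as np

def rmsprop_step(theta, g, rms, mom, mg, lr=0.001, rho=0.9, momentum=0.0,
                 epsilon=1e-7, centered=False):
    """One RMSprop step; `centered` subtracts the squared gradient mean."""
    rms = rho * rms + (1 - rho) * g ** 2
    if centered:
        mg = rho * mg + (1 - rho) * g
        denom = rms - mg ** 2 + epsilon
    else:
        denom = rms + epsilon
    mom = momentum * mom + lr * g / np.sqrt(denom)
    theta = theta - mom
    return theta, rms, mom, mg
```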