diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py index 9d67ed25c66..99bd2f8e8bf 100644 --- a/tensorflow/python/keras/optimizer_v2/adadelta.py +++ b/tensorflow/python/keras/optimizer_v2/adadelta.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -"""Adadelta for TensorFlow.""" +"""Adadelta optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -34,23 +34,9 @@ class Adadelta(optimizer_v2.OptimizerV2): Adadelta optimization is a stochastic gradient descent method that is based on adaptive learning rate per dimension to address two drawbacks: - 1) the continual decay of learning rates throughout training - 2) the need for a manually selected global learning rate - Two accumulation steps are required: - 1) the accumulation of gradients squared, - 2) the accumulation of updates squared. - - Initialization: - - $$E[g^2]_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$ - $$E[\Delta x^2]_0 := 0 \text{(Initialize 2nd order variable update)}$$ - - $$t := t + 1$$ - $$E[g^2]_t := \rho * E[g^2]_{t-1} + (1 - \rho) * g^2$$ - $$\Delta x_t = -RMS[\Delta x]_{t-1} * g_t / RMS[g]_t$$ - $$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$ - $$x_t := x_{t-1} + \Delta x_{t}$$ + - The continual decay of learning rates throughout training + - The need for a manually selected global learning rate Adadelta is a more robust extension of Adagrad that adapts learning rates based on a moving window of gradient updates, instead of accumulating all @@ -59,16 +45,22 @@ class Adadelta(optimizer_v2.OptimizerV2): don't have to set an initial learning rate. 
In this version, initial learning rate can be set, as in most other Keras optimizers. - @compatibility(eager) - When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can - each be a callable that takes no arguments and returns the actual value to - use. This can be useful for changing these values across different - invocations of optimizer functions. - @end_compatibility + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. + To match the exact form in the original paper use 1.0. + rho: A `Tensor` or a floating point value. The decay rate. + epsilon: A `Tensor` or a floating point value. A constant epsilon used + to better condition the grad update. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"Adadelta"`. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. + `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. - References - See [M. D. Zeiler](http://arxiv.org/abs/1212.5701) - ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf)) + Reference: + - [Zeiler, 2012](http://arxiv.org/abs/1212.5701) """ _HAS_AGGREGATE_GRAD = True @@ -79,23 +71,6 @@ class Adadelta(optimizer_v2.OptimizerV2): epsilon=1e-7, name='Adadelta', **kwargs): - """Construct a new Adadelta optimizer. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. - To match the exact form in the original paper use 1.0. - rho: A `Tensor` or a floating point value. The decay rate. - epsilon: A `Tensor` or a floating point value. A constant epsilon used - to better conditioning the grad update. - name: Optional name prefix for the operations created when applying - gradients. Defaults to "Adadelta". - **kwargs: keyword arguments. 
Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - """ super(Adadelta, self).__init__(name, **kwargs) self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) self._set_hyper('decay', self._initial_decay) diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py index 4e4ffd8e856..9552c3a0374 100644 --- a/tensorflow/python/keras/optimizer_v2/adagrad.py +++ b/tensorflow/python/keras/optimizer_v2/adagrad.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -"""Adagrad for TensorFlow.""" +"""Adagrad optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -39,26 +39,22 @@ class Adagrad(optimizer_v2.OptimizerV2): updated during training. The more updates a parameter receives, the smaller the updates. - Initialization: - $$accum_{g_0} := \text{initial_accumulator_value}$$ + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. + initial_accumulator_value: A floating point value. + Starting value for the accumulators, must be non-negative. + epsilon: A small floating point value to avoid zero denominator. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"Adagrad"`. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. 
+ `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. - Update step: - $$t := t + 1$$ - $$accum_{g_t} := accum_{g_{t-1}} + g^2$$ - $$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$ - - @compatibility(eager) - When eager execution is enabled, `learning_rate` can be a callable that - takes no arguments and returns the actual value to use. This can be useful - for changing these values across different invocations of optimizer - functions. - @end_compatibility - - References: - - * [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf). - * [Introduction] - (https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf). + Reference: + - [Duchi et al., 2011]( + http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf). """ _HAS_AGGREGATE_GRAD = True @@ -69,25 +65,6 @@ class Adagrad(optimizer_v2.OptimizerV2): epsilon=1e-7, name='Adagrad', **kwargs): - """Construct a new Adagrad optimizer. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. - initial_accumulator_value: A floating point value. - Starting value for the accumulators, must be non-negative. - epsilon: A small floating point value to avoid zero denominator. - name: Optional name prefix for the operations created when applying - gradients. Defaults to "Adagrad". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - - Raises: - ValueError: If the `initial_accumulator_value` or `epsilon` is invalid. 
- """ if initial_accumulator_value < 0.0: raise ValueError('initial_accumulator_value must be non-negative: %s' % initial_accumulator_value) diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py index 67152e4b537..df41201e14b 100644 --- a/tensorflow/python/keras/optimizer_v2/adam.py +++ b/tensorflow/python/keras/optimizer_v2/adam.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Adam for TensorFlow.""" +"""Adam optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -35,50 +36,58 @@ class Adam(optimizer_v2.OptimizerV2): Adam optimization is a stochastic gradient descent method that is based on adaptive estimation of first-order and second-order moments. - According to the paper - [Adam: A Method for Stochastic Optimization. Kingma et al., - 2014](http://arxiv.org/abs/1412.6980), the method is "*computationally + + According to + [Kingma et al., 2014](http://arxiv.org/abs/1412.6980), + the method is "*computationally efficient, has little memory requirement, invariant to diagonal rescaling of gradients, and is well suited for problems that are large in terms of data/parameters*". - For AMSGrad see [On The Convergence Of Adam And Beyond. - Reddi et al., 5-8](https://openreview.net/pdf?id=ryQu7f-RZ). + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use, The + learning rate. Defaults to 0.001. + beta_1: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. 
The + exponential decay rate for the 1st moment estimates. Defaults to 0.9. + beta_2: A float value or a constant float tensor, or a callable + that takes no arguments and returns the actual value to use. The + exponential decay rate for the 2nd moment estimates. Defaults to 0.999. + epsilon: A small constant for numerical stability. This epsilon is + "epsilon hat" in the Kingma and Ba paper (in the formula just before + Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to + 1e-7. + amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from + the paper "On the Convergence of Adam and beyond". Defaults to `False`. + name: Optional name for the operations created when applying gradients. + Defaults to `"Adam"`. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. + `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. - **If amsgrad = False**: + Usage: - initialize $m_0$ as 1st moment vector - initialize $v_0$ as 2nd moment vector + >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1) + >>> var1 = tf.Variable(10.0) + >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1 + >>> step_count = opt.minimize(loss, [var1]).numpy() + >>> # The first step is `-learning_rate*sign(grad)` + >>> var1.numpy() + 9.9 - The update rule for $\theta$ with gradient $g$ uses an optimization - described at the end of section 2 of the paper: + Reference: + - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) + - [Reddi et al., 2018]( + https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`. 
- $$lr_t = \mathrm{learning\_rate} * - \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$ - $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$ - $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$ - $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ - - **If amsgrad = True**: - - initialize $m_0$ as 1st moment vector - initialize $v_0$ as 2nd moment vector - initialize $\hat{v}_0$ as 2nd moment vector - - The update rule for $\theta$ with gradient $g$ uses an optimization - described at the end of section 2 of the paper: - - $$lr_t = \mathrm{learning\_rate} * - \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$ - - $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$ - $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$ - $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$ - $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$ + Notes: The default value of 1e-7 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a - current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the + current good choice is 1.0 or 0.1. Note that since Adam uses the formulation just before Section 2.1 of the Kingma and Ba paper rather than the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon hat" in the paper. @@ -91,16 +100,6 @@ class Adam(optimizer_v2.OptimizerV2): accumulator. This means that the sparse behavior is equivalent to the dense behavior (in contrast to some momentum implementations which ignore momentum unless a variable slice was actually used). 
- - Usage: - - >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1) - >>> var1 = tf.Variable(10.0) - >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1 - >>> step_count = opt.minimize(loss, [var1]).numpy() - >>> # The first step is `-learning_rate*sign(grad)` - >>> var1.numpy() - 9.9 """ _HAS_AGGREGATE_GRAD = True @@ -113,34 +112,6 @@ class Adam(optimizer_v2.OptimizerV2): amsgrad=False, name='Adam', **kwargs): - """Construct a new Adam optimizer. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use, The - learning rate. Defaults to 0.001. - beta_1: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use. The - exponential decay rate for the 1st moment estimates. Defaults to 0.9. - beta_2: A float value or a constant float tensor, or a callable - that takes no arguments and returns the actual value to use, The - exponential decay rate for the 2nd moment estimates. Defaults to 0.999. - epsilon: A small constant for numerical stability. This epsilon is - "epsilon hat" in the Kingma and Ba paper (in the formula just before - Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to - 1e-7. - amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from - the paper "On the Convergence of Adam and beyond". Defaults to `False`. - name: Optional name for the operations created when applying gradients. - Defaults to "Adam". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. 
- """ - super(Adam, self).__init__(name, **kwargs) self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) self._set_hyper('decay', self._initial_decay) @@ -329,7 +300,7 @@ class NonFusedAdam(optimizer_v2.OptimizerV2): The default value of 1e-7 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a - current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the + current good choice is 1.0 or 0.1. Note that since Adam uses the formulation just before Section 2.1 of the Kingma and Ba paper rather than the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon hat" in the paper. diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py index 9a7e1e28a89..5ac4734c6a2 100644 --- a/tensorflow/python/keras/optimizer_v2/adamax.py +++ b/tensorflow/python/keras/optimizer_v2/adamax.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - -"""Adamax for TensorFlow.""" +"""Adamax optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -39,27 +39,27 @@ class Adamax(optimizer_v2.OptimizerV2): Initialization: - ``` - m_0 <- 0 (Initialize initial 1st moment vector) - v_0 <- 0 (Initialize the exponentially weighted infinity norm) - t <- 0 (Initialize timestep) + ```python + m = 0 # Initialize initial 1st moment vector + v = 0 # Initialize the exponentially weighted infinity norm + t = 0 # Initialize timestep ``` - The update rule for `variable` with gradient `g` uses an optimization + The update rule for parameter `w` with gradient `g` is described at the end of section 7.1 of the paper: - ``` - t <- t + 1 - - m_t <- beta1 * m_{t-1} + (1 - beta1) * g - v_t <- max(beta2 * v_{t-1}, abs(g)) - variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon) + ```python + t += 1 + m = beta1 * m + (1 - beta1) * g + v = max(beta2 * v, abs(g)) + current_lr = learning_rate / (1 - beta1 ** t) + w = w - current_lr * m / (v + epsilon) ``` - Similar to AdamOptimizer, the epsilon is added for numerical stability - (especially to get rid of division by zero when v_t = 0). + Similarly to `Adam`, the epsilon is added for numerical stability + (especially to get rid of division by zero when `v_t == 0`). - Contrast to AdamOptimizer, the sparse implementation of this algorithm + In contrast to `Adam`, the sparse implementation of this algorithm (used when the gradient is an IndexedSlices object, typically because of `tf.gather` or an embedding lookup in the forward pass) only updates variable slices and corresponding `m_t`, `v_t` terms when that part of @@ -68,9 +68,23 @@ class Adamax(optimizer_v2.OptimizerV2): implementations which ignore momentum unless a variable slice was actually used). 
- References - see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) - ([pdf](http://arxiv.org/pdf/1412.6980.pdf)). + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. + beta_1: A float value or a constant float tensor. The exponential decay + rate for the 1st moment estimates. + beta_2: A float value or a constant float tensor. The exponential decay + rate for the exponentially weighted infinity norm. + epsilon: A small constant for numerical stability. + name: Optional name for the operations created when applying gradients. + Defaults to `"Adamax"`. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. + `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. + + Reference: + - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) """ _HAS_AGGREGATE_GRAD = True @@ -82,24 +96,6 @@ class Adamax(optimizer_v2.OptimizerV2): epsilon=1e-7, name='Adamax', **kwargs): - """Construct a new Adamax optimizer. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. - beta_1: A float value or a constant float tensor. The exponential decay - rate for the 1st moment estimates. - beta_2: A float value or a constant float tensor. The exponential decay - rate for the exponentially weighted infinity norm. - epsilon: A small constant for numerical stability. - name: Optional name for the operations created when applying gradients. - Defaults to "Adamax". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. 
`lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - """ super(Adamax, self).__init__(name, **kwargs) self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) self._set_hyper('decay', self._initial_decay) diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py index 17484395044..419f0f70125 100644 --- a/tensorflow/python/keras/optimizer_v2/ftrl.py +++ b/tensorflow/python/keras/optimizer_v2/ftrl.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Ftrl-proximal for TensorFlow.""" +"""Ftrl-proximal optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -35,26 +36,32 @@ class Ftrl(optimizer_v2.OptimizerV2): above) and shrinkage-type L2 (which is the addition of an L2 penalty to the loss function). - Initialization: - $$t = 0$$ - $$n_{0} = 0$$ - $$\sigma_{0} = 0$$ - $$z_{0} = 0$$ + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. + learning_rate_power: A float value, must be less or equal to zero. + Controls how the learning rate decreases during training. Use zero for + a fixed learning rate. + initial_accumulator_value: The starting value for accumulators. + Only zero or positive values are allowed. + l1_regularization_strength: A float value, must be greater than or + equal to zero. + l2_regularization_strength: A float value, must be greater than or + equal to zero. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"Ftrl"`. + l2_shrinkage_regularization_strength: A float value, must be greater than + or equal to zero. 
This differs from L2 above in that the L2 above is a + stabilization penalty, whereas this L2 shrinkage is a magnitude penalty. + When input is sparse shrinkage will only happen on the active weights. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. + `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. - Update ($$i$$ is variable index): - $$t = t + 1$$ - $$n_{t,i} = n_{t-1,i} + g_{t,i}^{2}$$ - $$\sigma_{t,i} = (\sqrt{n_{t,i}} - \sqrt{n_{t-1,i}}) / \alpha$$ - $$z_{t,i} = z_{t-1,i} + g_{t,i} - \sigma_{t,i} * w_{t,i}$$ - $$w_{t,i} = - ((\beta+\sqrt{n+{t}}) / \alpha + \lambda_{2})^{-1} * (z_{i} - - sgn(z_{i}) * \lambda_{1}) if \abs{z_{i}} > \lambda_{i} else 0$$ - - Check the documentation for the l2_shrinkage_regularization_strength - parameter for more details when shrinkage is enabled, where gradient is - replaced with gradient_with_shrinkage. - - References: See - [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + Reference: + - [paper]( + https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) """ def __init__(self, @@ -66,44 +73,6 @@ class Ftrl(optimizer_v2.OptimizerV2): name='Ftrl', l2_shrinkage_regularization_strength=0.0, **kwargs): - r"""Construct a new FTRL optimizer. - - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. - learning_rate_power: A float value, must be less or equal to zero. - Controls how the learning rate decreases during training. Use zero for - a fixed learning rate. - initial_accumulator_value: The starting value for accumulators. - Only zero or positive values are allowed. - l1_regularization_strength: A float value, must be greater than or - equal to zero. - l2_regularization_strength: A float value, must be greater than or - equal to zero. 
- name: Optional name prefix for the operations created when applying - gradients. Defaults to "Ftrl". - l2_shrinkage_regularization_strength: A float value, must be greater than - or equal to zero. This differs from L2 above in that the L2 above is a - stabilization penalty, whereas this L2 shrinkage is a magnitude penalty. - The FTRL formulation can be written as: - w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where - \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss - function w.r.t. the weights w. - Specifically, in the absence of L1 regularization, it is equivalent to - the following update rule: - w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t - - 2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t - where lr_t is the learning rate at t. - When input is sparse shrinkage will only happen on the active weights.\ - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - - Raises: - ValueError: If one of the arguments is invalid. - """ super(Ftrl, self).__init__(name, **kwargs) if initial_accumulator_value < 0.0: diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py index 32547b95a52..856cc692431 100644 --- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py +++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Momentum for TensorFlow.""" +"""SGD optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -27,17 +28,45 @@ from tensorflow.python.util.tf_export import keras_export @keras_export("keras.optimizers.SGD") class SGD(optimizer_v2.OptimizerV2): - r"""Stochastic gradient descent and momentum optimizer. + r"""Gradient descent (with momentum) optimizer. - The update rule for $\theta$ with gradient $g$ when `momentum` is 0.0: - $$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * g_t$$ + Update rule for parameter `w` with gradient `g` when `momentum` is 0: - The update rule when `momentum` is larger than 0.0: - $$v_t = \mathrm{momentum} * v_{t-1} - \mathrm{learning\_rate} * g_t$$ - $$\theta_t = \theta_{t-1} + v_t$$ - if `nesterov` is False, gradient is evaluated at $\theta_t$. - if `nesterov` is True, gradient is evaluated at $\theta_t + momentum * v_t$, - and the variables always store $\theta + m v$ instead of $theta$ + ```python + w = w - learning_rate * g + ``` + + Update rule when `momentum` is larger than 0: + + ```python + velocity = momentum * velocity - learning_rate * g + w = w + velocity + ``` + + When `nesterov=True`, this rule becomes: + + ```python + velocity = momentum * velocity - learning_rate * g + w = w + momentum * velocity - learning_rate * g + ``` + + Args: + learning_rate: A `Tensor`, floating point value, or a schedule that is a + `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable + that takes no arguments and returns the actual value to use. The + learning rate. Defaults to 0.01. + momentum: float hyperparameter >= 0 that accelerates gradient descent + in the relevant + direction and dampens oscillations. Defaults to 0, i.e., vanilla gradient + descent. + nesterov: boolean. Whether to apply Nesterov momentum. 
+ Defaults to `False`. + name: Optional name prefix for the operations created when applying + gradients. Defaults to `"SGD"`. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. + `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. Usage: @@ -45,7 +74,7 @@ class SGD(optimizer_v2.OptimizerV2): >>> var = tf.Variable(1.0) >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var1) = var1 >>> step_count = opt.minimize(loss, [var]).numpy() - >>> # Step is `-learning_rate*grad` + >>> # Step is `- learning_rate * grad` >>> var.numpy() 0.9 @@ -53,7 +82,7 @@ class SGD(optimizer_v2.OptimizerV2): >>> var = tf.Variable(1.0) >>> val0 = var.value() >>> loss = lambda: (var ** 2)/2.0 # d(loss)/d(var1) = var1 - >>> # First step is `-learning_rate*grad` + >>> # First step is `- learning_rate * grad` >>> step_count = opt.minimize(loss, [var]).numpy() >>> val1 = var.value() >>> (val0 - val1).numpy() @@ -64,13 +93,8 @@ class SGD(optimizer_v2.OptimizerV2): >>> (val1 - val2).numpy() 0.18 - Some of the args below are hyperparameters, where a hyperparameter is - defined as a scalar Tensor, a regular Python value, or a callable (which - will be evaluated when `apply_gradients` is called) returning a scalar - Tensor or a Python value. - - # References - nesterov = True, See [Sutskever et al., 2013]( + Reference: + - For `nesterov=True`, See [Sutskever et al., 2013]( http://jmlr.org/proceedings/papers/v28/sutskever13.pdf). """ @@ -82,25 +106,6 @@ class SGD(optimizer_v2.OptimizerV2): nesterov=False, name="SGD", **kwargs): - """Construct a new Stochastic Gradient Descent or Momentum optimizer. - - Arguments: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable - that takes no arguments and returns the actual value to use. The - learning rate. Defaults to 0.01. 
- momentum: float hyperparameter >= 0 that accelerates SGD in the relevant - direction and dampens oscillations. Defaults to 0.0, i.e., SGD. - nesterov: boolean. Whether to apply Nesterov momentum. - Defaults to `False`. - name: Optional name prefix for the operations created when applying - gradients. Defaults to 'SGD'. - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - """ super(SGD, self).__init__(name, **kwargs) self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) self._set_hyper("decay", self._initial_decay) diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py index f22fbaaae3c..090eabacf1e 100644 --- a/tensorflow/python/keras/optimizer_v2/nadam.py +++ b/tensorflow/python/keras/optimizer_v2/nadam.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Nadam for TensorFlow.""" +"""Nadam optimizer implementation.""" +# pylint: disable=g-classes-have-attributes from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -36,29 +37,22 @@ class Nadam(optimizer_v2.OptimizerV2): Much like Adam is essentially RMSprop with momentum, Nadam is Adam with Nesterov momentum. - Initialization: + Args: + learning_rate: A `Tensor` or a floating point value. The learning rate. + beta_1: A float value or a constant float tensor. The exponential decay + rate for the 1st moment estimates. + beta_2: A float value or a constant float tensor. The exponential decay + rate for the 2nd moment estimates. 
+ epsilon: A small constant for numerical stability. + name: Optional name for the operations created when applying gradients. + Defaults to `"Nadam"`. + **kwargs: Keyword arguments. Allowed to be one of + `"clipnorm"` or `"clipvalue"`. + `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips + gradients by value. - $$m_0 := 0 \text{(Initialize 1st moment vector)}$$ - $$v_0 := 0 \text{(Initialize 2nd moment vector)}$$ - $$mu_0 := 1$$ - $$t := 0 \text{(Initialize timestep)}$$ - - Computes: - $$t := t + 1$$ - $$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$ - $$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$ - $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$ - $$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$ - $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$ - $$v' := v_t / (1 - \beta_2^t)$$ - $$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$ - $$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$ - - gradient is evaluated at theta(t) + momentum * v(t), and the variables always - store theta + beta_1 * m / sqrt(v) instead of theta. - - References - See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). + Reference: + - [Dozat, 2015](http://cs229.stanford.edu/proj2015/054_report.pdf). """ _HAS_AGGREGATE_GRAD = True @@ -70,24 +64,6 @@ class Nadam(optimizer_v2.OptimizerV2): epsilon=1e-7, name='Nadam', **kwargs): - """Construct a new Nadam optimizer. - - Args: - learning_rate: A Tensor or a floating point value. The learning rate. - beta_1: A float value or a constant float tensor. The exponential decay - rate for the 1st moment estimates. - beta_2: A float value or a constant float tensor. The exponential decay - rate for the exponentially weighted infinity norm. - epsilon: A small constant for numerical stability. - name: Optional name for the operations created when applying gradients. - Defaults to "Nadam". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. 
`clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for backward - compatibility, recommended to use `learning_rate` instead. - """ - # Backwards compatibility with keras NAdam optimizer. kwargs['decay'] = kwargs.pop('schedule_decay', 0.004) learning_rate = kwargs.get('lr', learning_rate) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index 37ec1e933ff..7b2e336678e 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Version 2 of class Optimizer.""" # pylint: disable=g-bad-name @@ -79,11 +78,10 @@ def _deduplicate_indexed_slices(values, indices): @six.add_metaclass(abc.ABCMeta) @keras_export("keras.optimizers.Optimizer") class OptimizerV2(trackable.Trackable): - """Updated base class for optimizers. + """Base class for Keras optimizers. - This class defines the API to add Ops to train a model. You never use this - class directly, but instead instantiate one of its subclasses such as - `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`. + You should not use this class directly, but instead instantiate one of its + subclasses such as `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`, etc. ### Usage @@ -101,7 +99,7 @@ class OptimizerV2(trackable.Trackable): opt.minimize(loss, var_list=[var1, var2]) ``` - ### Custom training loop with Keras models + ### Usage in custom training loops In Keras models, sometimes variables are created when the model is first called, instead of construction time. 
Examples include 1) sequential models @@ -109,6 +107,7 @@ class OptimizerV2(trackable.Trackable): callable in these cases. Example: + ```python opt = tf.keras.optimizers.SGD(learning_rate=0.1) model = tf.keras.Sequential() @@ -120,7 +119,7 @@ class OptimizerV2(trackable.Trackable): opt.minimize(loss_fn, var_list_fn) ``` - ### Processing gradients before applying them. + ### Processing gradients before applying them Calling `minimize()` takes care of both computing the gradients and applying them to the variables. If you want to process the gradients @@ -150,7 +149,7 @@ class OptimizerV2(trackable.Trackable): opt.apply_gradients(zip(processed_grads, var_list)) ``` - ### Use with `tf.distribute.Strategy`. + ### Use with `tf.distribute.Strategy` This optimizer class is `tf.distribute.Strategy` aware, which means it automatically sums gradients across all replicas. To average gradients, @@ -172,7 +171,7 @@ class OptimizerV2(trackable.Trackable): step. As a result, using `tf.math.reduce_mean` will give the wrong answer, resulting in gradients that can be many times too big. - ### Variable Constraint + ### Variable Constraints All Keras optimizers respect variable constraints. If constraint function is passed to any variable, the constraint will be applied to the variable after @@ -195,7 +194,7 @@ class OptimizerV2(trackable.Trackable): This can be useful if you want to log debug a training algorithm, report stats about the slots, etc. - ### Hyper parameters + ### Hyperparameters These are arguments passed to the optimizer subclass constructor (the `__init__` method), and then passed to `self._set_hyper()`. @@ -203,7 +202,7 @@ class OptimizerV2(trackable.Trackable): callables. If they are callable, the callable will be called during `apply_gradients()` to get the value for the hyper parameter. 
- Hyper parameters can be overwritten through user code: + Hyperparameters can be overwritten through user code: Example: @@ -220,7 +219,8 @@ class OptimizerV2(trackable.Trackable): opt.minimize(loss, var_list=[var1, var2]) ``` - ### Callable learning rate. + ### Callable learning rate + Optimizer accepts a callable learning rate in two ways. The first way is through built-in or customized `tf.keras.optimizers.schedules.LearningRateSchedule`. The schedule will be @@ -250,14 +250,17 @@ class OptimizerV2(trackable.Trackable): >>> opt.minimize(loss, var_list=[var]) >> opt = tf.keras.optimizers.RMSprop(learning_rate=0.1) >>> var1 = tf.Variable(10.0) - >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) = var1 + >>> loss = lambda: (var1 ** 2) / 2.0 # d(loss) / d(var1) = var1 >>> step_count = opt.minimize(loss, [var1]).numpy() >>> var1.numpy() 9.683772 - References - See ([pdf] - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). + Reference: + - [Hinton, 2012]( + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) """ _HAS_AGGREGATE_GRAD = True