diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 36fa9fd0bc0..090a9d126e6 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5206,7 +5206,7 @@ py_library( "//tensorflow/python/distribute:reduce_util", "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:context", - "//tensorflow/python/keras/optimizer_v2:learning_rate_schedule", + "//tensorflow/python/keras/optimizer_v2:legacy_learning_rate_decay", "//tensorflow/python/ops/losses", "//third_party/py/numpy", "@six_archive//:six", @@ -6602,7 +6602,6 @@ cuda_py_tests( "training/device_setter_test.py", "training/ftrl_test.py", "training/gradient_descent_test.py", - "training/learning_rate_decay_test.py", "training/momentum_test.py", "training/optimizer_test.py", "training/proximal_adagrad_test.py", diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD index c5eab79f6c2..8636ffb237e 100644 --- a/tensorflow/python/keras/optimizer_v2/BUILD +++ b/tensorflow/python/keras/optimizer_v2/BUILD @@ -66,6 +66,20 @@ py_library( ], ) +py_library( + name = "legacy_learning_rate_decay", + srcs = ["legacy_learning_rate_decay.py"], + srcs_version = "PY2AND3", + deps = [ + ":learning_rate_schedule", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:tf_export", + "//tensorflow/python/eager:context", + ], +) + cuda_py_test( name = "adagrad_test", size = "medium", @@ -245,6 +259,21 @@ cuda_py_test( ], ) +cuda_py_test( + name = "legacy_learning_rate_decay_test", + size = "medium", + srcs = ["legacy_learning_rate_decay_test.py"], + deps = [ + ":legacy_learning_rate_decay", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:training_lib", + "//tensorflow/python:variables", + "//tensorflow/python/eager:context", + ], +) + cuda_py_test( name = "rmsprop_test", size = "medium", diff --git a/tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay.py b/tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay.py new file mode 100644 index 00000000000..f86e68d188f --- /dev/null +++ b/tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay.py @@ -0,0 +1,771 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Various learning rate decay functions.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools + +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule +from tensorflow.python.ops import math_ops +from tensorflow.python.util.tf_export import tf_export + + +@tf_export(v1=["train.exponential_decay"]) +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): + """Applies exponential decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies an exponential decay function + to a provided initial learning rate. It requires a `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + decayed_learning_rate = learning_rate * + decay_rate ^ (global_step / decay_steps) + ``` + + If the argument `staircase` is `True`, then `global_step / decay_steps` is an + integer division and the decayed learning rate follows a staircase function. + + Example: decay every 100000 steps with a base of 0.96: + + ```python + ... + global_step = tf.Variable(0, trainable=False) + starter_learning_rate = 0.1 + learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, + global_step, + 100000, 0.96, staircase=True) + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global + step to use for the decay computation. Must not be negative. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must + be positive. See the decay computation above. + decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number. + The decay rate. + staircase: Boolean. If `True` decay the learning rate at discrete intervals + name: String. Optional name of the operation. Defaults to + 'ExponentialDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. 
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.ExponentialDecay( + learning_rate, decay_steps, decay_rate, staircase=staircase, name=name) + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"]) +def piecewise_constant(x, boundaries, values, name=None): + """Piecewise constant from boundaries and interval values. + + Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 + for the next 10000 steps, and 0.1 for any additional steps. + + ```python + global_step = tf.Variable(0, trainable=False) + boundaries = [100000, 110000] + values = [1.0, 0.5, 0.1] + learning_rate = tf.compat.v1.train.piecewise_constant(global_step, boundaries, + values) + + # Later, whenever we perform an optimization step, we increment global_step. + ``` + + Args: + x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, + `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. + boundaries: A list of `Tensor`s or `int`s or `float`s with strictly + increasing entries, and with all elements having the same type as `x`. + values: A list of `Tensor`s or `float`s or `int`s that specifies the values + for the intervals defined by `boundaries`. It should have one more element + than `boundaries`, and all elements should have the same type. + name: A string. Optional name of the operation. Defaults to + 'PiecewiseConstant'. + + Returns: + A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, + `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., + and values[-1] when `x > boundaries[-1]`. + + Raises: + ValueError: if types of `x` and `boundaries` do not match, or types of all + `values` do not match or + the number of elements in the lists does not match. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + @end_compatibility + """ + boundaries = ops.convert_n_to_tensor(boundaries) + values = ops.convert_n_to_tensor(values) + x_recomp = ops.convert_to_tensor(x) + # Avoid explicit conversion to x's dtype. This could result in faulty + # comparisons, for example if floats are converted to integers. + for i, b in enumerate(boundaries): + if b.dtype.base_dtype != x_recomp.dtype.base_dtype: + # We can promote int32 boundaries to int64 without loss of precision. + # This covers the most common case where the user passes in boundaries + # as an array of Python integers. + if (b.dtype.base_dtype == dtypes.int32 and + x_recomp.dtype.base_dtype == dtypes.int64): + b = math_ops.cast(b, x_recomp.dtype.base_dtype) + boundaries[i] = b + else: + raise ValueError( + "Boundaries (%s) must have the same dtype as x (%s)." % + (b.dtype.base_dtype, x_recomp.dtype.base_dtype)) + for v in values[1:]: + if v.dtype.base_dtype != values[0].dtype.base_dtype: + raise ValueError( + "Values must have elements all with the same dtype (%s vs %s)." 
% + (values[0].dtype.base_dtype, v.dtype.base_dtype)) + decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( + boundaries, values, name=name) + if not context.executing_eagerly(): + decayed_lr = decayed_lr(x) + else: + decayed_lr = functools.partial(decayed_lr, x) + return decayed_lr + + +@tf_export(v1=["train.polynomial_decay"]) +def polynomial_decay(learning_rate, + global_step, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + name=None): + """Applies a polynomial decay to the learning rate. + + It is commonly observed that a monotonically decreasing learning rate, whose + degree of change is carefully chosen, results in a better performing model. + This function applies a polynomial decay function to a provided initial + `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. + + It requires a `global_step` value to compute the decayed learning rate. You + can just pass a TensorFlow variable that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + global_step = min(global_step, decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + + ``` + + If `cycle` is True then a multiple of `decay_steps` is used, the first one + that is bigger than `global_steps`. + + ```python + decay_steps = decay_steps * ceil(global_step / decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + + ``` + + Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): + + ```python + ... + global_step = tf.Variable(0, trainable=False) + starter_learning_rate = 0.1 + end_learning_rate = 0.01 + decay_steps = 10000 + learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, + global_step, + decay_steps, end_learning_rate, + power=0.5) + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global + step to use for the decay computation. Must not be negative. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must + be positive. See the decay computation above. + end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python + number. The minimal end learning rate. + power: A scalar `float32` or `float64` `Tensor` or a Python number. The + power of the polynomial. Defaults to linear, 1.0. + cycle: A boolean, whether or not it should cycle beyond decay_steps. + name: String. Optional name of the operation. Defaults to + 'PolynomialDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. 
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.PolynomialDecay( + learning_rate, + decay_steps, + end_learning_rate=end_learning_rate, + power=power, + cycle=cycle, + name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.natural_exp_decay"]) +def natural_exp_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): + """Applies natural exponential decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies an exponential decay function + to a provided initial learning rate. It requires an `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + decayed_learning_rate = learning_rate * exp(-decay_rate * global_step / + decay_step) + ``` + + or, if `staircase` is `True`, as: + + ```python + decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step / + decay_step)) + ``` + + Example: decay exponentially with a base of 0.96: + + ```python + ... + global_step = tf.Variable(0, trainable=False) + learning_rate = 0.1 + decay_steps = 5 + k = 0.5 + learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate, + global_step, + decay_steps, k) + + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: A Python number. Global step to use for the decay computation. + Must not be negative. + decay_steps: How often to apply decay. + decay_rate: A Python number. The decay rate. + staircase: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. + name: String. Optional name of the operation. Defaults to + 'ExponentialTimeDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + @end_compatibility + """ + natural_exp_rate = math_ops.exp(math_ops.negative(decay_rate)) + decayed_lr = learning_rate_schedule.ExponentialDecay( + learning_rate, + decay_steps, + natural_exp_rate, + staircase=staircase, + name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.inverse_time_decay"]) +def inverse_time_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): + """Applies inverse time decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies an inverse decay function + to a provided initial learning rate. 
It requires an `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + + ```python + decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / + decay_step) + ``` + + or, if `staircase` is `True`, as: + + ```python + decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / + decay_step)) + ``` + + Example: decay 1/t with a rate of 0.5: + + ```python + ... + global_step = tf.Variable(0, trainable=False) + learning_rate = 0.1 + decay_steps = 1.0 + decay_rate = 0.5 + learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate, + global_step, + decay_steps, decay_rate) + + # Passing global_step to minimize() will increment it at each step. + learning_step = ( + tf.compat.v1.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) + ) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. + The initial learning rate. + global_step: A Python number. Global step to use for the decay computation. + Must not be negative. + decay_steps: How often to apply decay. + decay_rate: A Python number. The decay rate. + staircase: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. + name: String. Optional name of the operation. Defaults to + 'InverseTimeDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + + Raises: + ValueError: if `global_step` is not supplied. + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + @end_compatibility + """ + decayed_lr = learning_rate_schedule.InverseTimeDecay( + learning_rate, decay_steps, decay_rate, staircase=staircase, name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.cosine_decay"]) +def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): + """Applies cosine decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a cosine decay function + to a provided initial learning rate. It requires a `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + ```python + global_step = min(global_step, decay_steps) + cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) + decayed = (1 - alpha) * cosine_decay + alpha + decayed_learning_rate = learning_rate * decayed + ``` + + Example usage: + ```python + decay_steps = 1000 + lr_decayed = cosine_decay(learning_rate, global_step, decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global + step to use for the decay computation. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number + of steps to decay over. 
+ alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum + learning rate value as a fraction of learning_rate. + name: String. Optional name of the operation. Defaults to 'CosineDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + + References: + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + @end_compatibility + """ + decayed_lr = learning_rate_schedule.CosineDecay( + learning_rate, decay_steps, alpha=alpha, name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.cosine_decay_restarts"]) +def cosine_decay_restarts(learning_rate, + global_step, + first_decay_steps, + t_mul=2.0, + m_mul=1.0, + alpha=0.0, + name=None): + """Applies cosine decay with restarts to the learning rate. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a cosine decay function with + restarts to a provided initial learning rate. It requires a `global_step` + value to compute the decayed learning rate. You can just pass a TensorFlow + variable that you increment at each training step. + + The function returns the decayed learning rate while taking into account + possible warm restarts. The learning rate multiplier first decays + from 1 to `alpha` for `first_decay_steps` steps. Then, a warm + restart is performed. Each new warm restart runs for `t_mul` times more steps + and with `m_mul` times smaller initial learning rate. + + Example usage: + ```python + first_decay_steps = 1000 + lr_decayed = cosine_decay_restarts(learning_rate, global_step, + first_decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global + step to use for the decay computation. + first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. + Number of steps to decay over. + t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. Used to + derive the number of iterations in the i-th period. + m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. + Used to derive the initial learning rate of the i-th period. + alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum + learning rate value as a fraction of the learning_rate. + name: String. Optional name of the operation. Defaults to 'SGDRDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied.
+ + References: + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions. + @end_compatibility + """ + decayed_lr = learning_rate_schedule.CosineDecayRestarts( + learning_rate, + first_decay_steps, + t_mul=t_mul, + m_mul=m_mul, + alpha=alpha, + name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.linear_cosine_decay"]) +def linear_cosine_decay(learning_rate, + global_step, + decay_steps, + num_periods=0.5, + alpha=0.0, + beta=0.001, + name=None): + """Applies linear cosine decay to the learning rate. + + Note that linear cosine decay is more aggressive than cosine decay and + larger initial learning rates can typically be used. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a linear cosine decay function + to a provided initial learning rate. It requires a `global_step` value to + compute the decayed learning rate. You can just pass a TensorFlow variable + that you increment at each training step. + + The function returns the decayed learning rate. It is computed as: + ```python + global_step = min(global_step, decay_steps) + linear_decay = (decay_steps - global_step) / decay_steps + cosine_decay = 0.5 * ( + 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) + decayed = (alpha + linear_decay) * cosine_decay + beta + decayed_learning_rate = learning_rate * decayed + ``` + + Example usage: + ```python + decay_steps = 1000 + lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global + step to use for the decay computation. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number + of steps to decay over. + num_periods: Number of periods in the cosine part of the decay. See + computation above. + alpha: See computation above. + beta: See computation above. + name: String. Optional name of the operation. Defaults to + 'LinearCosineDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + + References: + Neural Optimizer Search with Reinforcement Learning: + [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) + ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions.
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.LinearCosineDecay( + learning_rate, + decay_steps, + num_periods=num_periods, + alpha=alpha, + beta=beta, + name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr + + +@tf_export(v1=["train.noisy_linear_cosine_decay"]) +def noisy_linear_cosine_decay(learning_rate, + global_step, + decay_steps, + initial_variance=1.0, + variance_decay=0.55, + num_periods=0.5, + alpha=0.0, + beta=0.001, + name=None): + """Applies noisy linear cosine decay to the learning rate. + + Note that linear cosine decay is more aggressive than cosine decay and + larger initial learning rates can typically be used. + + When training a model, it is often recommended to lower the learning rate as + the training progresses. This function applies a noisy linear + cosine decay function to a provided initial learning rate. + It requires a `global_step` value to compute the decayed learning rate. + You can just pass a TensorFlow variable that you increment at each + training step. + + The function returns the decayed learning rate. It is computed as: + ```python + global_step = min(global_step, decay_steps) + linear_decay = (decay_steps - global_step) / decay_steps + cosine_decay = 0.5 * ( + 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) + decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta + decayed_learning_rate = learning_rate * decayed + ``` + where eps_t is 0-centered Gaussian noise with variance + initial_variance / (1 + global_step) ** variance_decay + + Example usage: + ```python + decay_steps = 1000 + lr_decayed = noisy_linear_cosine_decay( + learning_rate, global_step, decay_steps) + ``` + + Args: + learning_rate: A scalar `float32` or `float64` Tensor or a Python number. + The initial learning rate. + global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global + step to use for the decay computation. + decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number + of steps to decay over. + initial_variance: initial variance for the noise. See computation above. + variance_decay: decay for the noise's variance. See computation above. + num_periods: Number of periods in the cosine part of the decay. See + computation above. + alpha: See computation above. + beta: See computation above. + name: String. Optional name of the operation. Defaults to + 'NoisyLinearCosineDecay'. + + Returns: + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + Raises: + ValueError: if `global_step` is not supplied. + + References: + Neural Optimizer Search with Reinforcement Learning: + [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) + ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) + Stochastic Gradient Descent with Warm Restarts: + [Loshchilov et al., 2017] + (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) + ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) + + @compatibility(eager) + When eager execution is enabled, this function returns a function which in + turn returns the decayed learning rate Tensor. This can be useful for changing + the learning rate value across different invocations of optimizer functions.
+ @end_compatibility + """ + decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay( + learning_rate, + decay_steps, + initial_variance=initial_variance, + variance_decay=variance_decay, + num_periods=num_periods, + alpha=alpha, + beta=beta, + name=name) + + if not context.executing_eagerly(): + decayed_lr = decayed_lr(global_step) + else: + decayed_lr = functools.partial(decayed_lr, global_step) + return decayed_lr diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay_test.py similarity index 99% rename from tensorflow/python/training/learning_rate_decay_test.py rename to tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay_test.py index 1029d4cea8f..b5a3197ca67 100644 --- a/tensorflow/python/training/learning_rate_decay_test.py +++ b/tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay_test.py @@ -22,11 +22,11 @@ import math from tensorflow.python.eager import context from tensorflow.python.framework import test_util +from tensorflow.python.keras.optimizer_v2 import legacy_learning_rate_decay as learning_rate_decay # Import resource_variable_ops for the variables-to-tensor implicit conversion. from tensorflow.python.ops import resource_variable_ops # pylint: disable=unused-import from tensorflow.python.ops import variables from tensorflow.python.platform import googletest -from tensorflow.python.training import learning_rate_decay class LRDecayTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py index f86e68d188f..86a718f8c5b 100644 --- a/tensorflow/python/training/learning_rate_decay.py +++ b/tensorflow/python/training/learning_rate_decay.py @@ -17,755 +17,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import functools -from tensorflow.python.eager import context -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule -from tensorflow.python.ops import math_ops -from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.keras.optimizer_v2 import legacy_learning_rate_decay as learning_rate_decay -@tf_export(v1=["train.exponential_decay"]) -def exponential_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies exponential decay to the learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies an exponential decay function - to a provided initial learning rate. It requires a `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - decayed_learning_rate = learning_rate * - decay_rate ^ (global_step / decay_steps) - ``` - - If the argument `staircase` is `True`, then `global_step / decay_steps` is an - integer division and the decayed learning rate follows a staircase function. - - Example: decay every 100000 steps with a base of 0.96: - - ```python - ... 
- global_step = tf.Variable(0, trainable=False) - starter_learning_rate = 0.1 - learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, - global_step, - 100000, 0.96, staircase=True) - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. Must not be negative. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must - be positive. See the decay computation above. - decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The decay rate. - staircase: Boolean. If `True` decay the learning rate at discrete intervals - name: String. Optional name of the operation. Defaults to - 'ExponentialDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.ExponentialDecay( - learning_rate, decay_steps, decay_rate, staircase=staircase, name=name) - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"]) -def piecewise_constant(x, boundaries, values, name=None): - """Piecewise constant from boundaries and interval values. - - Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 - for the next 10000 steps, and 0.1 for any additional steps. - - ```python - global_step = tf.Variable(0, trainable=False) - boundaries = [100000, 110000] - values = [1.0, 0.5, 0.1] - learning_rate = tf.compat.v1.train.piecewise_constant(global_step, boundaries, - values) - - # Later, whenever we perform an optimization step, we increment global_step. - ``` - - Args: - x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, - `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. - boundaries: A list of `Tensor`s or `int`s or `float`s with strictly - increasing entries, and with all elements having the same type as `x`. - values: A list of `Tensor`s or `float`s or `int`s that specifies the values - for the intervals defined by `boundaries`. It should have one more element - than `boundaries`, and all elements should have the same type. - name: A string. Optional name of the operation. Defaults to - 'PiecewiseConstant'. - - Returns: - A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, - `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., - and values[-1] when `x > boundaries[-1]`. - - Raises: - ValueError: if types of `x` and `boundaries` do not match, or types of all - `values` do not match or - the number of elements in the lists does not match. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. 
This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - boundaries = ops.convert_n_to_tensor(boundaries) - values = ops.convert_n_to_tensor(values) - x_recomp = ops.convert_to_tensor(x) - # Avoid explicit conversion to x's dtype. This could result in faulty - # comparisons, for example if floats are converted to integers. - for i, b in enumerate(boundaries): - if b.dtype.base_dtype != x_recomp.dtype.base_dtype: - # We can promote int32 boundaries to int64 without loss of precision. - # This covers the most common case where the user passes in boundaries - # as an array of Python integers. - if (b.dtype.base_dtype == dtypes.int32 and - x_recomp.dtype.base_dtype == dtypes.int64): - b = math_ops.cast(b, x_recomp.dtype.base_dtype) - boundaries[i] = b - else: - raise ValueError( - "Boundaries (%s) must have the same dtype as x (%s)." % - (b.dtype.base_dtype, x_recomp.dtype.base_dtype)) - for v in values[1:]: - if v.dtype.base_dtype != values[0].dtype.base_dtype: - raise ValueError( - "Values must have elements all with the same dtype (%s vs %s)." % - (values[0].dtype.base_dtype, v.dtype.base_dtype)) - decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( - boundaries, values, name=name) - if not context.executing_eagerly(): - decayed_lr = decayed_lr(x) - else: - decayed_lr = functools.partial(decayed_lr, x) - return decayed_lr - - -@tf_export(v1=["train.polynomial_decay"]) -def polynomial_decay(learning_rate, - global_step, - decay_steps, - end_learning_rate=0.0001, - power=1.0, - cycle=False, - name=None): - """Applies a polynomial decay to the learning rate. - - It is commonly observed that a monotonically decreasing learning rate, whose - degree of change is carefully chosen, results in a better performing model. - This function applies a polynomial decay function to a provided initial - `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. - - It requires a `global_step` value to compute the decayed learning rate. You - can just pass a TensorFlow variable that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ (power) + - end_learning_rate - - ``` - - If `cycle` is True then a multiple of `decay_steps` is used, the first one - that is bigger than `global_steps`. - - ```python - decay_steps = decay_steps * ceil(global_step / decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ (power) + - end_learning_rate - - ``` - - Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): - - ```python - ... - global_step = tf.Variable(0, trainable=False) - starter_learning_rate = 0.1 - end_learning_rate = 0.01 - decay_steps = 10000 - learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, - global_step, - decay_steps, end_learning_rate, - power=0.5) - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 
Global - step to use for the decay computation. Must not be negative. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must - be positive. See the decay computation above. - end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python - number. The minimal end learning rate. - power: A scalar `float32` or `float64` `Tensor` or a Python number. The - power of the polynomial. Defaults to linear, 1.0. - cycle: A boolean, whether or not it should cycle beyond decay_steps. - name: String. Optional name of the operation. Defaults to - 'PolynomialDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.PolynomialDecay( - learning_rate, - decay_steps, - end_learning_rate=end_learning_rate, - power=power, - cycle=cycle, - name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.natural_exp_decay"]) -def natural_exp_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies natural exponential decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies an exponential decay function - to a provided initial learning rate. It requires an `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - decayed_learning_rate = learning_rate * exp(-decay_rate * global_step / - decay_step) - ``` - - or, if `staircase` is `True`, as: - - ```python - decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step / - decay_step)) - ``` - - Example: decay exponentially with a base of 0.96: - - ```python - ... - global_step = tf.Variable(0, trainable=False) - learning_rate = 0.1 - decay_steps = 5 - k = 0.5 - learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate, - global_step, - decay_steps, k) - - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A Python number. Global step to use for the decay computation. - Must not be negative. - decay_steps: How often to apply decay. - decay_rate: A Python number. The decay rate. - staircase: Whether to apply decay in a discrete staircase, as opposed to - continuous, fashion. - name: String. Optional name of the operation. Defaults to - 'ExponentialTimeDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. 
- - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - natural_exp_rate = math_ops.exp(math_ops.negative(decay_rate)) - decayed_lr = learning_rate_schedule.ExponentialDecay( - learning_rate, - decay_steps, - natural_exp_rate, - staircase=staircase, - name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.inverse_time_decay"]) -def inverse_time_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False, - name=None): - """Applies inverse time decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies an inverse decay function - to a provided initial learning rate. It requires an `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - - ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / - decay_step) - ``` - - or, if `staircase` is `True`, as: - - ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / - decay_step)) - ``` - - Example: decay 1/t with a rate of 0.5: - - ```python - ... - global_step = tf.Variable(0, trainable=False) - learning_rate = 0.1 - decay_steps = 1.0 - decay_rate = 0.5 - learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate, - global_step, - decay_steps, decay_rate) - - # Passing global_step to minimize() will increment it at each step. - learning_step = ( - tf.compat.v1.train.GradientDescentOptimizer(learning_rate) - .minimize(...my loss..., global_step=global_step) - ) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. - The initial learning rate. - global_step: A Python number. Global step to use for the decay computation. - Must not be negative. - decay_steps: How often to apply decay. - decay_rate: A Python number. The decay rate. - staircase: Whether to apply decay in a discrete staircase, as opposed to - continuous, fashion. - name: String. Optional name of the operation. Defaults to - 'InverseTimeDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - - Raises: - ValueError: if `global_step` is not supplied. - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.InverseTimeDecay( - learning_rate, decay_steps, decay_rate, staircase=staircase, name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.cosine_decay"]) -def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): - """Applies cosine decay to the learning rate. 
- - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a cosine decay function - to a provided initial learning rate. It requires a `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - ```python - global_step = min(global_step, decay_steps) - cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) - decayed = (1 - alpha) * cosine_decay + alpha - decayed_learning_rate = learning_rate * decayed - ``` - - Example usage: - ```python - decay_steps = 1000 - lr_decayed = cosine_decay(learning_rate, global_step, decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number - of steps to decay over. - alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum - learning rate value as a fraction of learning_rate. - name: String. Optional name of the operation. Defaults to 'CosineDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. - - References: - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx¬eId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.CosineDecay( - learning_rate, decay_steps, alpha=alpha, name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.cosine_decay_restarts"]) -def cosine_decay_restarts(learning_rate, - global_step, - first_decay_steps, - t_mul=2.0, - m_mul=1.0, - alpha=0.0, - name=None): - """Applies cosine decay with restarts to the learning rate. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a cosine decay function with - restarts to a provided initial learning rate. It requires a `global_step` - value to compute the decayed learning rate. You can just pass a TensorFlow - variable that you increment at each training step. - - The function returns the decayed learning rate while taking into account - possible warm restarts. The learning rate multiplier first decays - from 1 to `alpha` for `first_decay_steps` steps. Then, a warm - restart is performed. Each new warm restart runs for `t_mul` times more steps - and with `m_mul` times smaller initial learning rate. - - Example usage: - ```python - first_decay_steps = 1000 - lr_decayed = cosine_decay_restarts(learning_rate, global_step, - first_decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 
Global - step to use for the decay computation. - first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. - Number of steps to decay over. - t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. Used to - derive the number of iterations in the i-th period - m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. - Used to derive the initial learning rate of the i-th period: - alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum - learning rate value as a fraction of the learning_rate. - name: String. Optional name of the operation. Defaults to 'SGDRDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. - - References: - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx¬eId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.CosineDecayRestarts( - learning_rate, - first_decay_steps, - t_mul=t_mul, - m_mul=m_mul, - alpha=alpha, - name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.linear_cosine_decay"]) -def linear_cosine_decay(learning_rate, - global_step, - decay_steps, - num_periods=0.5, - alpha=0.0, - beta=0.001, - name=None): - """Applies linear cosine decay to the learning rate. - - Note that linear cosine decay is more aggressive than cosine decay and - larger initial learning rates can typically be used. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a linear cosine decay function - to a provided initial learning rate. It requires a `global_step` value to - compute the decayed learning rate. You can just pass a TensorFlow variable - that you increment at each training step. - - The function returns the decayed learning rate. It is computed as: - ```python - global_step = min(global_step, decay_steps) - linear_decay = (decay_steps - global_step) / decay_steps) - cosine_decay = 0.5 * ( - 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) - decayed = (alpha + linear_decay) * cosine_decay + beta - decayed_learning_rate = learning_rate * decayed - ``` - - Example usage: - ```python - decay_steps = 1000 - lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number - of steps to decay over. - num_periods: Number of periods in the cosine part of the decay. See - computation above. - alpha: See computation above. - beta: See computation above. - name: String. Optional name of the operation. Defaults to - 'LinearCosineDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. 
- Raises: - ValueError: if `global_step` is not supplied. - - References: - Neural Optimizer Search with Reinforcement Learning: - [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) - ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx¬eId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.LinearCosineDecay( - learning_rate, - decay_steps, - num_periods=num_periods, - alpha=alpha, - beta=beta, - name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr - - -@tf_export(v1=["train.noisy_linear_cosine_decay"]) -def noisy_linear_cosine_decay(learning_rate, - global_step, - decay_steps, - initial_variance=1.0, - variance_decay=0.55, - num_periods=0.5, - alpha=0.0, - beta=0.001, - name=None): - """Applies noisy linear cosine decay to the learning rate. - - Note that linear cosine decay is more aggressive than cosine decay and - larger initial learning rates can typically be used. - - When training a model, it is often recommended to lower the learning rate as - the training progresses. This function applies a noisy linear - cosine decay function to a provided initial learning rate. - It requires a `global_step` value to compute the decayed learning rate. - You can just pass a TensorFlow variable that you increment at each - training step. - - The function returns the decayed learning rate. It is computed as: - ```python - global_step = min(global_step, decay_steps) - linear_decay = (decay_steps - global_step) / decay_steps) - cosine_decay = 0.5 * ( - 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) - decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta - decayed_learning_rate = learning_rate * decayed - ``` - where eps_t is 0-centered gaussian noise with variance - initial_variance / (1 + global_step) ** variance_decay - - Example usage: - ```python - decay_steps = 1000 - lr_decayed = noisy_linear_cosine_decay( - learning_rate, global_step, decay_steps) - ``` - - Args: - learning_rate: A scalar `float32` or `float64` Tensor or a Python number. - The initial learning rate. - global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global - step to use for the decay computation. - decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number - of steps to decay over. - initial_variance: initial variance for the noise. See computation above. - variance_decay: decay for the noise's variance. See computation above. - num_periods: Number of periods in the cosine part of the decay. See - computation above. - alpha: See computation above. - beta: See computation above. - name: String. Optional name of the operation. Defaults to - 'NoisyLinearCosineDecay'. - - Returns: - A scalar `Tensor` of the same type as `learning_rate`. The decayed - learning rate. - Raises: - ValueError: if `global_step` is not supplied. 
- - References: - Neural Optimizer Search with Reinforcement Learning: - [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) - ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) - Stochastic Gradient Descent with Warm Restarts: - [Loshchilov et al., 2017] - (https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) - ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) - - @compatibility(eager) - When eager execution is enabled, this function returns a function which in - turn returns the decayed learning rate Tensor. This can be useful for changing - the learning rate value across different invocations of optimizer functions. - @end_compatibility - """ - decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay( - learning_rate, - decay_steps, - initial_variance=initial_variance, - variance_decay=variance_decay, - num_periods=num_periods, - alpha=alpha, - beta=beta, - name=name) - - if not context.executing_eagerly(): - decayed_lr = decayed_lr(global_step) - else: - decayed_lr = functools.partial(decayed_lr, global_step) - return decayed_lr +exponential_decay = learning_rate_decay.exponential_decay +piecewise_constant = learning_rate_decay.piecewise_constant +polynomial_decay = learning_rate_decay.polynomial_decay +natural_exp_decay = learning_rate_decay.natural_exp_decay +inverse_time_decay = learning_rate_decay.inverse_time_decay +cosine_decay = learning_rate_decay.cosine_decay +cosine_decay_restarts = learning_rate_decay.cosine_decay_restarts +linear_cosine_decay = learning_rate_decay.linear_cosine_decay +noisy_linear_cosine_decay = learning_rate_decay.noisy_linear_cosine_decay
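Reviewer note: a minimal usage sketch (not part of the change itself) of the contract the moved docstrings describe. Because `learning_rate_decay.py` now only aliases the implementations relocated to `keras/optimizer_v2/legacy_learning_rate_decay.py` and the `tf_export` names are untouched, existing `tf.compat.v1.train.*` call sites keep working: in graph mode each helper returns a scalar learning-rate `Tensor`, while under eager execution it returns a zero-argument callable (the schedule partially applied to `global_step`).

```python
# Sketch only; assumes a TensorFlow build that exposes the tf.compat.v1 namespace.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # run the v1-style example in graph mode

global_step = tf.Variable(0, trainable=False)
# Same public endpoint as before this change; now backed by
# tensorflow/python/keras/optimizer_v2/legacy_learning_rate_decay.py.
learning_rate = tf.train.exponential_decay(
    0.1, global_step, decay_steps=100000, decay_rate=0.96, staircase=True)

# In graph mode `learning_rate` is a scalar Tensor and can be passed straight
# to a v1 optimizer; with eager execution enabled the same call would instead
# return a zero-argument function that recomputes the decayed rate on demand.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
```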