|
|
|
@@ -62,7 +62,55 @@ class LearningRateSchedule(object):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.optimizers.schedules.ExponentialDecay")
|
|
|
|
|
class ExponentialDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses an exponential decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses an exponential decay schedule.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies an exponential decay function
|
|
|
|
|
to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
return initial_learning_rate * decay_rate ** (step / decay_steps)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
If the argument `staircase` is `True`, then `step / decay_steps` is
|
|
|
|
|
an integer division and the decayed learning rate follows a
|
|
|
|
|
staircase function.
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate.
|
|
|
|
|
Example: When fitting a Keras model, decay every 100000 steps with a base
|
|
|
|
|
of 0.96:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
initial_learning_rate = 0.1
|
|
|
|
|
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
|
|
|
|
|
initial_learning_rate,
|
|
|
|
|
decay_steps=100000,
|
|
|
|
|
decay_rate=0.96,
|
|
|
|
|
staircase=True)
|
|
|
|
|
|
|
|
|
|
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
|
|
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
|
|
metrics=['accuracy'])
|
|
|
|
|
|
|
|
|
|
model.fit(data, labels, epochs=5)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
The learning rate schedule is also serializable and deserializable using
|
|
|
|
|
`tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -73,48 +121,6 @@ class ExponentialDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies exponential decay to the learning rate.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies an exponential decay function
|
|
|
|
|
to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
return initial_learning_rate * decay_rate ** (step / decay_steps)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
If the argument `staircase` is `True`, then `step / decay_steps` is
|
|
|
|
|
an integer division and the decayed learning rate follows a
|
|
|
|
|
staircase function.
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate.
|
|
|
|
|
Example: When fitting a Keras model, decay every 100000 steps with a base
|
|
|
|
|
of 0.96:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
initial_learning_rate = 0.1
|
|
|
|
|
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
|
|
|
|
|
initial_learning_rate,
|
|
|
|
|
decay_steps=100000,
|
|
|
|
|
decay_rate=0.96,
|
|
|
|
|
staircase=True)
|
|
|
|
|
|
|
|
|
|
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
|
|
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
|
|
metrics=['accuracy'])
|
|
|
|
|
|
|
|
|
|
model.fit(data, labels, epochs=5)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
The learning rate schedule is also serializable and deserializable using
|
|
|
|
|
`tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
|
|
|
|
|
Python number. The initial learning rate.
|
|
|
|
@@ -126,11 +132,6 @@ class ExponentialDecay(LearningRateSchedule):
|
|
|
|
|
intervals
|
|
|
|
|
name: String. Optional name of the operation. Defaults to
|
|
|
|
|
'ExponentialDecay'.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(ExponentialDecay, self).__init__()
|
|
|
|
|
self.initial_learning_rate = initial_learning_rate
|
|
|
|
@@ -166,7 +167,41 @@ class ExponentialDecay(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
|
|
|
|
|
class PiecewiseConstantDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses a piecewise constant decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses a piecewise constant decay schedule.
|
|
|
|
|
|
|
|
|
|
The function returns a 1-arg callable to compute the piecewise constant
|
|
|
|
|
when passed the current optimizer step. This can be useful for changing the
|
|
|
|
|
learning rate value across different invocations of optimizer functions.
|
|
|
|
|
|
|
|
|
|
Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
|
|
|
|
|
for the next 10000 steps, and 0.1 for any additional steps.
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
step = tf.Variable(0, trainable=False)
|
|
|
|
|
boundaries = [100000, 110000]
|
|
|
|
|
values = [1.0, 0.5, 0.1]
|
|
|
|
|
learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
|
|
|
|
|
boundaries, values)
|
|
|
|
|
|
|
|
|
|
# Later, whenever we perform an optimization step, we pass in the step.
|
|
|
|
|
learning_rate = learning_rate_fn(step)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as the boundary tensors.
|
|
|
|
|
|
|
|
|
|
The output of the 1-arg function that takes the `step`
|
|
|
|
|
is `values[0]` when `step <= boundaries[0]`,
|
|
|
|
|
`values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
|
|
|
|
|
and `values[-1]` when `step > boundaries[-1]`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -175,29 +210,6 @@ class PiecewiseConstantDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Piecewise constant from boundaries and interval values.
|
|
|
|
|
|
|
|
|
|
The function returns a 1-arg callable to compute the piecewise constant
|
|
|
|
|
when passed the current optimizer step. This can be useful for changing the
|
|
|
|
|
learning rate value across different invocations of optimizer functions.
|
|
|
|
|
|
|
|
|
|
Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
|
|
|
|
|
for the next 10000 steps, and 0.1 for any additional steps.
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
step = tf.Variable(0, trainable=False)
|
|
|
|
|
boundaries = [100000, 110000]
|
|
|
|
|
values = [1.0, 0.5, 0.1]
|
|
|
|
|
learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
|
|
|
|
|
boundaries, values)
|
|
|
|
|
|
|
|
|
|
# Later, whenever we perform an optimization step, we pass in the step.
|
|
|
|
|
learning_rate = learning_rate_fn(step)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
|
|
|
|
|
increasing entries, and with all elements having the same type as the
|
|
|
|
@@ -209,16 +221,6 @@ class PiecewiseConstantDecay(LearningRateSchedule):
|
|
|
|
|
name: A string. Optional name of the operation. Defaults to
|
|
|
|
|
'PiecewiseConstant'.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as the boundary tensors.
|
|
|
|
|
|
|
|
|
|
The output of the 1-arg function that takes the `step`
|
|
|
|
|
is `values[0]` when `step <= boundaries[0]`,
|
|
|
|
|
`values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
|
|
|
|
|
and `values[-1]` when `step > boundaries[-1]`.
|
|
|
|
|
|
|
|
|
|
Raises:
|
|
|
|
|
ValueError: if the number of elements in the lists does not match.
|
|
|
|
|
"""
|
|
|
|
@@ -265,7 +267,75 @@ class PiecewiseConstantDecay(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.optimizers.schedules.PolynomialDecay")
|
|
|
|
|
class PolynomialDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses a polynomial decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses a polynomial decay schedule.
|
|
|
|
|
|
|
|
|
|
It is commonly observed that a monotonically decreasing learning rate, whose
|
|
|
|
|
degree of change is carefully chosen, results in a better performing model.
|
|
|
|
|
This schedule applies a polynomial decay function to an optimizer step,
|
|
|
|
|
given a provided `initial_learning_rate`, to reach an `end_learning_rate`
|
|
|
|
|
in the given `decay_steps`.
|
|
|
|
|
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You
|
|
|
|
|
can just pass a TensorFlow variable that you increment at each training
|
|
|
|
|
step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning rate
|
|
|
|
|
when passed the current optimizer step. This can be useful for changing the
|
|
|
|
|
learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
return ((initial_learning_rate - end_learning_rate) *
|
|
|
|
|
(1 - step / decay_steps) ** (power)
|
|
|
|
|
) + end_learning_rate
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
If `cycle` is True then a multiple of `decay_steps` is used, the first one
|
|
|
|
|
that is bigger than `step`.
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
decay_steps = decay_steps * ceil(step / decay_steps)
|
|
|
|
|
return ((initial_learning_rate - end_learning_rate) *
|
|
|
|
|
(1 - step / decay_steps) ** (power)
|
|
|
|
|
) + end_learning_rate
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate.
|
|
|
|
|
Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
|
|
|
|
|
sqrt (i.e. power=0.5):
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
...
|
|
|
|
|
starter_learning_rate = 0.1
|
|
|
|
|
end_learning_rate = 0.01
|
|
|
|
|
decay_steps = 10000
|
|
|
|
|
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
|
|
|
|
|
starter_learning_rate,
|
|
|
|
|
decay_steps,
|
|
|
|
|
end_learning_rate,
|
|
|
|
|
power=0.5)
|
|
|
|
|
|
|
|
|
|
model.compile(optimizer=tf.keras.optimizers.SGD(
|
|
|
|
|
learning_rate=learning_rate_fn),
|
|
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
|
|
metrics=['accuracy'])
|
|
|
|
|
|
|
|
|
|
model.fit(data, labels, epochs=5)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
The learning rate schedule is also serializable and deserializable using
|
|
|
|
|
`tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -277,68 +347,6 @@ class PolynomialDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies a polynomial decay to the learning rate.
|
|
|
|
|
|
|
|
|
|
It is commonly observed that a monotonically decreasing learning rate, whose
|
|
|
|
|
degree of change is carefully chosen, results in a better performing model.
|
|
|
|
|
This schedule applies a polynomial decay function to an optimizer step,
|
|
|
|
|
given a provided `initial_learning_rate`, to reach an `end_learning_rate`
|
|
|
|
|
in the given `decay_steps`.
|
|
|
|
|
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You
|
|
|
|
|
can just pass a TensorFlow variable that you increment at each training
|
|
|
|
|
step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning rate
|
|
|
|
|
when passed the current optimizer step. This can be useful for changing the
|
|
|
|
|
learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
return ((initial_learning_rate - end_learning_rate) *
|
|
|
|
|
(1 - step / decay_steps) ** (power)
|
|
|
|
|
) + end_learning_rate
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
If `cycle` is True then a multiple of `decay_steps` is used, the first one
|
|
|
|
|
that is bigger than `step`.
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
decay_steps = decay_steps * ceil(step / decay_steps)
|
|
|
|
|
return ((initial_learning_rate - end_learning_rate) *
|
|
|
|
|
(1 - step / decay_steps) ** (power)
|
|
|
|
|
) + end_learning_rate
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate.
|
|
|
|
|
Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
|
|
|
|
|
sqrt (i.e. power=0.5):
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
...
|
|
|
|
|
starter_learning_rate = 0.1
|
|
|
|
|
end_learning_rate = 0.01
|
|
|
|
|
decay_steps = 10000
|
|
|
|
|
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
|
|
|
|
|
starter_learning_rate,
|
|
|
|
|
decay_steps,
|
|
|
|
|
end_learning_rate,
|
|
|
|
|
power=0.5)
|
|
|
|
|
|
|
|
|
|
model.compile(optimizer=tf.keras.optimizers.SGD(
|
|
|
|
|
learning_rate=learning_rate_fn),
|
|
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
|
|
metrics=['accuracy'])
|
|
|
|
|
|
|
|
|
|
model.fit(data, labels, epochs=5)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
The learning rate schedule is also serializable and deserializable using
|
|
|
|
|
`tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
|
|
|
|
|
Python number. The initial learning rate.
|
|
|
|
@@ -351,11 +359,6 @@ class PolynomialDecay(LearningRateSchedule):
|
|
|
|
|
cycle: A boolean, whether or not it should cycle beyond decay_steps.
|
|
|
|
|
name: String. Optional name of the operation. Defaults to
|
|
|
|
|
'PolynomialDecay'.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(PolynomialDecay, self).__init__()
|
|
|
|
|
|
|
|
|
@@ -408,7 +411,56 @@ class PolynomialDecay(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.optimizers.schedules.InverseTimeDecay")
|
|
|
|
|
class InverseTimeDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses an inverse time decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses an inverse time decay schedule.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies the inverse decay function
|
|
|
|
|
to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
return initial_learning_rate / (1 + decay_rate * step / decay_steps)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
or, if `staircase` is `True`, as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
return initial_learning_rate / (1 + decay_rate * floor(step / decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate.
|
|
|
|
|
Example: Fit a Keras model while decaying the learning rate as 1/t with a rate of 0.5:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
...
|
|
|
|
|
initial_learning_rate = 0.1
|
|
|
|
|
decay_steps = 1.0
|
|
|
|
|
decay_rate = 0.5
|
|
|
|
|
learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
|
|
|
|
|
initial_learning_rate, decay_steps, decay_rate)
|
|
|
|
|
|
|
|
|
|
model.compile(optimizer=tf.keras.optimizers.SGD(
|
|
|
|
|
learning_rate=learning_rate_fn),
|
|
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
|
|
metrics=['accuracy'])
|
|
|
|
|
|
|
|
|
|
model.fit(data, labels, epochs=5)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -419,49 +471,6 @@ class InverseTimeDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies inverse time decay to the initial learning rate.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies the inverse decay function
|
|
|
|
|
to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
return initial_learning_rate / (1 + decay_rate * step / decay_steps)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
or, if `staircase` is `True`, as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
return initial_learning_rate / (1 + decay_rate * floor(step / decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate.
|
|
|
|
|
Example: Fit a Keras model while decaying the learning rate as 1/t with a rate of 0.5:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
...
|
|
|
|
|
initial_learning_rate = 0.1
|
|
|
|
|
decay_steps = 1.0
|
|
|
|
|
decay_rate = 0.5
|
|
|
|
|
learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
|
|
|
|
|
initial_learning_rate, decay_steps, decay_rate)
|
|
|
|
|
|
|
|
|
|
model.compile(optimizer=tf.keras.optimizers.SGD(
|
|
|
|
|
learning_rate=learning_rate_fn),
|
|
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
|
|
metrics=['accuracy'])
|
|
|
|
|
|
|
|
|
|
model.fit(data, labels, epochs=5)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
|
|
|
|
|
Python number. The initial learning rate.
|
|
|
|
@@ -471,11 +480,6 @@ class InverseTimeDecay(LearningRateSchedule):
|
|
|
|
|
continuous, fashion.
|
|
|
|
|
name: String. Optional name of the operation. Defaults to
|
|
|
|
|
'InverseTimeDecay'.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(InverseTimeDecay, self).__init__()
|
|
|
|
|
|
|
|
|
@@ -513,7 +517,47 @@ class InverseTimeDecay(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.experimental.CosineDecay")
|
|
|
|
|
class CosineDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses a cosine decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses a cosine decay schedule.
|
|
|
|
|
|
|
|
|
|
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a cosine decay function
|
|
|
|
|
to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
|
|
|
|
|
decayed = (1 - alpha) * cosine_decay + alpha
|
|
|
|
|
return initial_learning_rate * decayed
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = tf.keras.experimental.CosineDecay(
|
|
|
|
|
initial_learning_rate, decay_steps)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -523,40 +567,6 @@ class CosineDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies cosine decay to the learning rate.
|
|
|
|
|
|
|
|
|
|
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a cosine decay function
|
|
|
|
|
to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
|
|
|
|
|
decayed = (1 - alpha) * cosine_decay + alpha
|
|
|
|
|
return initial_learning_rate * decayed
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = tf.keras.experimental.CosineDecay(
|
|
|
|
|
initial_learning_rate, decay_steps)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` Tensor or a
|
|
|
|
|
Python number. The initial learning rate.
|
|
|
|
@@ -565,10 +575,6 @@ class CosineDecay(LearningRateSchedule):
|
|
|
|
|
alpha: A scalar `float32` or `float64` Tensor or a Python number.
|
|
|
|
|
Minimum learning rate value as a fraction of initial_learning_rate.
|
|
|
|
|
name: String. Optional name of the operation. Defaults to 'CosineDecay'.
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(CosineDecay, self).__init__()
|
|
|
|
|
|
|
|
|
@@ -604,7 +610,45 @@ class CosineDecay(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.experimental.CosineDecayRestarts")
|
|
|
|
|
class CosineDecayRestarts(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses a cosine decay schedule with restarts."""
|
|
|
|
|
"""A LearningRateSchedule that uses a cosine decay schedule with restarts.
|
|
|
|
|
|
|
|
|
|
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a cosine decay function with
|
|
|
|
|
restarts to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
|
|
|
|
|
The learning rate multiplier first decays
|
|
|
|
|
from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
|
|
|
|
|
restart is performed. Each new warm restart runs for `t_mul` times more
|
|
|
|
|
steps and with `m_mul` times smaller initial learning rate.
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
first_decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = (
|
|
|
|
|
tf.keras.experimental.CosineDecayRestarts(
|
|
|
|
|
initial_learning_rate,
|
|
|
|
|
first_decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -616,38 +660,6 @@ class CosineDecayRestarts(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies cosine decay with restarts to the learning rate.
|
|
|
|
|
|
|
|
|
|
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a cosine decay function with
|
|
|
|
|
restarts to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
|
|
|
|
|
The learning rate multiplier first decays
|
|
|
|
|
from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
|
|
|
|
|
restart is performed. Each new warm restart runs for `t_mul` times more
|
|
|
|
|
steps and with `m_mul` times smaller initial learning rate.
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
first_decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = (
|
|
|
|
|
tf.keras.experimental.CosineDecayRestarts(
|
|
|
|
|
initial_learning_rate,
|
|
|
|
|
first_decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
|
|
|
|
|
number. The initial learning rate.
|
|
|
|
@@ -660,10 +672,6 @@ class CosineDecayRestarts(LearningRateSchedule):
|
|
|
|
|
alpha: A scalar `float32` or `float64` Tensor or a Python number.
|
|
|
|
|
Minimum learning rate value as a fraction of the initial_learning_rate.
|
|
|
|
|
name: String. Optional name of the operation. Defaults to 'SGDRDecay'.
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(CosineDecayRestarts, self).__init__()
|
|
|
|
|
|
|
|
|
@@ -728,7 +736,57 @@ class CosineDecayRestarts(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.experimental.LinearCosineDecay")
|
|
|
|
|
class LinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses a linear cosine decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses a linear cosine decay schedule.
|
|
|
|
|
|
|
|
|
|
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
|
|
|
|
|
https://arxiv.org/abs/1709.07417
|
|
|
|
|
|
|
|
|
|
For the idea of warm starts here controlled by `num_periods`,
|
|
|
|
|
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
Note that linear cosine decay is more aggressive than cosine decay and
|
|
|
|
|
larger initial learning rates can typically be used.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a linear cosine decay
|
|
|
|
|
function to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
linear_decay = (decay_steps - step) / decay_steps
|
|
|
|
|
cosine_decay = 0.5 * (
|
|
|
|
|
1 + cos(pi * 2 * num_periods * step / decay_steps))
|
|
|
|
|
decayed = (alpha + linear_decay) * cosine_decay + beta
|
|
|
|
|
return initial_learning_rate * decayed
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = (
|
|
|
|
|
tf.keras.experimental.LinearCosineDecay(
|
|
|
|
|
initial_learning_rate, decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -740,50 +798,6 @@ class LinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies linear cosine decay to the learning rate.
|
|
|
|
|
|
|
|
|
|
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
|
|
|
|
|
https://arxiv.org/abs/1709.07417
|
|
|
|
|
|
|
|
|
|
For the idea of warm starts here controlled by `num_periods`,
|
|
|
|
|
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
Note that linear cosine decay is more aggressive than cosine decay and
|
|
|
|
|
larger initial learning rates can typically be used.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a linear cosine decay
|
|
|
|
|
function to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
linear_decay = (decay_steps - step) / decay_steps
|
|
|
|
|
cosine_decay = 0.5 * (
|
|
|
|
|
1 + cos(pi * 2 * num_periods * step / decay_steps))
|
|
|
|
|
decayed = (alpha + linear_decay) * cosine_decay + beta
|
|
|
|
|
return initial_learning_rate * decayed
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = (
|
|
|
|
|
tf.keras.experimental.LinearCosineDecay(
|
|
|
|
|
initial_learning_rate, decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
|
|
|
|
|
number. The initial learning rate.
|
|
|
|
@@ -795,10 +809,6 @@ class LinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
beta: See computation above.
|
|
|
|
|
name: String. Optional name of the operation. Defaults to
|
|
|
|
|
'LinearCosineDecay'.
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(LinearCosineDecay, self).__init__()
|
|
|
|
|
|
|
|
|
@@ -844,7 +854,59 @@ class LinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
|
|
|
|
|
@keras_export("keras.experimental.NoisyLinearCosineDecay")
|
|
|
|
|
class NoisyLinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
"""A LearningRateSchedule that uses a noisy linear cosine decay schedule."""
|
|
|
|
|
"""A LearningRateSchedule that uses a noisy linear cosine decay schedule.
|
|
|
|
|
|
|
|
|
|
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
|
|
|
|
|
https://arxiv.org/abs/1709.07417
|
|
|
|
|
|
|
|
|
|
For the idea of warm starts here controlled by `num_periods`,
|
|
|
|
|
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
Note that linear cosine decay is more aggressive than cosine decay and
|
|
|
|
|
larger initial learning rates can typically be used.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a noisy linear cosine decay
|
|
|
|
|
function to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
linear_decay = (decay_steps - step) / decay_steps
|
|
|
|
|
cosine_decay = 0.5 * (
|
|
|
|
|
1 + cos(pi * 2 * num_periods * step / decay_steps))
|
|
|
|
|
decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
|
|
|
|
|
return initial_learning_rate * decayed
|
|
|
|
|
```
|
|
|
|
|
where `eps_t` is zero-centered Gaussian noise with variance
|
|
|
|
|
`initial_variance / (1 + global_step) ** variance_decay`.
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = (
|
|
|
|
|
tf.keras.experimental.NoisyLinearCosineDecay(
|
|
|
|
|
initial_learning_rate, decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
@@ -858,52 +920,6 @@ class NoisyLinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
name=None):
|
|
|
|
|
"""Applies noisy linear cosine decay to the learning rate.
|
|
|
|
|
|
|
|
|
|
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
|
|
|
|
|
https://arxiv.org/abs/1709.07417
|
|
|
|
|
|
|
|
|
|
For the idea of warm starts here controlled by `num_periods`,
|
|
|
|
|
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
|
|
|
|
|
with Warm Restarts. https://arxiv.org/abs/1608.03983
|
|
|
|
|
|
|
|
|
|
Note that linear cosine decay is more aggressive than cosine decay and
|
|
|
|
|
larger initial learning rates can typically be used.
|
|
|
|
|
|
|
|
|
|
When training a model, it is often recommended to lower the learning rate as
|
|
|
|
|
the training progresses. This schedule applies a noisy linear cosine decay
|
|
|
|
|
function to an optimizer step, given a provided initial learning rate.
|
|
|
|
|
It requires a `step` value to compute the decayed learning rate. You can
|
|
|
|
|
just pass a TensorFlow variable that you increment at each training step.
|
|
|
|
|
|
|
|
|
|
The schedule is a 1-arg callable that produces a decayed learning
|
|
|
|
|
rate when passed the current optimizer step. This can be useful for changing
|
|
|
|
|
the learning rate value across different invocations of optimizer functions.
|
|
|
|
|
It is computed as:
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
def decayed_learning_rate(step):
|
|
|
|
|
step = min(step, decay_steps)
|
|
|
|
|
linear_decay = (decay_steps - step) / decay_steps
|
|
|
|
|
cosine_decay = 0.5 * (
|
|
|
|
|
1 + cos(pi * 2 * num_periods * step / decay_steps))
|
|
|
|
|
decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
|
|
|
|
|
return initial_learning_rate * decayed
|
|
|
|
|
```
|
|
|
|
|
where `eps_t` is zero-centered Gaussian noise with variance
|
|
|
|
|
`initial_variance / (1 + global_step) ** variance_decay`.
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
```python
|
|
|
|
|
decay_steps = 1000
|
|
|
|
|
lr_decayed_fn = (
|
|
|
|
|
tf.keras.experimental.NoisyLinearCosineDecay(
|
|
|
|
|
initial_learning_rate, decay_steps))
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
|
|
|
|
|
as the learning rate. The learning rate schedule is also serializable and
|
|
|
|
|
deserializable using `tf.keras.optimizers.schedules.serialize` and
|
|
|
|
|
`tf.keras.optimizers.schedules.deserialize`.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
|
|
|
|
|
number. The initial learning rate.
|
|
|
|
@@ -917,10 +933,6 @@ class NoisyLinearCosineDecay(LearningRateSchedule):
|
|
|
|
|
beta: See computation above.
|
|
|
|
|
name: String. Optional name of the operation. Defaults to
|
|
|
|
|
'NoisyLinearCosineDecay'.
|
|
|
|
|
Returns:
|
|
|
|
|
A 1-arg callable learning rate schedule that takes the current optimizer
|
|
|
|
|
step and outputs the decayed learning rate, a scalar `Tensor` of the same
|
|
|
|
|
type as `initial_learning_rate`.
|
|
|
|
|
"""
|
|
|
|
|
super(NoisyLinearCosineDecay, self).__init__()
|
|
|
|
|
|
|
|
|
|