Move V1 optimizer code to a separate file optimizer_v1.py from the generic utils in optimizers.py.
PiperOrigin-RevId: 333495430 Change-Id: I6bf730dc507f067f79f51b7a5952b50549c7c5a4
This commit is contained in:
parent
e798106686
commit
a6b7e4b94c
@ -287,6 +287,7 @@ py_library(
|
||||
py_library(
|
||||
name = "optimizers",
|
||||
srcs = [
|
||||
"optimizer_v1.py",
|
||||
"optimizers.py",
|
||||
],
|
||||
srcs_version = "PY2AND3",
|
||||
|
@ -36,7 +36,7 @@ from tensorflow.python.framework import tensor_util
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import callbacks
|
||||
from tensorflow.python.keras import metrics as metrics_module
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils
|
||||
from tensorflow.python.keras.engine import training_utils
|
||||
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
|
||||
@ -779,7 +779,7 @@ def _clone_and_build_model(model, mode, inputs=None, targets=None):
|
||||
cloned_model = models.clone_model(model, input_tensors=inputs)
|
||||
|
||||
# Compile and build model.
|
||||
if isinstance(model.optimizer, optimizers.TFOptimizer):
|
||||
if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
|
||||
optimizer = model.optimizer
|
||||
else:
|
||||
optimizer_config = model.optimizer.get_config()
|
||||
|
@ -42,7 +42,7 @@ from tensorflow.python.keras import backend
|
||||
from tensorflow.python.keras import callbacks
|
||||
from tensorflow.python.keras import metrics as metrics_module
|
||||
from tensorflow.python.keras import models
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.distribute import multi_worker_testing_utils
|
||||
from tensorflow.python.platform import test
|
||||
from tensorflow.python.util import nest
|
||||
@ -71,11 +71,11 @@ def _clone_and_build_model(model, strategy):
|
||||
cloned_model = models.clone_model(model)
|
||||
|
||||
# Compile and build model.
|
||||
if isinstance(model.optimizer, optimizers.TFOptimizer):
|
||||
if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
|
||||
optimizer = model.optimizer
|
||||
# TODO(yuefengz): figure out why the optimizer here is still a
|
||||
# TFOptimizer.
|
||||
while isinstance(optimizer, optimizers.TFOptimizer):
|
||||
while isinstance(optimizer, optimizer_v1.TFOptimizer):
|
||||
optimizer = optimizer.optimizer
|
||||
optimizer = copy.deepcopy(optimizer)
|
||||
else:
|
||||
|
@ -39,6 +39,7 @@ from tensorflow.python.framework import sparse_tensor
|
||||
from tensorflow.python.framework import tensor_shape
|
||||
from tensorflow.python.keras import backend
|
||||
from tensorflow.python.keras import callbacks as callbacks_module
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils
|
||||
from tensorflow.python.keras.engine import base_layer
|
||||
@ -2463,7 +2464,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector):
|
||||
def _validate_compile(self, optimizer, metrics, **kwargs):
|
||||
"""Performs validation checks for the default `compile`."""
|
||||
if any(
|
||||
isinstance(opt, optimizers.Optimizer)
|
||||
isinstance(opt, optimizer_v1.Optimizer)
|
||||
for opt in nest.flatten(optimizer)):
|
||||
raise ValueError(
|
||||
'`tf.compat.v1.keras` Optimizer (', optimizer, ') is '
|
||||
|
@ -40,6 +40,7 @@ from tensorflow.python.framework import type_spec
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import losses
|
||||
from tensorflow.python.keras import metrics as metrics_module
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras.distribute import distributed_training_utils
|
||||
from tensorflow.python.keras.distribute import distributed_training_utils_v1
|
||||
@ -322,8 +323,8 @@ class Model(training_lib.Model):
|
||||
|
||||
self._set_optimizer(optimizer)
|
||||
is_any_keras_optimizer_v1 = any(
|
||||
(isinstance(opt, optimizers.Optimizer)
|
||||
and not isinstance(opt, optimizers.TFOptimizer)
|
||||
(isinstance(opt, optimizer_v1.Optimizer)
|
||||
and not isinstance(opt, optimizer_v1.TFOptimizer)
|
||||
) for opt in nest.flatten(self.optimizer))
|
||||
|
||||
if is_any_keras_optimizer_v1 and ops.executing_eagerly_outside_functions():
|
||||
|
@ -37,7 +37,7 @@ from tensorflow.python.keras import combinations
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import layers
|
||||
from tensorflow.python.keras import models
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.engine import base_layer
|
||||
from tensorflow.python.keras.engine import base_layer_utils
|
||||
@ -854,7 +854,7 @@ class KerasModelTest(keras_parameterized.TestCase):
|
||||
else:
|
||||
error_msg = 'optimizer" must be an instance of '
|
||||
with self.assertRaisesRegex(ValueError, error_msg):
|
||||
model.compile(optimizers.SGD(1.), 'mse')
|
||||
model.compile(optimizer_v1.SGD(1.), 'mse')
|
||||
|
||||
@combinations.generate(combinations.combine(mode=['graph', 'eager']))
|
||||
def test_functional_model_loss_dtype(self):
|
||||
|
@ -22,7 +22,7 @@ from __future__ import print_function
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import metrics as metrics_module
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.engine import functional
|
||||
from tensorflow.python.keras.engine import sequential
|
||||
from tensorflow.python.keras.engine import training
|
||||
@ -682,8 +682,8 @@ def clone_and_build_model(
|
||||
clone._set_inputs(input_tensors)
|
||||
|
||||
if compile_clone:
|
||||
if isinstance(orig_optimizer, optimizers.TFOptimizer):
|
||||
optimizer = optimizers.TFOptimizer(
|
||||
if isinstance(orig_optimizer, optimizer_v1.TFOptimizer):
|
||||
optimizer = optimizer_v1.TFOptimizer(
|
||||
orig_optimizer.optimizer, optimizer_iterations)
|
||||
K.track_tf_optimizer(optimizer)
|
||||
else:
|
||||
|
@ -32,6 +32,7 @@ from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import metrics
|
||||
from tensorflow.python.keras import models
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
@ -420,10 +421,9 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase):
|
||||
"""Assert that two models have the same compile parameters."""
|
||||
|
||||
self.assertEqual('mse', model.loss)
|
||||
self.assertTrue(
|
||||
isinstance(model.optimizer,
|
||||
(keras.optimizers.RMSprop,
|
||||
keras.optimizer_v2.rmsprop.RMSprop)))
|
||||
self.assertIsInstance(
|
||||
model.optimizer,
|
||||
(optimizer_v1.RMSprop, keras.optimizer_v2.rmsprop.RMSprop))
|
||||
|
||||
def _clone_and_build_test_helper(self, model, model_type):
|
||||
inp = np.random.random((10, 4))
|
||||
|
839
tensorflow/python/keras/optimizer_v1.py
Normal file
839
tensorflow/python/keras/optimizer_v1.py
Normal file
@ -0,0 +1,839 @@
|
||||
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# pylint: disable=invalid-name
|
||||
# pylint: disable=g-classes-have-attributes
|
||||
"""Legacy v1 optimizer classes.
|
||||
|
||||
For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from six.moves import zip # pylint: disable=redefined-builtin
|
||||
|
||||
from tensorflow.python.distribute import distribution_strategy_context
|
||||
from tensorflow.python.eager import backprop
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.ops import clip_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import state_ops
|
||||
from tensorflow.python.training import training_util
|
||||
from tensorflow.python.training.tracking import base as trackable
|
||||
from tensorflow.python.util import nest
|
||||
|
||||
|
||||
class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  def __init__(self, **kwargs):
    """Validates and stores the clipping options.

    Arguments:
      **kwargs: only `clipnorm` and `clipvalue` are accepted; each must be
        >= 0.

    Raises:
      TypeError: if an unexpected keyword argument is passed.
      ValueError: if a clipping value is negative.
    """
    allowed_kwargs = {'clipnorm', 'clipvalue'}
    for k in kwargs:
      if k not in allowed_kwargs:
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(k))
      # checks that clipnorm >= 0 and clipvalue >= 0
      if kwargs[k] < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k]))
    # The clipping options become plain attributes; `get_gradients` and
    # `get_config` detect them with `hasattr`.
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects that will be minimized
        using this optimizer.

    Returns:
      Specific weight values that are used in `get_updates`
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    """Returns the update ops for `params`; must be overridden by subclasses."""
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors, clipped by `clipnorm` and/or `clipvalue`
        when those attributes were set at construction time.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
          function not implemented).
    """
    grads = K.gradients(loss, params)
    if any(g is None for g in grads):
      raise ValueError('An operation has `None` for gradient. '
                       'Please make sure that all of your ops have a '
                       'gradient defined (i.e. are differentiable). '
                       'Common ops without gradient: '
                       'K.argmax, K.round, K.eval.')
    # Norm clipping is applied first, then value clipping.
    if hasattr(self, 'clipnorm'):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, 'clipvalue'):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Arguments:
        weights: a list of Numpy arrays. The number of arrays and their
          shapes must match those of the optimizer's weights
          (i.e. it should match the output of `get_weights`).

    Raises:
        ValueError: in case of incompatible weight shapes.
    """
    params = self.weights
    if len(params) != len(weights):
      raise ValueError('Length of the specified weight list (' +
                       str(len(weights)) +
                       ') does not match the number of weights '
                       'of the optimizer (' + str(len(params)) + ')')
    weight_value_tuples = []
    # Current values are fetched only to compare shapes with the new ones.
    param_values = K.batch_get_value(params)
    for pv, p, w in zip(param_values, params, weights):
      if pv.shape != w.shape:
        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
                         ' not compatible with '
                         'provided weight shape ' + str(w.shape))
      weight_value_tuples.append((p, w))
    K.batch_set_value(weight_value_tuples)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
        A list of numpy arrays.
    """
    return K.batch_get_value(self.weights)

  def get_config(self):
    """Returns a dict holding whichever clipping options were set."""
    config = {}
    if hasattr(self, 'clipnorm'):
      config['clipnorm'] = self.clipnorm
    if hasattr(self, 'clipvalue'):
      config['clipvalue'] = self.clipvalue
    return config

  @classmethod
  def from_config(cls, config):
    """Instantiates the optimizer by passing `config` as keyword arguments."""
    return cls(**config)
|
||||
|
||||
|
||||
class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Arguments:
      lr: float >= 0. Learning rate.
      momentum: float >= 0. Parameter that accelerates SGD in the relevant
        direction and dampens oscillations.
      decay: float >= 0. Learning rate decay over each update.
      nesterov: boolean. Whether to apply Nesterov momentum.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    # NOTE(review): the extent of this name scope was reconstructed from a
    # de-indented listing; it is assumed to cover only the backend
    # variables -- confirm against upstream.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.momentum = K.variable(momentum, name='momentum')
      self.decay = K.variable(decay, name='decay')
    # Plain Python copy of the decay, used to decide at graph-construction
    # time whether the decay term is needed at all.
    self.initial_decay = decay
    self.nesterov = nesterov

  def _create_all_weights(self, params):
    """Creates one zero-initialized moment per parameter.

    Sets `self.weights` to `[iterations] + moments` and returns `moments`.
    """
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    return moments

  def get_updates(self, loss, params):
    """Builds the list of update ops for one SGD step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        The list of update ops (also stored on `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))
    # momentum
    moments = self._create_all_weights(params)
    for p, g, m in zip(params, grads, moments):
      v = self.momentum * m - lr * g  # velocity
      self.updates.append(state_ops.assign(m, v))

      if self.nesterov:
        new_p = p + self.momentum * v - lr * g
      else:
        new_p = p + v

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base clipping config."""
    config = {
        'lr': float(K.get_value(self.lr)),
        'momentum': float(K.get_value(self.momentum)),
        'decay': float(K.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    base_config = super(SGD, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
|
||||
class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Arguments:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    # NOTE(review): the extent of this name scope was reconstructed from a
    # de-indented listing; it is assumed to cover only the backend
    # variables -- confirm against upstream.
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.rho = K.variable(rho, name='rho')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero accumulator per parameter, matching its dtype.

    Sets `self.weights` to the accumulators (the iteration counter is not
    included) and returns them.
    """
    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    """Builds the list of update ops for one RMSProp step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        The list of update ops (also stored on `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base clipping config."""
    config = {
        'lr': float(K.get_value(self.lr)),
        'rho': float(K.get_value(self.rho)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(RMSprop, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
|
||||
class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate.
      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.

  References:
      - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    # NOTE(review): the extent of this name scope was reconstructed from a
    # de-indented listing; it is assumed to cover only the backend
    # variables -- confirm against upstream.
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero accumulator per parameter.

    Sets `self.weights` to the accumulators and returns them.
    """
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    """Builds the list of update ops for one Adagrad step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        The list of update ops (also stored on `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)

    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      new_a = a + math_ops.square(g)  # update accumulator
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base clipping config."""
    config = {
        'lr': float(K.get_value(self.lr)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adagrad, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
|
||||
class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate, defaults to 1.
          It is recommended to leave it at the default value.
      rho: float >= 0. Adadelta decay factor, corresponding to fraction of
          gradient to keep at each time step.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Initial learning rate decay.

  References:
      - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    # NOTE(review): the extent of this name scope was reconstructed from a
    # de-indented listing; it is assumed to cover only the backend
    # variables -- confirm against upstream.
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    # Unlike `lr` and `decay`, `rho` is kept as a plain Python value.
    self.rho = rho
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates gradient and delta accumulators, one pair per parameter.

    Sets `self.weights` to `accumulators + delta_accumulators` and returns
    the two lists as a tuple.
    """
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    return accumulators, delta_accumulators

  def get_updates(self, loss, params):
    """Builds the list of update ops for one Adadelta step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        The list of update ops (also stored on `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    accumulators, delta_accumulators = self._create_all_weights(params)

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))

      # use the new accumulator and the *old* delta_accumulator
      update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
      new_p = p - lr * update

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))

      # update delta_accumulator
      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
      self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base clipping config."""
    config = {
        'lr': float(K.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adadelta, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
|
||||
class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    # NOTE(review): the extent of this name scope was reconstructed from a
    # de-indented listing; it is assumed to cover only the backend
    # variables -- confirm against upstream.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def _create_all_weights(self, params):
    """Creates the first/second moment slots and the AMSGrad slots.

    Sets `self.weights` to `[iterations] + ms + vs + vhats` and returns
    `(ms, vs, vhats)`.
    """
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
      vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
      # Size-1 placeholders keep the weight list the same length whether or
      # not AMSGrad is enabled; `get_updates` never assigns to them.
      vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats
    return ms, vs, vhats

  def get_updates(self, loss, params):
    """Builds the list of update ops for one Adam step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        The list of update ops (also stored on `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    # NOTE(review): only the read of `iterations` is assumed to sit under
    # the control dependency (block extent reconstructed) -- confirm.
    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr * (
        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms, vs, vhats = self._create_all_weights(params)
    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
      if self.amsgrad:
        vhat_t = math_ops.maximum(vhat, v_t)
        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base clipping config."""
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    base_config = super(Adam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
|
||||
class Adamax(Optimizer):
  """Adamax optimizer from Adam paper's Section 7.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    # NOTE(review): the extent of this name scope was reconstructed from a
    # de-indented listing; it is assumed to cover only the backend
    # variables -- confirm against upstream.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates the first-moment and infinity-norm slots.

    Sets `self.weights` to `[iterations] + ms + us` and returns `(ms, us)`.
    """
    shapes = [K.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us

  def get_updates(self, loss, params):
    """Builds the list of update ops for one Adamax step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        The list of update ops (also stored on `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    # NOTE(review): only the read of `iterations` is assumed to sit under
    # the control dependency (block extent reconstructed) -- confirm.
    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    ms, us = self._create_all_weights(params)

    for p, g, m, u in zip(params, grads, ms, us):

      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
      p_t = p - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base clipping config."""
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adamax, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
|
||||
class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    schedule_decay: float. Multiplies the step count in the exponent of the
      momentum warm-up schedule computed in `get_updates`.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    scope_name = self.__class__.__name__
    with K.name_scope(scope_name):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.m_schedule = K.variable(1., name='m_schedule')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    """Creates the Nadam slot variables and registers them as weights.

    Args:
      params: list or tuple of variables that will be updated by this
        optimizer.

    Returns:
      A `(ms, vs)` tuple of lists with one zero-initialized accumulator per
      entry of `params` each.
    """
    param_shapes = [K.int_shape(param) for param in params]
    first_moments = [K.zeros(param_shape) for param_shape in param_shapes]
    second_moments = [K.zeros(param_shape) for param_shape in param_shapes]

    self.weights = ([self.iterations, self.m_schedule] + first_moments +
                    second_moments)
    return first_moments, second_moments

  def get_updates(self, loss, params):
    """Builds the Nadam update operations for `params`.

    Args:
      loss: loss tensor whose gradients drive the update.
      params: list of variables to update.

    Returns:
      The list stored in `self.updates`. Its first entry is the
      `(variable, new_value)` pair for the momentum schedule; the remaining
      entries are assign ops for the accumulators and the parameters.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = []

    # `t` is read only after the iteration counter has been incremented.
    increment = state_ops.assign_add(self.iterations, 1)
    with ops.control_dependencies([increment]):
      t = math_ops.cast(self.iterations, K.floatx())

    def warmed_momentum(step):
      # Momentum coefficient of the warming schedule at the given step.
      return self.beta_1 * (
          1. - 0.5 *
          (math_ops.pow(K.cast_to_floatx(0.96), step * self.schedule_decay)))

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = warmed_momentum(t)
    momentum_cache_t_1 = warmed_momentum(t + 1)
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    first_moments, second_moments = self._create_all_weights(params)

    for param, grad, moment, velocity in zip(params, gradients, first_moments,
                                             second_moments):
      # the following equations given in [1]
      grad_prime = grad / (1. - m_schedule_new)
      new_moment = self.beta_1 * moment + (1. - self.beta_1) * grad
      new_moment_prime = new_moment / (1. - m_schedule_next)
      new_velocity = (
          self.beta_2 * velocity + (1. - self.beta_2) * math_ops.square(grad))
      new_velocity_prime = new_velocity / (1. - math_ops.pow(self.beta_2, t))
      moment_bar = ((1. - momentum_cache_t) * grad_prime +
                    momentum_cache_t_1 * new_moment_prime)

      self.updates.extend([
          state_ops.assign(moment, new_moment),
          state_ops.assign(velocity, new_velocity),
      ])

      new_param = param - self.lr * moment_bar / (
          K.sqrt(new_velocity_prime) + self.epsilon)

      # Apply the parameter's constraint, if it defines one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a Python dict.

    Returns:
      The base class configuration extended with `lr`, `beta_1` and `beta_2`
      (current variable values converted to floats), `epsilon` and
      `schedule_decay`.
    """
    own_config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    # Entries defined here take precedence over same-named base entries.
    merged = dict(super(Nadam, self).get_config())
    merged.update(own_config)
    return merged
|
||||
|
||||
|
||||
class TFOptimizer(Optimizer, trackable.Trackable):
  """Wrapper class for native TensorFlow optimizers."""

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    """Wraps `optimizer` and sets up the iteration counter.

    Args:
      optimizer: the native optimizer to wrap. It is tracked under the name
        `optimizer`.
      iterations: optional variable used as the step counter. When `None`, a
        new int64 variable initialized to 0 is created. The counter is
        tracked under the name `global_step`.
    """
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is None:
      with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
    else:
      self.iterations = iterations
    self._track_trackable(self.iterations, name='global_step')

  def _clip_gradients(self, grads):
    """Clip gradients according to the clipnorm and clipvalue attributes."""
    # TFOptimizer wrapper has no gradient clipping options.
    return grads

  def minimize(self, loss, var_list, grad_loss=None, tape=None):
    """Mimics the `OptimizerV2.minimize` API.

    Args:
      loss: a loss tensor, or a callable returning one. A callable is invoked
        under `tape`.
      var_list: variables to update, or a callable returning them. A callable
        is only invoked when `loss` is callable too, after `loss` has been
        evaluated.
      grad_loss: forwarded to `tape.gradient` as its third argument.
      tape: gradient tape used to compute the gradients. Required when `loss`
        is not callable; otherwise a new `GradientTape` is created if it is
        `None`.

    Raises:
      ValueError: if `loss` is not callable and `tape` is `None`.
    """
    if not callable(loss) and tape is None:
      raise ValueError('`tape` is required when a `Tensor` loss is passed.')
    tape = tape if tape is not None else backprop.GradientTape()

    if callable(loss):
      with tape:
        if not callable(var_list):
          tape.watch(var_list)
        loss = loss()
        if callable(var_list):
          var_list = var_list()

    var_list = nest.flatten(var_list)
    # Nothing is applied when there are no variables.
    if var_list:
      grads = tape.gradient(loss, var_list, grad_loss)
      grads_and_vars = list(zip(grads, var_list))
      self.apply_gradients(grads_and_vars)

  def apply_gradients(self, grads_and_vars):
    """Applies gradients through the wrapped optimizer.

    Args:
      grads_and_vars: (gradient, variable) pairs passed on unchanged;
        `self.iterations` is supplied as the `global_step`.
    """
    self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)

  def get_grads(self, loss, params):
    """Returns the wrapped optimizer's `compute_gradients(loss, params)`."""
    return self.optimizer.compute_gradients(loss, params)

  def get_updates(self, loss, params):
    """Builds the update operations for `params`.

    Args:
      loss: loss tensor to minimize.
      params: list of variables to update; may be empty.

    Returns:
      The list stored in `self.updates`. Outside of a distribution strategy
      and with empty `params` it only holds the increment of
      `self.iterations`; otherwise it holds the result of the wrapped
      optimizer's `apply_gradients`.
    """
    if distribution_strategy_context.has_strategy():
      self.updates = []

      if not params:
        # After the model vars have been created, the second call to get_updates
        # is called with params as an empty list. This ensures that we call
        # compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      else:
        grads = self.optimizer.compute_gradients(loss, params)
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)
    else:
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)

    self.updates.append(opt_update)
    return self.updates

  @property
  def weights(self):
    """Not supported by this wrapper; always raises `NotImplementedError`."""
    raise NotImplementedError

  def get_config(self):
    """Not supported by this wrapper; always raises `NotImplementedError`."""
    raise NotImplementedError

  # Declared as a classmethod like the base class `from_config`, so that
  # calling it on the class also raises `NotImplementedError` instead of
  # failing on a missing positional argument.
  @classmethod
  def from_config(cls, config):
    """Not supported by this wrapper; always raises `NotImplementedError`."""
    raise NotImplementedError
|
||||
|
||||
|
||||
# Aliases: lowercase module-level names bound to the optimizer classes.

sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
|
@ -26,7 +26,7 @@ from tensorflow.python.framework import constant_op
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import combinations
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.optimizer_v2 import adam
|
||||
from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
|
||||
from tensorflow.python.ops import array_ops
|
||||
@ -537,7 +537,7 @@ class AdamOptimizerTest(test.TestCase, parameterized.TestCase):
|
||||
self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
|
||||
|
||||
def testSetWeightsFromV1AdamWithoutMinimize(self):
|
||||
keras_v1_adam = optimizers.Adam()
|
||||
keras_v1_adam = optimizer_v1.Adam()
|
||||
keras_v2_adam = adam.Adam()
|
||||
keras_v2_adam.set_weights(keras_v1_adam.get_weights())
|
||||
keras_v1_iteration = keras_v1_adam.iterations
|
||||
|
@ -35,7 +35,7 @@ from tensorflow.python.keras import callbacks
|
||||
from tensorflow.python.keras import combinations
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import losses
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.engine import input_layer
|
||||
from tensorflow.python.keras.engine import sequential
|
||||
@ -739,42 +739,42 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase):
|
||||
rtol=1e-5, atol=1e-5)
|
||||
|
||||
def testAdadeltaCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 Adadelta."""
  opt_v1 = optimizer_v1.Adadelta(lr=0.01)
  opt_v2 = adadelta.Adadelta(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testAdagradCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 Adagrad."""
  opt_v1 = optimizer_v1.Adagrad(lr=0.01)
  opt_v2 = adagrad.Adagrad(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testAdamCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 Adam."""
  opt_v1 = optimizer_v1.Adam()
  opt_v2 = adam.Adam()
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testAdamaxCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 Adamax."""
  opt_v1 = optimizer_v1.Adamax(lr=0.01)
  opt_v2 = adamax.Adamax(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testNadamCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 Nadam."""
  opt_v1 = optimizer_v1.Nadam(lr=0.001)
  opt_v2 = nadam.Nadam(learning_rate=0.001)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testMomentumCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 SGD with momentum."""
  opt_v1 = optimizer_v1.SGD(lr=0.01, momentum=0.9)
  opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testRMSpropCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 RMSprop."""
  opt_v1 = optimizer_v1.RMSprop()
  opt_v2 = rmsprop.RMSprop()
  self._testOptimizersCompatibility(opt_v1, opt_v2)
|
||||
|
||||
def testSGDCompatibility(self):
  """Runs the shared compatibility check on V1 and V2 plain SGD."""
  opt_v1 = optimizer_v1.SGD(lr=0.01)
  opt_v2 = gradient_descent.SGD(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2, False)
|
||||
|
||||
@ -804,7 +804,7 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase):
|
||||
num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
|
||||
model_tf.set_weights(model_k_v2.get_weights())
|
||||
|
||||
opt_k_v1 = optimizers.SGD(momentum=0.9, nesterov=True)
|
||||
opt_k_v1 = optimizer_v1.SGD(momentum=0.9, nesterov=True)
|
||||
opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
|
||||
opt_tf = momentum.MomentumOptimizer(
|
||||
learning_rate=0.01, momentum=0.9, use_nesterov=True)
|
||||
@ -858,7 +858,7 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase):
|
||||
num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
|
||||
model_k_v2.set_weights(model_k_v1.get_weights())
|
||||
|
||||
opt_k_v1 = optimizers.Adam(amsgrad=True)
|
||||
opt_k_v1 = optimizer_v1.Adam(amsgrad=True)
|
||||
opt_k_v2 = adam.Adam(amsgrad=True)
|
||||
|
||||
model_k_v1.compile(
|
||||
|
@ -22,12 +22,10 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import six
|
||||
from six.moves import zip # pylint: disable=redefined-builtin
|
||||
|
||||
from tensorflow.python.distribute import distribution_strategy_context
|
||||
from tensorflow.python.eager import backprop
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras.optimizer_v1 import Optimizer
|
||||
from tensorflow.python.keras.optimizer_v1 import TFOptimizer
|
||||
from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
|
||||
from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
|
||||
from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
|
||||
@ -39,819 +37,10 @@ from tensorflow.python.keras.optimizer_v2 import optimizer_v2
|
||||
from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
|
||||
from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
|
||||
from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
|
||||
from tensorflow.python.ops import clip_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import state_ops
|
||||
from tensorflow.python.training import optimizer as tf_optimizer_module
|
||||
from tensorflow.python.training import training_util
|
||||
from tensorflow.python.training.tracking import base as trackable
|
||||
from tensorflow.python.util import nest
|
||||
from tensorflow.python.util.tf_export import keras_export
|
||||
|
||||
|
||||
class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def __init__(self, **kwargs):
    for name, value in kwargs.items():
      if name not in ('clipnorm', 'clipvalue'):
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(name))
      # Both clipping thresholds must be non-negative.
      if value < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(name, value))
    # Clipping options become instance attributes only when they were passed.
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects that will be minimized
        using this optimizer.

    Returns:
      Specific weight values that are used in `get_updates`
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    """Must be implemented by subclasses; raises `NotImplementedError`."""
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    The gradients are clipped by norm and/or by value when the `clipnorm`
    and/or `clipvalue` attributes are set on the optimizer.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
          function not implemented).
    """
    gradients = K.gradients(loss, params)
    for gradient in gradients:
      if gradient is None:
        raise ValueError('An operation has `None` for gradient. '
                         'Please make sure that all of your ops have a '
                         'gradient defined (i.e. are differentiable). '
                         'Common ops without gradient: '
                         'K.argmax, K.round, K.eval.')
    if hasattr(self, 'clipnorm'):
      gradients = [
          clip_ops.clip_by_norm(gradient, self.clipnorm)
          for gradient in gradients
      ]
    if hasattr(self, 'clipvalue'):
      gradients = [
          clip_ops.clip_by_value(gradient, -self.clipvalue, self.clipvalue)
          for gradient in gradients
      ]
    return gradients

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Arguments:
        weights: a list of Numpy arrays. The number of arrays and their shape
          must match number of the dimensions of the weights of the optimizer
          (i.e. it should match the output of `get_weights`).

    Raises:
        ValueError: in case of incompatible weight shapes.
    """
    own_weights = self.weights
    if len(own_weights) != len(weights):
      raise ValueError(
          'Length of the specified weight list ({}) does not match the '
          'number of weights of the optimizer ({})'.format(
              len(weights), len(own_weights)))
    assignments = []
    current_values = K.batch_get_value(own_weights)
    # Every shape is validated before any value is written.
    for current, variable, new_value in zip(current_values, own_weights,
                                            weights):
      if current.shape != new_value.shape:
        raise ValueError(
            'Optimizer weight shape {} not compatible with provided weight '
            'shape {}'.format(current.shape, new_value.shape))
      assignments.append((variable, new_value))
    K.batch_set_value(assignments)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
        A list of numpy arrays.
    """
    return K.batch_get_value(self.weights)

  def get_config(self):
    """Returns the clipping options that are set on this optimizer.

    Returns:
        A dict with a `clipnorm` and/or `clipvalue` entry for each of these
        attributes that exists on the instance; empty otherwise.
    """
    return {
        option: getattr(self, option)
        for option in ('clipnorm', 'clipvalue')
        if hasattr(self, option)
    }

  @classmethod
  def from_config(cls, config):
    """Creates an optimizer by passing `config` as keyword arguments."""
    return cls(**config)
|
||||
|
||||
|
||||
class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Arguments:
      lr: float >= 0. Learning rate.
      momentum: float >= 0. Parameter that accelerates SGD in the relevant
        direction and dampens oscillations.
      decay: float >= 0. Learning rate decay over each update.
      nesterov: boolean. Whether to apply Nesterov momentum.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    scope_name = self.__class__.__name__
    with K.name_scope(scope_name):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.momentum = K.variable(momentum, name='momentum')
      self.decay = K.variable(decay, name='decay')
    # The Python value decides whether decay ops are built in `get_updates`.
    self.initial_decay = decay
    self.nesterov = nesterov

  def _create_all_weights(self, params):
    """Creates one zero-initialized moment per parameter.

    Args:
      params: list or tuple of variables that will be updated by this
        optimizer.

    Returns:
      The list of moment variables, in the order of `params`.
    """
    velocities = [K.zeros(K.int_shape(param)) for param in params]
    self.weights = [self.iterations] + velocities
    return velocities

  def get_updates(self, loss, params):
    """Builds the SGD update operations for `params`.

    Args:
      loss: loss tensor whose gradients drive the update.
      params: list of variables to update.

    Returns:
      The list stored in `self.updates`: the iteration increment followed,
      for every parameter, by the assignment of its moment and of its new
      (optionally constrained) value.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    learning_rate = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      learning_rate = learning_rate * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * step))

    # momentum
    velocities = self._create_all_weights(params)
    for param, grad, moment in zip(params, gradients, velocities):
      velocity = self.momentum * moment - learning_rate * grad
      self.updates.append(state_ops.assign(moment, velocity))

      if self.nesterov:
        new_param = param + self.momentum * velocity - learning_rate * grad
      else:
        new_param = param + velocity

      # Apply the parameter's constraint, if it defines one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a Python dict.

    Returns:
      The base class configuration extended with `lr`, `momentum` and `decay`
      (current variable values converted to floats) and `nesterov`.
    """
    own_config = {
        'lr': float(K.get_value(self.lr)),
        'momentum': float(K.get_value(self.momentum)),
        'decay': float(K.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    # Entries defined here take precedence over same-named base entries.
    merged = dict(super(SGD, self).get_config())
    merged.update(own_config)
    return merged
|
||||
|
||||
|
||||
class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Arguments:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    scope_name = self.__class__.__name__
    with K.name_scope(scope_name):
      self.lr = K.variable(lr, name='lr')
      self.rho = K.variable(rho, name='rho')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # The Python value decides whether decay ops are built in `get_updates`.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero-initialized accumulator per parameter.

    Each accumulator has the shape and dtype of its parameter.

    Args:
      params: list or tuple of variables that will be updated by this
        optimizer.

    Returns:
      The list of accumulator variables, in the order of `params`.
    """
    squared_grad_averages = [
        K.zeros(K.int_shape(param), dtype=K.dtype(param)) for param in params
    ]
    self.weights = squared_grad_averages
    return squared_grad_averages

  def get_updates(self, loss, params):
    """Builds the RMSProp update operations for `params`.

    Args:
      loss: loss tensor whose gradients drive the update.
      params: list of variables to update.

    Returns:
      The list stored in `self.updates`: the iteration increment followed,
      for every parameter, by the assignment of its accumulator and of its
      new (optionally constrained) value.
    """
    gradients = self.get_gradients(loss, params)
    squared_grad_averages = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    learning_rate = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      learning_rate = learning_rate * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * step))

    for param, grad, average in zip(params, gradients, squared_grad_averages):
      # update accumulator
      new_average = self.rho * average + (1. - self.rho) * math_ops.square(grad)
      self.updates.append(state_ops.assign(average, new_average))
      new_param = param - learning_rate * grad / (
          K.sqrt(new_average) + self.epsilon)

      # Apply the parameter's constraint, if it defines one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a Python dict.

    Returns:
      The base class configuration extended with `lr`, `rho` and `decay`
      (current variable values converted to floats) and `epsilon`.
    """
    own_config = {
        'lr': float(K.get_value(self.lr)),
        'rho': float(K.get_value(self.rho)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    # Entries defined here take precedence over same-named base entries.
    merged = dict(super(RMSprop, self).get_config())
    merged.update(own_config)
    return merged
|
||||
|
||||
|
||||
class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate.
      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.

  References:
      - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    scope_name = self.__class__.__name__
    with K.name_scope(scope_name):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # The Python value decides whether decay ops are built in `get_updates`.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero-initialized accumulator per parameter.

    Args:
      params: list or tuple of variables that will be updated by this
        optimizer.

    Returns:
      The list of accumulator variables, in the order of `params`.
    """
    param_shapes = [K.int_shape(param) for param in params]
    squared_grad_sums = [K.zeros(param_shape) for param_shape in param_shapes]
    self.weights = squared_grad_sums
    return squared_grad_sums

  def get_updates(self, loss, params):
    """Builds the Adagrad update operations for `params`.

    Args:
      loss: loss tensor whose gradients drive the update.
      params: list of variables to update.

    Returns:
      The list stored in `self.updates`: the iteration increment followed,
      for every parameter, by the assignment of its accumulator and of its
      new (optionally constrained) value.
    """
    gradients = self.get_gradients(loss, params)
    squared_grad_sums = self._create_all_weights(params)

    self.updates = [state_ops.assign_add(self.iterations, 1)]

    learning_rate = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      learning_rate = learning_rate * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * step))

    for param, grad, grad_sum in zip(params, gradients, squared_grad_sums):
      new_grad_sum = grad_sum + math_ops.square(grad)  # update accumulator
      self.updates.append(state_ops.assign(grad_sum, new_grad_sum))
      new_param = param - learning_rate * grad / (
          K.sqrt(new_grad_sum) + self.epsilon)

      # Apply the parameter's constraint, if it defines one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a Python dict.

    Returns:
      The base class configuration extended with `lr` and `decay` (current
      variable values converted to floats) and `epsilon`.
    """
    own_config = {
        'lr': float(K.get_value(self.lr)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    # Entries defined here take precedence over same-named base entries.
    merged = dict(super(Adagrad, self).get_config())
    merged.update(own_config)
    return merged
|
||||
|
||||
|
||||
class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate, defaults to 1.
          It is recommended to leave it at the default value.
      rho: float >= 0. Adadelta decay factor, corresponding to fraction of
          gradient to keep at each time step.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Initial learning rate decay.

  References:
      - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    scope_name = self.__class__.__name__
    with K.name_scope(scope_name):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    # Unlike `lr` and `decay`, `rho` is kept as a plain Python value.
    self.rho = rho
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # The Python value decides whether decay ops are built in `get_updates`.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates two zero-initialized accumulators per parameter.

    Args:
      params: list or tuple of variables that will be updated by this
        optimizer.

    Returns:
      An `(accumulators, delta_accumulators)` tuple of lists, each in the
      order of `params`.
    """
    param_shapes = [K.int_shape(param) for param in params]
    grad_averages = [K.zeros(param_shape) for param_shape in param_shapes]
    delta_averages = [K.zeros(param_shape) for param_shape in param_shapes]
    self.weights = grad_averages + delta_averages
    return grad_averages, delta_averages

  def get_updates(self, loss, params):
    """Builds the Adadelta update operations for `params`.

    Args:
      loss: loss tensor whose gradients drive the update.
      params: list of variables to update.

    Returns:
      The list stored in `self.updates`: the iteration increment followed,
      for every parameter, by the assignment of its gradient accumulator, of
      its new (optionally constrained) value and of its delta accumulator.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    grad_averages, delta_averages = self._create_all_weights(params)

    learning_rate = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      learning_rate = learning_rate * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * step))

    for param, grad, grad_avg, delta_avg in zip(params, gradients,
                                                grad_averages, delta_averages):
      # update accumulator
      new_grad_avg = self.rho * grad_avg + (1. - self.rho) * math_ops.square(
          grad)
      self.updates.append(state_ops.assign(grad_avg, new_grad_avg))

      # use the new accumulator and the *old* delta_accumulator
      delta = grad * K.sqrt(delta_avg + self.epsilon) / K.sqrt(
          new_grad_avg + self.epsilon)
      new_param = param - learning_rate * delta

      # Apply the parameter's constraint, if it defines one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))

      # update delta_accumulator
      new_delta_avg = self.rho * delta_avg + (1 - self.rho) * math_ops.square(
          delta)
      self.updates.append(state_ops.assign(delta_avg, new_delta_avg))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a Python dict.

    Returns:
      The base class configuration extended with `lr` and `decay` (current
      variable values converted to floats), `rho` and `epsilon`.
    """
    own_config = {
        'lr': float(K.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    # Entries defined here take precedence over same-named base entries.
    merged = dict(super(Adadelta, self).get_config())
    merged.update(own_config)
    return merged
|
||||
|
||||
|
||||
class Adam(Optimizer):
|
||||
"""Adam optimizer.
|
||||
|
||||
Default parameters follow those provided in the original paper.
|
||||
|
||||
Arguments:
|
||||
lr: float >= 0. Learning rate.
|
||||
beta_1: float, 0 < beta < 1. Generally close to 1.
|
||||
beta_2: float, 0 < beta < 1. Generally close to 1.
|
||||
epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
|
||||
decay: float >= 0. Learning rate decay over each update.
|
||||
amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
|
||||
from the paper "On the Convergence of Adam and Beyond".
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
lr=0.001,
|
||||
beta_1=0.9,
|
||||
beta_2=0.999,
|
||||
epsilon=None,
|
||||
decay=0.,
|
||||
amsgrad=False,
|
||||
**kwargs):
|
||||
super(Adam, self).__init__(**kwargs)
|
||||
with K.name_scope(self.__class__.__name__):
|
||||
self.iterations = K.variable(0, dtype='int64', name='iterations')
|
||||
self.lr = K.variable(lr, name='lr')
|
||||
self.beta_1 = K.variable(beta_1, name='beta_1')
|
||||
self.beta_2 = K.variable(beta_2, name='beta_2')
|
||||
self.decay = K.variable(decay, name='decay')
|
||||
if epsilon is None:
|
||||
epsilon = K.epsilon()
|
||||
self.epsilon = epsilon
|
||||
self.initial_decay = decay
|
||||
self.amsgrad = amsgrad
|
||||
|
||||
def _create_all_weights(self, params):
|
||||
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
|
||||
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
|
||||
if self.amsgrad:
|
||||
vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
|
||||
else:
|
||||
vhats = [K.zeros(1) for _ in params]
|
||||
self.weights = [self.iterations] + ms + vs + vhats
|
||||
return ms, vs, vhats
|
||||
|
||||
def get_updates(self, loss, params):
|
||||
grads = self.get_gradients(loss, params)
|
||||
self.updates = []
|
||||
|
||||
lr = self.lr
|
||||
if self.initial_decay > 0:
|
||||
lr = lr * ( # pylint: disable=g-no-augmented-assignment
|
||||
1. /
|
||||
(1. +
|
||||
self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))
|
||||
|
||||
with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
|
||||
t = math_ops.cast(self.iterations, K.floatx())
|
||||
lr_t = lr * (
|
||||
K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
|
||||
(1. - math_ops.pow(self.beta_1, t)))
|
||||
|
||||
ms, vs, vhats = self._create_all_weights(params)
|
||||
for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
|
||||
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
|
||||
v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
|
||||
if self.amsgrad:
|
||||
vhat_t = math_ops.maximum(vhat, v_t)
|
||||
p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
|
||||
self.updates.append(state_ops.assign(vhat, vhat_t))
|
||||
else:
|
||||
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
|
||||
|
||||
self.updates.append(state_ops.assign(m, m_t))
|
||||
self.updates.append(state_ops.assign(v, v_t))
|
||||
new_p = p_t
|
||||
|
||||
# Apply constraints.
|
||||
if getattr(p, 'constraint', None) is not None:
|
||||
new_p = p.constraint(new_p)
|
||||
|
||||
self.updates.append(state_ops.assign(p, new_p))
|
||||
return self.updates
|
||||
|
||||
def get_config(self):
  """Returns the optimizer configuration as a dict.

  The hyperparameters held in backend variables (`lr`, `beta_1`, `beta_2`,
  `decay`) are read with `K.get_value` and converted to Python floats;
  `epsilon` and `amsgrad` are stored as-is. These entries are layered on top
  of the base-class config, so they win on key collisions.
  """
  adam_config = {
      'lr': float(K.get_value(self.lr)),
      'beta_1': float(K.get_value(self.beta_1)),
      'beta_2': float(K.get_value(self.beta_2)),
      'decay': float(K.get_value(self.decay)),
      'epsilon': self.epsilon,
      'amsgrad': self.amsgrad
  }
  merged = dict(super(Adam, self).get_config().items())
  merged.update(adam_config)
  return merged
|
||||
|
||||
|
||||
class Adamax(Optimizer):
  """Adamax optimizer, described in Section 7 of the Adam paper.

  Adamax is the variant of Adam that is based on the infinity norm.
  Default parameters follow those provided in the paper.

  Arguments:
      lr: float >= 0. Learning rate.
      beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    # Hyperparameters live in backend variables created under a name scope
    # named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # Plain Python copy of the decay, used to decide whether to build the
    # learning-rate decay ops at all.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates the accumulators for `params` and registers `self.weights`.

    Returns:
        Tuple `(ms, us)`: zero-initialized first moments and zero-initialized
        exponentially weighted infinity norms, both aligned with `params`.
    """
    shapes = [K.int_shape(param) for param in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us

  def get_updates(self, loss, params):
    """Builds the list of update ops for one Adamax step.

    Arguments:
        loss: value passed through to `self.get_gradients`.
        params: parameters to update; zipped with the computed gradients.

    Returns:
        `self.updates`: for each parameter, the assignment to `m`, then to
        `u`, then to the parameter itself.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      # Inverse-time decay of the base learning rate.
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = lr * (1. / (1. + self.decay * step))  # pylint: disable=g-no-augmented-assignment

    # `t` is read under a control dependency on the increment of `iterations`.
    increment = state_ops.assign_add(self.iterations, 1)
    with ops.control_dependencies([increment]):
      t = math_ops.cast(self.iterations, K.floatx())
    # Only the first moment is bias-corrected.
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    ms, us = self._create_all_weights(params)

    for param, grad, m, u in zip(params, grads, ms, us):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(grad))
      new_param = param - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))

      # Apply constraints.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a dict.

    Entries defined here are layered on top of the base-class config, so
    they win on key collisions.
    """
    adamax_config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = dict(super(Adamax, self).get_config().items())
    merged.update(adamax_config)
    return merged
|
||||
|
||||
|
||||
class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Learning rate.
      beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      schedule_decay: float. Scales the step count in the exponent of the
        `0.96 ** (t * schedule_decay)` momentum warm-up schedule.

  References cited by the inline comments:
      [1] Dozat, "Incorporating Nesterov Momentum into Adam" (Nadam report).
      [2] Sutskever et al., "On the importance of initialization and
          momentum in deep learning".
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      # Running product of the momentum schedule values; starts at 1.
      self.m_schedule = K.variable(1., name='m_schedule')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    """Creates the accumulators for `params` and registers `self.weights`.

    Returns:
        Tuple `(ms, vs)` of zero-initialized accumulators aligned with
        `params`.
    """
    shapes = [K.int_shape(param) for param in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]

    self.weights = [self.iterations, self.m_schedule] + ms + vs
    return ms, vs

  def get_updates(self, loss, params):
    """Builds the list of updates for one Nadam step.

    Arguments:
        loss: value passed through to `self.get_gradients`.
        params: parameters to update; zipped with the computed gradients.

    Returns:
        `self.updates`. The first entry is the `(variable, new_value)` tuple
        for `m_schedule`; it is followed, per parameter, by the assignments
        to `m`, `v` and the parameter itself.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    # `t` is read under a control dependency on the increment of `iterations`.
    increment = state_ops.assign_add(self.iterations, 1)
    with ops.control_dependencies([increment]):
      t = math_ops.cast(self.iterations, K.floatx())

    def _momentum_cache(step):
      # Due to the recommendations in [2], i.e. warming momentum schedule
      warmup = math_ops.pow(K.cast_to_floatx(0.96),
                            step * self.schedule_decay)
      return self.beta_1 * (1. - 0.5 * (warmup))

    momentum_cache_t = _momentum_cache(t)
    momentum_cache_t_1 = _momentum_cache(t + 1)
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    # Recorded as a (variable, value) pair rather than an assign op.
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)

    for param, grad, m, v in zip(params, grads, ms, vs):
      # the following equations given in [1]
      g_prime = grad / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * grad
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(grad)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      m_t_bar = ((1. - momentum_cache_t) * g_prime +
                 momentum_cache_t_1 * m_t_prime)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      new_param = param - self.lr * m_t_bar / (
          K.sqrt(v_t_prime) + self.epsilon)

      # Apply constraints.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        new_param = constraint(new_param)

      self.updates.append(state_ops.assign(param, new_param))
    return self.updates

  def get_config(self):
    """Returns the optimizer configuration as a dict.

    Entries defined here are layered on top of the base-class config, so
    they win on key collisions.
    """
    nadam_config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    merged = dict(super(Nadam, self).get_config().items())
    merged.update(nadam_config)
    return merged
|
||||
|
||||
|
||||
class TFOptimizer(Optimizer, trackable.Trackable):
  """Wrapper class for native TensorFlow optimizers.

  Gradient computation and application are delegated to the wrapped
  `optimizer` (`compute_gradients` / `apply_gradients`). The wrapper itself
  only owns the `iterations` counter, which is passed to the wrapped
  optimizer as its `global_step`.

  Arguments:
      optimizer: the native optimizer to wrap.
      iterations: optional existing step counter to use; when `None`, a new
        int64 backend variable initialized to 0 is created.
  """

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is None:
      with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
    else:
      self.iterations = iterations
    self._track_trackable(self.iterations, name='global_step')

  def _clip_gradients(self, grads):
    """Clip gradients according to the clipnorm and clipvalue attributes."""
    # TFOptimizer wrapper has no gradient clipping options.
    return grads

  def minimize(self, loss, var_list, grad_loss=None, tape=None):
    """Mimics the `OptimizerV2.minimize` API.

    Arguments:
        loss: either a callable returning the loss, or an already computed
          loss value (in which case `tape` is mandatory).
        var_list: variables to differentiate with respect to, or a callable
          returning them; a callable is only invoked when `loss` is callable
          too. The result is flattened with `nest.flatten`.
        grad_loss: forwarded as the third argument of `tape.gradient`.
        tape: tape used to compute the gradients; a fresh
          `backprop.GradientTape()` is created when omitted.

    Raises:
        ValueError: if `loss` is not callable and no `tape` is given.
    """
    if not callable(loss) and tape is None:
      raise ValueError('`tape` is required when a `Tensor` loss is passed.')
    tape = tape if tape is not None else backprop.GradientTape()

    if callable(loss):
      with tape:
        if not callable(var_list):
          tape.watch(var_list)
        loss = loss()
        # A callable `var_list` is resolved only after the loss has been
        # evaluated under the tape.
        if callable(var_list):
          var_list = var_list()

    var_list = nest.flatten(var_list)
    # Nothing to differentiate against: silently do nothing.
    if var_list:
      grads = tape.gradient(loss, var_list, grad_loss)
      grads_and_vars = list(zip(grads, var_list))
      self.apply_gradients(grads_and_vars)

  def apply_gradients(self, grads_and_vars):
    """Applies `grads_and_vars` via the wrapped optimizer.

    `self.iterations` is passed as the wrapped optimizer's `global_step`.
    """
    self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)

  def get_grads(self, loss, params):
    """Returns `self.optimizer.compute_gradients(loss, params)`."""
    return self.optimizer.compute_gradients(loss, params)

  def get_updates(self, loss, params):
    """Builds the list of update ops for one training step.

    Arguments:
        loss: loss passed to the wrapped optimizer's `compute_gradients`.
        params: parameters to update; may be empty.

    Returns:
        `self.updates`. Outside a distribution strategy with empty `params`
        this is only the increment of `self.iterations`; in every other case
        it is a single-element list holding the result of the wrapped
        optimizer's `apply_gradients`.
    """
    if distribution_strategy_context.has_strategy():
      self.updates = []

      if not params:
        # After the model vars have been created, the second call to get_updates
        # is called with params as an empty list. This ensures that we call
        # compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      else:
        grads = self.optimizer.compute_gradients(loss, params)
      # Under a strategy the step counter comes from
      # `training_util.get_global_step()` rather than `self.iterations`.
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)
    else:
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)

    self.updates.append(opt_update)
    return self.updates

  @property
  def weights(self):
    """Not supported by this wrapper; always raises `NotImplementedError`."""
    raise NotImplementedError(
        '`TFOptimizer` does not expose the weights of the wrapped optimizer.')

  def get_config(self):
    """Not supported by this wrapper; always raises `NotImplementedError`."""
    raise NotImplementedError(
        '`TFOptimizer` does not support `get_config`.')

  @classmethod
  def from_config(cls, config):
    """Not supported by this wrapper; always raises `NotImplementedError`.

    Declared as a classmethod so that both `TFOptimizer.from_config(config)`
    and `instance.from_config(config)` raise `NotImplementedError`. As a
    plain instance method, the class-level call failed with a `TypeError`
    for the missing `config` argument before reaching the `raise`.
    """
    raise NotImplementedError(
        '`TFOptimizer` does not support `from_config`.')
|
||||
|
||||
|
||||
# Aliases: lowercase module-level names bound to the optimizer classes.
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
|
||||
|
||||
|
||||
@keras_export('keras.optimizers.serialize')
def serialize(optimizer):
  """Serializes `optimizer` by delegating to `serialize_keras_object`.

  Arguments:
      optimizer: the optimizer instance to serialize.

  Returns:
      Whatever `serialize_keras_object(optimizer)` returns (presumably a
      config structure that can be deserialized again; confirm against the
      helper).
  """
  serialized = serialize_keras_object(optimizer)
  return serialized
|
||||
|
@ -27,6 +27,7 @@ from tensorflow.python import keras
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.utils import np_utils
|
||||
from tensorflow.python.platform import test
|
||||
@ -109,63 +110,63 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
|
||||
|
||||
def test_sgd(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.SGD())
|
||||
self._test_optimizer(optimizer_v1.SGD())
|
||||
|
||||
def test_momentum(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(
|
||||
keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True))
|
||||
optimizer_v1.SGD(lr=0.01, momentum=0.9, nesterov=True))
|
||||
|
||||
def test_rmsprop(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.RMSprop())
|
||||
self._test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
|
||||
self._test_optimizer(optimizer_v1.RMSprop())
|
||||
self._test_optimizer(optimizer_v1.RMSprop(decay=1e-3))
|
||||
|
||||
def test_adagrad(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.Adagrad())
|
||||
self._test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
|
||||
self._test_optimizer(optimizer_v1.Adagrad())
|
||||
self._test_optimizer(optimizer_v1.Adagrad(decay=1e-3))
|
||||
|
||||
def test_adadelta(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.Adadelta(), target=0.6)
|
||||
self._test_optimizer(optimizer_v1.Adadelta(), target=0.6)
|
||||
# Accuracy seems dependent on the initialization. Even adding
|
||||
# tf.compat.v1.Print nodes in the graph seemed to affect the
|
||||
# initialization seed, and hence the accuracy.
|
||||
self._test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
|
||||
self._test_optimizer(optimizer_v1.Adadelta(decay=1e-3), target=0.4)
|
||||
|
||||
def test_adam(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.Adam())
|
||||
self._test_optimizer(optimizer_v1.Adam())
|
||||
# Accuracy seems dependent on the seed initialization.
|
||||
# TODO(b/121051441): fix test flakiness.
|
||||
self._test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
|
||||
self._test_optimizer(keras.optimizers.Adam(amsgrad=True))
|
||||
self._test_optimizer(optimizer_v1.Adam(decay=1e-3), target=0.73)
|
||||
self._test_optimizer(optimizer_v1.Adam(amsgrad=True))
|
||||
|
||||
def test_adamax(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.Adamax())
|
||||
self._test_optimizer(keras.optimizers.Adamax(decay=1e-3))
|
||||
self._test_optimizer(optimizer_v1.Adamax())
|
||||
self._test_optimizer(optimizer_v1.Adamax(decay=1e-3))
|
||||
|
||||
def test_nadam(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(keras.optimizers.Nadam())
|
||||
self._test_optimizer(optimizer_v1.Nadam())
|
||||
|
||||
def test_clipnorm(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(
|
||||
keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
|
||||
optimizer_v1.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
|
||||
|
||||
def test_clipvalue(self):
|
||||
with self.cached_session():
|
||||
self._test_optimizer(
|
||||
keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
|
||||
optimizer_v1.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
|
||||
|
||||
def test_tf_optimizer(self):
|
||||
if context.executing_eagerly():
|
||||
self.skipTest(
|
||||
'v1 optimizer does not run in eager mode')
|
||||
optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
|
||||
optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
|
||||
model = keras.models.Sequential()
|
||||
model.add(keras.layers.Dense(
|
||||
2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
|
||||
@ -194,7 +195,7 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
|
||||
'v1 optimizer does not run in eager mode')
|
||||
graph = ops.Graph()
|
||||
with graph.as_default():
|
||||
optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
|
||||
optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
|
||||
keras.backend.track_tf_optimizer(optimizer)
|
||||
optimizer_weak = weakref.ref(optimizer)
|
||||
graph_weak = weakref.ref(graph)
|
||||
@ -209,7 +210,7 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
|
||||
self.skipTest(
|
||||
'v1 optimizer does not run in eager mode')
|
||||
with self.cached_session():
|
||||
optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
|
||||
optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
|
||||
model = keras.models.Sequential()
|
||||
model.add(keras.layers.Dense(
|
||||
2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
|
||||
@ -229,9 +230,9 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
|
||||
|
||||
def test_negative_clipvalue_or_clipnorm(self):
|
||||
with self.assertRaises(ValueError):
|
||||
_ = keras.optimizers.SGD(lr=0.01, clipvalue=-0.5)
|
||||
_ = optimizer_v1.SGD(lr=0.01, clipvalue=-0.5)
|
||||
with self.assertRaises(ValueError):
|
||||
_ = keras.optimizers.Adam(clipnorm=-2.0)
|
||||
_ = optimizer_v1.Adam(clipnorm=-2.0)
|
||||
|
||||
def test_mixed_precision_loss_scale_optimizer(self):
|
||||
if context.executing_eagerly():
|
||||
|
@ -26,7 +26,7 @@ import numpy as np
|
||||
from six.moves import zip # pylint: disable=redefined-builtin
|
||||
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.saving import model_config as model_config_lib
|
||||
from tensorflow.python.keras.saving import saving_utils
|
||||
from tensorflow.python.keras.saving.saved_model import json_utils
|
||||
@ -127,7 +127,7 @@ def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True):
|
||||
# TODO(b/128683857): Add integration tests between tf.keras and external
|
||||
# Keras, to avoid breaking TF.js users.
|
||||
if (include_optimizer and model.optimizer and
|
||||
not isinstance(model.optimizer, optimizers.TFOptimizer)):
|
||||
not isinstance(model.optimizer, optimizer_v1.TFOptimizer)):
|
||||
save_optimizer_weights_to_hdf5_group(f, model.optimizer)
|
||||
|
||||
f.flush()
|
||||
|
@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import combinations
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.engine import training
|
||||
@ -341,7 +342,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
|
||||
name='d1'))
|
||||
ref_model.add(keras.layers.Dense(num_classes, name='d2'))
|
||||
ref_model.compile(loss=keras.losses.MSE,
|
||||
optimizer=keras.optimizers.RMSprop(lr=0.0001),
|
||||
optimizer=optimizer_v1.RMSprop(lr=0.0001),
|
||||
metrics=[keras.metrics.categorical_accuracy])
|
||||
|
||||
f_ref_model = h5py.File(h5_path, 'w')
|
||||
@ -354,7 +355,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
|
||||
name='d1'))
|
||||
model.add(keras.layers.Dense(num_classes, name='d2'))
|
||||
model.compile(loss=keras.losses.MSE,
|
||||
optimizer=keras.optimizers.RMSprop(lr=0.0001),
|
||||
optimizer=optimizer_v1.RMSprop(lr=0.0001),
|
||||
metrics=[keras.metrics.categorical_accuracy])
|
||||
with self.assertRaisesRegex(
|
||||
ValueError, r'Layer #0 \(named "d1"\), weight '
|
||||
@ -515,7 +516,7 @@ class TestWholeModelSaving(keras_parameterized.TestCase):
|
||||
with ops.Graph().as_default(), self.cached_session():
|
||||
# test with custom optimizer, loss
|
||||
|
||||
class CustomOp(keras.optimizers.RMSprop):
|
||||
class CustomOp(optimizer_v1.RMSprop):
|
||||
pass
|
||||
|
||||
def custom_loss(y_true, y_pred):
|
||||
@ -692,7 +693,7 @@ class TestWholeModelSaving(keras_parameterized.TestCase):
|
||||
model = keras.Model(inputs, outputs)
|
||||
model.compile(
|
||||
loss=keras.losses.MSE,
|
||||
optimizer=keras.optimizers.Adam(),
|
||||
optimizer=optimizer_v1.Adam(),
|
||||
metrics=[
|
||||
keras.metrics.categorical_accuracy,
|
||||
keras.metrics.CategoricalAccuracy()
|
||||
@ -1028,7 +1029,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase):
|
||||
model = keras.models.Sequential()
|
||||
model.add(keras.layers.Dense(2, input_shape=(3,)))
|
||||
model.add(keras.layers.Dense(3))
|
||||
model.compile(loss='mse', optimizer=optimizers.Adam(), metrics=['acc'])
|
||||
model.compile(loss='mse', optimizer=optimizer_v1.Adam(), metrics=['acc'])
|
||||
if not ops.executing_eagerly_outside_functions():
|
||||
model._make_train_function()
|
||||
temp_dir = self.get_temp_dir()
|
||||
|
@ -25,7 +25,7 @@ import six
|
||||
from tensorflow.python.client import session
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
|
||||
from tensorflow.python.keras.saving import model_config
|
||||
from tensorflow.python.keras.saving import saving_utils
|
||||
@ -206,7 +206,7 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature):
|
||||
|
||||
has_saved_vars = False
|
||||
if model.optimizer:
|
||||
if isinstance(model.optimizer, (optimizers.TFOptimizer,
|
||||
if isinstance(model.optimizer, (optimizer_v1.TFOptimizer,
|
||||
optimizer_v2.OptimizerV2)):
|
||||
_export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args)
|
||||
has_saved_vars = True
|
||||
|
@ -31,6 +31,7 @@ from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.framework import tensor_spec
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.engine import training as model_lib
|
||||
from tensorflow.python.keras.optimizer_v2 import adadelta
|
||||
from tensorflow.python.keras.optimizer_v2 import rmsprop
|
||||
@ -458,7 +459,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
|
||||
x = keras.layers.Dense(2)(inputs)
|
||||
x = keras.layers.Dense(3)(x)
|
||||
clone = keras.models.Model(inputs, x)
|
||||
clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
|
||||
clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001))
|
||||
clone.train_on_batch(input_arr, target_arr)
|
||||
|
||||
keras_saved_model._assert_same_non_optimizer_objects(
|
||||
@ -487,7 +488,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
|
||||
x = keras.layers.Dense(4)(x)
|
||||
x = keras.layers.Dense(3)(x)
|
||||
clone = keras.models.Model(inputs, x)
|
||||
clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
|
||||
clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001))
|
||||
clone.train_on_batch(input_arr, target_arr)
|
||||
|
||||
def testSaveSequentialModelWithoutInputShapes(self):
|
||||
|
@ -24,6 +24,7 @@ import six
|
||||
from tensorflow.python.eager import def_function
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras import losses
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import optimizers
|
||||
from tensorflow.python.keras.engine import base_layer_utils
|
||||
from tensorflow.python.keras.utils import generic_utils
|
||||
@ -161,7 +162,7 @@ def model_metadata(model, include_optimizer=True, require_config=True):
|
||||
backend=K.backend(),
|
||||
model_config=model_config)
|
||||
if model.optimizer and include_optimizer:
|
||||
if isinstance(model.optimizer, optimizers.TFOptimizer):
|
||||
if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
|
||||
logging.warning(
|
||||
'TensorFlow optimizers do not '
|
||||
'make it possible to access '
|
||||
|
@ -26,6 +26,7 @@ import numpy as np
|
||||
|
||||
from tensorflow.python import keras
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.tests import model_architectures
|
||||
from tensorflow.python.platform import test
|
||||
@ -62,7 +63,7 @@ class TestModelArchitectures(keras_parameterized.TestCase):
|
||||
def get_custom_objects(self):
|
||||
"""Define custom_objects."""
|
||||
|
||||
class CustomOpt(keras.optimizers.SGD):
|
||||
class CustomOpt(optimizer_v1.SGD):
|
||||
pass
|
||||
|
||||
def custom_loss(y_true, y_pred):
|
||||
|
@ -24,6 +24,7 @@ from tensorflow.python import keras
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import config
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.keras import optimizer_v1
|
||||
from tensorflow.python.keras.utils import multi_gpu_utils
|
||||
from tensorflow.python.keras.utils import np_utils
|
||||
from tensorflow.python.platform import test
|
||||
@ -191,7 +192,7 @@ class TestMultiGPUModel(test.TestCase):
|
||||
|
||||
parallel_model.compile(
|
||||
loss='categorical_crossentropy',
|
||||
optimizer=keras.optimizers.RMSprop(lr=0.0001, decay=1e-6),
|
||||
optimizer=optimizer_v1.RMSprop(lr=0.0001, decay=1e-6),
|
||||
metrics=['accuracy'],
|
||||
target_tensors=[targets])
|
||||
parallel_model.fit(epochs=1, steps_per_epoch=3)
|
||||
|
Loading…
Reference in New Issue
Block a user