Move V1 optimizer code to a separate file optimizer_v1.py from the generic utils in optimizers.py.

PiperOrigin-RevId: 333495430
Change-Id: I6bf730dc507f067f79f51b7a5952b50549c7c5a4
This commit is contained in:
Tomer Kaftan 2020-09-24 05:30:54 -07:00 committed by TensorFlower Gardener
parent e798106686
commit a6b7e4b94c
20 changed files with 915 additions and 878 deletions

View File

@ -287,6 +287,7 @@ py_library(
py_library(
name = "optimizers",
srcs = [
"optimizer_v1.py",
"optimizers.py",
],
srcs_version = "PY2AND3",

View File

@ -36,7 +36,7 @@ from tensorflow.python.framework import tensor_util
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import callbacks
from tensorflow.python.keras import metrics as metrics_module
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils
from tensorflow.python.keras.engine import training_utils
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
@ -779,7 +779,7 @@ def _clone_and_build_model(model, mode, inputs=None, targets=None):
cloned_model = models.clone_model(model, input_tensors=inputs)
# Compile and build model.
if isinstance(model.optimizer, optimizers.TFOptimizer):
if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
optimizer = model.optimizer
else:
optimizer_config = model.optimizer.get_config()

View File

@ -42,7 +42,7 @@ from tensorflow.python.keras import backend
from tensorflow.python.keras import callbacks
from tensorflow.python.keras import metrics as metrics_module
from tensorflow.python.keras import models
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.distribute import multi_worker_testing_utils
from tensorflow.python.platform import test
from tensorflow.python.util import nest
@ -71,11 +71,11 @@ def _clone_and_build_model(model, strategy):
cloned_model = models.clone_model(model)
# Compile and build model.
if isinstance(model.optimizer, optimizers.TFOptimizer):
if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
optimizer = model.optimizer
# TODO(yuefengz): figure out why the optimizer here is still a
# TFOptimizer.
while isinstance(optimizer, optimizers.TFOptimizer):
while isinstance(optimizer, optimizer_v1.TFOptimizer):
optimizer = optimizer.optimizer
optimizer = copy.deepcopy(optimizer)
else:

View File

@ -39,6 +39,7 @@ from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras import backend
from tensorflow.python.keras import callbacks as callbacks_module
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils
from tensorflow.python.keras.engine import base_layer
@ -2463,7 +2464,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector):
def _validate_compile(self, optimizer, metrics, **kwargs):
"""Performs validation checks for the default `compile`."""
if any(
isinstance(opt, optimizers.Optimizer)
isinstance(opt, optimizer_v1.Optimizer)
for opt in nest.flatten(optimizer)):
raise ValueError(
'`tf.compat.v1.keras` Optimizer (', optimizer, ') is '

View File

@ -40,6 +40,7 @@ from tensorflow.python.framework import type_spec
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import losses
from tensorflow.python.keras import metrics as metrics_module
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.distribute import distributed_training_utils
from tensorflow.python.keras.distribute import distributed_training_utils_v1
@ -322,8 +323,8 @@ class Model(training_lib.Model):
self._set_optimizer(optimizer)
is_any_keras_optimizer_v1 = any(
(isinstance(opt, optimizers.Optimizer)
and not isinstance(opt, optimizers.TFOptimizer)
(isinstance(opt, optimizer_v1.Optimizer)
and not isinstance(opt, optimizer_v1.TFOptimizer)
) for opt in nest.flatten(self.optimizer))
if is_any_keras_optimizer_v1 and ops.executing_eagerly_outside_functions():

View File

@ -37,7 +37,7 @@ from tensorflow.python.keras import combinations
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import layers
from tensorflow.python.keras import models
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.engine import base_layer
from tensorflow.python.keras.engine import base_layer_utils
@ -854,7 +854,7 @@ class KerasModelTest(keras_parameterized.TestCase):
else:
error_msg = 'optimizer" must be an instance of '
with self.assertRaisesRegex(ValueError, error_msg):
model.compile(optimizers.SGD(1.), 'mse')
model.compile(optimizer_v1.SGD(1.), 'mse')
@combinations.generate(combinations.combine(mode=['graph', 'eager']))
def test_functional_model_loss_dtype(self):

View File

@ -22,7 +22,7 @@ from __future__ import print_function
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import metrics as metrics_module
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.engine import functional
from tensorflow.python.keras.engine import sequential
from tensorflow.python.keras.engine import training
@ -682,8 +682,8 @@ def clone_and_build_model(
clone._set_inputs(input_tensors)
if compile_clone:
if isinstance(orig_optimizer, optimizers.TFOptimizer):
optimizer = optimizers.TFOptimizer(
if isinstance(orig_optimizer, optimizer_v1.TFOptimizer):
optimizer = optimizer_v1.TFOptimizer(
orig_optimizer.optimizer, optimizer_iterations)
K.track_tf_optimizer(optimizer)
else:

View File

@ -32,6 +32,7 @@ from tensorflow.python.keras import backend as K
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import metrics
from tensorflow.python.keras import models
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import testing_utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
@ -420,10 +421,9 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase):
"""Assert that two models have the same compile parameters."""
self.assertEqual('mse', model.loss)
self.assertTrue(
isinstance(model.optimizer,
(keras.optimizers.RMSprop,
keras.optimizer_v2.rmsprop.RMSprop)))
self.assertIsInstance(
model.optimizer,
(optimizer_v1.RMSprop, keras.optimizer_v2.rmsprop.RMSprop))
def _clone_and_build_test_helper(self, model, model_type):
inp = np.random.random((10, 4))

View File

@ -0,0 +1,839 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=invalid-name
# pylint: disable=g-classes-have-attributes
"""Legacy v1 optimizer classes.
For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.python.distribute import distribution_strategy_context
from tensorflow.python.eager import backprop
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import training_util
from tensorflow.python.training.tracking import base as trackable
from tensorflow.python.util import nest
class Optimizer(object):
  """Abstract base class for the legacy (v1) Keras optimizers.

  This is the common parent of the optimizers, not an optimizer that can be
  used for training on its own: `_create_all_weights` and `get_updates` raise
  `NotImplementedError` here and must be provided by subclasses.

  All Keras optimizers support the following keyword arguments:

    clipnorm: float >= 0. Gradients will be clipped
      when their L2 norm exceeds this value.
    clipvalue: float >= 0. Gradients will be clipped
      when their absolute value exceeds this value.
  """

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def __init__(self, **kwargs):
    """Validates and stores the optional gradient-clipping settings.

    Arguments:
      **kwargs: only `clipnorm` and `clipvalue` are accepted, each >= 0.

    Raises:
      TypeError: if any other keyword is passed.
      ValueError: if a passed value is negative.
    """
    permitted = {'clipnorm', 'clipvalue'}
    for key in kwargs:
      if key not in permitted:
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(key))
      # Both clipping thresholds must be non-negative.
      if kwargs[key] < 0:
        raise ValueError(
            'Expected {} >= 0, received: {}'.format(key, kwargs[key]))
    # Accepted keywords become instance attributes. The clipping code and
    # `get_config` detect them with `hasattr`, so an option that was not
    # passed is simply absent.
    vars(self).update(kwargs)
    self.updates = []
    self.weights = []

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects that will be minimized
        using this optimizer.

    Returns:
      Specific weight values that are used in `get_updates`
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    """Abstract hook; subclasses return their list of update ops."""
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    When the instance has a `clipnorm` and/or `clipvalue` attribute, the
    gradients are clipped by norm first and by value second.

    Arguments:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    gradients = K.gradients(loss, params)
    for gradient in gradients:
      if gradient is None:
        raise ValueError('An operation has `None` for gradient. '
                         'Please make sure that all of your ops have a '
                         'gradient defined (i.e. are differentiable). '
                         'Common ops without gradient: '
                         'K.argmax, K.round, K.eval.')
    if hasattr(self, 'clipnorm'):
      gradients = [
          clip_ops.clip_by_norm(gradient, self.clipnorm)
          for gradient in gradients
      ]
    if hasattr(self, 'clipvalue'):
      gradients = [
          clip_ops.clip_by_value(gradient, -self.clipvalue, self.clipvalue)
          for gradient in gradients
      ]
    return gradients

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Arguments:
      weights: a list of Numpy arrays. The number of arrays and their shapes
        must match those of the optimizer weights
        (i.e. it should match the output of `get_weights`).

    Raises:
      ValueError: in case of incompatible weight shapes.
    """
    slots = self.weights
    if len(slots) != len(weights):
      raise ValueError('Length of the specified weight list (' +
                       str(len(weights)) +
                       ') does not match the number of weights '
                       'of the optimizer (' + str(len(slots)) + ')')
    assignments = []
    current_values = K.batch_get_value(slots)
    for current, slot, replacement in zip(current_values, slots, weights):
      if current.shape != replacement.shape:
        raise ValueError('Optimizer weight shape ' + str(current.shape) +
                         ' not compatible with '
                         'provided weight shape ' + str(replacement.shape))
      assignments.append((slot, replacement))
    # All values are written with a single batched backend call.
    K.batch_set_value(assignments)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
      A list of numpy arrays.
    """
    return K.batch_get_value(self.weights)

  def get_config(self):
    """Returns a dict holding whichever clipping options were set."""
    config = {}
    if hasattr(self, 'clipnorm'):
      config['clipnorm'] = self.clipnorm
    if hasattr(self, 'clipvalue'):
      config['clipvalue'] = self.clipvalue
    return config

  @classmethod
  def from_config(cls, config):
    """Builds an optimizer by passing `config` as keyword arguments."""
    return cls(**config)
class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Supports momentum, Nesterov momentum and learning rate decay.

  Arguments:
    lr: float >= 0. Learning rate.
    momentum: float >= 0. Parameter that accelerates SGD in the relevant
      direction and dampens oscillations.
    decay: float >= 0. Learning rate decay over each update.
    nesterov: boolean. Whether to apply Nesterov momentum.
    **kwargs: forwarded to `Optimizer.__init__`.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    # Hyperparameters are backend variables created under a name scope that
    # carries the class name.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.momentum = K.variable(momentum, name='momentum')
      self.decay = K.variable(decay, name='decay')
    self.nesterov = nesterov
    # Python-side copy of `decay`; `get_updates` checks it to decide whether
    # the decay schedule is built at all.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero-initialised velocity slot per parameter.

    Sets `self.weights` to the iteration counter followed by the slots.

    Arguments:
      params: variables to be optimized.

    Returns:
      The list of velocity slots, in `params` order.
    """
    param_shapes = [K.int_shape(var) for var in params]
    velocities = []
    for shape in param_shapes:
      # No dtype is passed, so the backend default of `K.zeros` applies.
      velocities.append(K.zeros(shape))
    self.weights = [self.iterations] + velocities
    return velocities

  def get_updates(self, loss, params):
    """Builds the update ops for one SGD step.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the iteration increment followed by, for each
      parameter, its velocity assignment and its value assignment.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      shrink = 1. / (1. + self.decay * step)
      lr = lr * shrink  # pylint: disable=g-no-augmented-assignment

    velocities = self._create_all_weights(params)
    for var, grad, slot in zip(params, grads, velocities):
      velocity = self.momentum * slot - lr * grad
      self.updates.append(state_ops.assign(slot, velocity))

      if self.nesterov:
        lookahead = var + self.momentum * velocity
        new_value = lookahead - lr * grad
      else:
        new_value = var + velocity

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'momentum': float(K.get_value(self.momentum)),
        'decay': float(K.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    merged = dict(super(SGD, self).get_config())
    merged.update(own)
    return merged
class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Arguments:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    **kwargs: forwarded to `Optimizer.__init__`.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.rho = K.variable(rho, name='rho')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # Python-side copy of `decay`, checked when the update graph is built.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one accumulator per parameter, matching its shape and dtype.

    `self.weights` is set to the accumulators only; the iteration counter is
    not part of it.

    Arguments:
      params: variables to be optimized.

    Returns:
      The list of accumulators, in `params` order.
    """
    squared_grad_slots = []
    for var in params:
      squared_grad_slots.append(K.zeros(K.int_shape(var), dtype=K.dtype(var)))
    self.weights = squared_grad_slots
    return squared_grad_slots

  def get_updates(self, loss, params):
    """Builds the update ops for one RMSProp step.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the iteration increment followed by, for each
      parameter, its accumulator assignment and its value assignment.
    """
    grads = self.get_gradients(loss, params)
    squared_grad_slots = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      shrink = 1. / (1. + self.decay * step)
      lr = lr * shrink  # pylint: disable=g-no-augmented-assignment

    for var, grad, slot in zip(params, grads, squared_grad_slots):
      # Exponential moving average of the squared gradient.
      kept = self.rho * slot
      fresh = (1. - self.rho) * math_ops.square(grad)
      new_slot = kept + fresh
      self.updates.append(state_ops.assign(slot, new_slot))

      scaled_grad = lr * grad
      denominator = K.sqrt(new_slot) + self.epsilon
      new_value = var - scaled_grad / denominator

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'rho': float(K.get_value(self.rho)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = dict(super(RMSprop, self).get_config())
    merged.update(own)
    return merged
class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
    lr: float >= 0. Initial learning rate.
    epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    **kwargs: forwarded to `Optimizer.__init__`.

  References:
    - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # Python-side copy of `decay`, checked when the update graph is built.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero-initialised accumulator per parameter.

    `self.weights` is set to the accumulators only; the iteration counter is
    not part of it.

    Arguments:
      params: variables to be optimized.

    Returns:
      The list of accumulators, in `params` order.
    """
    param_shapes = [K.int_shape(var) for var in params]
    squared_grad_sums = []
    for shape in param_shapes:
      # No dtype is passed, so the backend default of `K.zeros` applies.
      squared_grad_sums.append(K.zeros(shape))
    self.weights = squared_grad_sums
    return squared_grad_sums

  def get_updates(self, loss, params):
    """Builds the update ops for one Adagrad step.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the iteration increment followed by, for each
      parameter, its accumulator assignment and its value assignment.
    """
    grads = self.get_gradients(loss, params)
    squared_grad_sums = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      shrink = 1. / (1. + self.decay * step)
      lr = lr * shrink  # pylint: disable=g-no-augmented-assignment

    for var, grad, slot in zip(params, grads, squared_grad_sums):
      # Running (undecayed) sum of squared gradients.
      new_slot = slot + math_ops.square(grad)
      self.updates.append(state_ops.assign(slot, new_slot))

      scaled_grad = lr * grad
      denominator = K.sqrt(new_slot) + self.epsilon
      new_value = var - scaled_grad / denominator

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = dict(super(Adagrad, self).get_config())
    merged.update(own)
    return merged
class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
    lr: float >= 0. Initial learning rate, defaults to 1.
      It is recommended to leave it at the default value.
    rho: float >= 0. Adadelta decay factor, corresponding to fraction of
      gradient to keep at each time step.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Initial learning rate decay.
    **kwargs: forwarded to `Optimizer.__init__`.

  References:
    - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    # `rho` and `epsilon` are kept as plain Python values, not variables.
    self.rho = rho
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # Python-side copy of `decay`, checked when the update graph is built.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates the gradient and update accumulators for every parameter.

    `self.weights` is set to all gradient accumulators followed by all update
    accumulators; the iteration counter is not part of it.

    Arguments:
      params: variables to be optimized.

    Returns:
      A `(accumulators, delta_accumulators)` tuple of lists in `params` order.
    """
    param_shapes = [K.int_shape(var) for var in params]
    # All gradient accumulators are created before any update accumulator.
    grad_slots = []
    for shape in param_shapes:
      grad_slots.append(K.zeros(shape))
    delta_slots = []
    for shape in param_shapes:
      delta_slots.append(K.zeros(shape))
    self.weights = grad_slots + delta_slots
    return grad_slots, delta_slots

  def get_updates(self, loss, params):
    """Builds the update ops for one Adadelta step.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the iteration increment followed by, for each
      parameter, the gradient-accumulator, value and update-accumulator
      assignments.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    grad_slots, delta_slots = self._create_all_weights(params)

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      shrink = 1. / (1. + self.decay * step)
      lr = lr * shrink  # pylint: disable=g-no-augmented-assignment

    for var, grad, grad_slot, delta_slot in zip(params, grads, grad_slots,
                                                delta_slots):
      # Moving average of the squared gradient.
      kept = self.rho * grad_slot
      fresh = (1. - self.rho) * math_ops.square(grad)
      new_grad_slot = kept + fresh
      self.updates.append(state_ops.assign(grad_slot, new_grad_slot))

      # Uses the new gradient accumulator and the *old* update accumulator.
      numerator = grad * K.sqrt(delta_slot + self.epsilon)
      update = numerator / K.sqrt(new_grad_slot + self.epsilon)
      new_value = var - lr * update

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))

      # Moving average of the squared update.
      kept_delta = self.rho * delta_slot
      fresh_delta = (1 - self.rho) * math_ops.square(update)
      new_delta_slot = kept_delta + fresh_delta
      self.updates.append(state_ops.assign(delta_slot, new_delta_slot))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = dict(super(Adadelta, self).get_config())
    merged.update(own)
    return merged
class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
    **kwargs: forwarded to `Optimizer.__init__`.
  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # Python-side copy of `decay`, checked when the update graph is built.
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def _create_all_weights(self, params):
    """Creates the moment slots for every parameter.

    Sets `self.weights` to the iteration counter followed by all first
    moments, all second moments and all `vhat` slots.

    Arguments:
      params: variables to be optimized.

    Returns:
      A `(ms, vs, vhats)` tuple of lists in `params` order.
    """

    def zeros_like_params():
      return [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

    first_moments = zeros_like_params()
    second_moments = zeros_like_params()
    if self.amsgrad:
      max_second_moments = zeros_like_params()
    else:
      # Without AMSGrad a one-element placeholder is still created per
      # parameter, so `self.weights` keeps the same length either way.
      max_second_moments = [K.zeros(1) for _ in params]
    self.weights = ([self.iterations] + first_moments + second_moments +
                    max_second_moments)
    return first_moments, second_moments, max_second_moments

  def get_updates(self, loss, params):
    """Builds the update ops for one Adam step.

    The iteration increment is not appended to `self.updates`; it is attached
    as a control dependency of the step value `t`.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the per-parameter slot and value assignments.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      shrink = 1. / (1. + self.decay * step)
      lr = lr * shrink  # pylint: disable=g-no-augmented-assignment

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    # Bias-corrected step size.
    correction_2 = K.sqrt(1. - math_ops.pow(self.beta_2, t))
    correction_1 = 1. - math_ops.pow(self.beta_1, t)
    lr_t = lr * (correction_2 / correction_1)

    ms, vs, vhats = self._create_all_weights(params)
    for var, grad, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(grad)
      if self.amsgrad:
        # AMSGrad divides by the running maximum of the second moment.
        vhat_t = math_ops.maximum(vhat, v_t)
        scaled_moment = lr_t * m_t
        new_value = var - scaled_moment / (K.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        scaled_moment = lr_t * m_t
        new_value = var - scaled_moment / (K.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    merged = dict(super(Adam, self).get_config())
    merged.update(own)
    return merged
class Adamax(Optimizer):
  """Adamax optimizer from Adam paper's Section 7.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    **kwargs: forwarded to `Optimizer.__init__`.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    # Python-side copy of `decay`, checked when the update graph is built.
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates the first-moment and infinity-norm slots for every parameter.

    Sets `self.weights` to the iteration counter followed by all first
    moments and all infinity-norm slots.

    Arguments:
      params: variables to be optimized.

    Returns:
      A `(ms, us)` tuple of lists in `params` order.
    """
    param_shapes = [K.int_shape(var) for var in params]
    # Zero-initialised first moments, created before any norm slot.
    first_moments = []
    for shape in param_shapes:
      first_moments.append(K.zeros(shape))
    # Zero-initialised exponentially weighted infinity norms.
    inf_norms = []
    for shape in param_shapes:
      inf_norms.append(K.zeros(shape))
    self.weights = [self.iterations] + first_moments + inf_norms
    return first_moments, inf_norms

  def get_updates(self, loss, params):
    """Builds the update ops for one Adamax step.

    The iteration increment is not appended to `self.updates`; it is attached
    as a control dependency of the step value `t`.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the per-parameter slot and value assignments.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      shrink = 1. / (1. + self.decay * step)
      lr = lr * shrink  # pylint: disable=g-no-augmented-assignment

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    # Only the first moment is bias-corrected.
    correction_1 = 1. - math_ops.pow(self.beta_1, t)
    lr_t = lr / correction_1

    ms, us = self._create_all_weights(params)
    for var, grad, m, u in zip(params, grads, ms, us):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(grad))
      scaled_moment = lr_t * m_t
      new_value = var - scaled_moment / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = dict(super(Adamax, self).get_config())
    merged.update(own)
    return merged
class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam RMSprop with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    schedule_decay: float. Multiplies the step count in the exponent of the
      momentum warm-up schedule built by `get_updates`.
    **kwargs: forwarded to `Optimizer.__init__`.

  NOTE(review): comments in `get_updates` cite `[1]` and `[2]`, but this file
  carries no reference list for them -- confirm the intended sources.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.m_schedule = K.variable(1., name='m_schedule')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    """Creates the first- and second-moment slots for every parameter.

    Sets `self.weights` to the iteration counter and `m_schedule` followed by
    all first moments and all second moments.

    Arguments:
      params: variables to be optimized.

    Returns:
      A `(ms, vs)` tuple of lists in `params` order.
    """
    param_shapes = [K.int_shape(var) for var in params]
    # All first moments are created before any second moment.
    first_moments = []
    for shape in param_shapes:
      first_moments.append(K.zeros(shape))
    second_moments = []
    for shape in param_shapes:
      second_moments.append(K.zeros(shape))
    self.weights = ([self.iterations, self.m_schedule] + first_moments +
                    second_moments)
    return first_moments, second_moments

  def get_updates(self, loss, params):
    """Builds the update ops for one Nadam step.

    The iteration increment is not appended to `self.updates`; it is attached
    as a control dependency of the step value `t`.

    Arguments:
      loss: loss tensor.
      params: variables to update.

    Returns:
      `self.updates`: the `m_schedule` update followed by the per-parameter
      slot and value assignments.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())

    # Due to the recommendations in [2], i.e. warming momentum schedule
    warmup_t = math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)
    momentum_cache_t = self.beta_1 * (1. - 0.5 * warmup_t)
    warmup_t_1 = math_ops.pow(
        K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)
    momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * warmup_t_1)
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    # Unlike every other update here, this one is recorded as a
    # `(variable, new_value)` pair rather than as an assign op.
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)
    for var, grad, m, v in zip(params, grads, ms, vs):
      # the following equations given in [1]
      g_prime = grad / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * grad
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(grad)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      gradient_part = (1. - momentum_cache_t) * g_prime
      m_t_bar = gradient_part + momentum_cache_t_1 * m_t_prime

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      scaled_moment = self.lr * m_t_bar
      new_value = var - scaled_moment / (K.sqrt(v_t_prime) + self.epsilon)

      # Apply constraints.
      constraint = getattr(var, 'constraint', None)
      if constraint is not None:
        new_value = constraint(new_value)

      self.updates.append(state_ops.assign(var, new_value))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged over the base class config."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    merged = dict(super(Nadam, self).get_config())
    merged.update(own)
    return merged
class TFOptimizer(Optimizer, trackable.Trackable):
"""Wrapper class for native TensorFlow optimizers."""
def __init__(self, optimizer, iterations=None): # pylint: disable=super-init-not-called
self.optimizer = optimizer
self._track_trackable(optimizer, name='optimizer')
if iterations is None:
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
else:
self.iterations = iterations
self._track_trackable(self.iterations, name='global_step')
def _clip_gradients(self, grads):
"""Clip gradients according to the clipnorm and clipvalue attributes."""
# TFOptimizer wrapper has no gradient clipping options.
return grads
def minimize(self, loss, var_list, grad_loss=None, tape=None):
"""Mimics the `OptimizerV2.minimize` API."""
if not callable(loss) and tape is None:
raise ValueError('`tape` is required when a `Tensor` loss is passed.')
tape = tape if tape is not None else backprop.GradientTape()
if callable(loss):
with tape:
if not callable(var_list):
tape.watch(var_list)
loss = loss()
if callable(var_list):
var_list = var_list()
var_list = nest.flatten(var_list)
if var_list:
grads = tape.gradient(loss, var_list, grad_loss)
grads_and_vars = list(zip(grads, var_list))
self.apply_gradients(grads_and_vars)
def apply_gradients(self, grads_and_vars):
self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)
def get_grads(self, loss, params):
return self.optimizer.compute_gradients(loss, params)
def get_updates(self, loss, params):
if distribution_strategy_context.has_strategy():
self.updates = []
if not params:
# After the model vars have been created, the second call to get_updates
# is called with params as an empty list. This ensures that we call
# compute_gradients with params=None.
grads = self.optimizer.compute_gradients(loss)
else:
grads = self.optimizer.compute_gradients(loss, params)
global_step = training_util.get_global_step()
opt_update = self.optimizer.apply_gradients(grads, global_step)
else:
if not params:
self.updates = [state_ops.assign_add(self.iterations, 1)]
return self.updates
# Updates list starts out empty because the iterations variable is
# incremented in optimizer.apply_gradients()
self.updates = []
grads = self.optimizer.compute_gradients(loss, params)
opt_update = self.optimizer.apply_gradients(
grads, global_step=self.iterations)
self.updates.append(opt_update)
return self.updates
@property
def weights(self):
  """Not available on this wrapper; reading it raises `NotImplementedError`."""
  raise NotImplementedError
def get_config(self):
  """Not supported by this wrapper; always raises `NotImplementedError`."""
  raise NotImplementedError
def from_config(self, config):
  """Not supported by this wrapper; always raises `NotImplementedError`.

  Arguments:
    config: Ignored.
  """
  raise NotImplementedError
# Aliases: lowercase module-level names bound to the optimizer classes above.
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam

View File

@ -26,7 +26,7 @@ from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.keras import combinations
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.optimizer_v2 import adam
from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
from tensorflow.python.ops import array_ops
@ -537,7 +537,7 @@ class AdamOptimizerTest(test.TestCase, parameterized.TestCase):
self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
def testSetWeightsFromV1AdamWithoutMinimize(self):
keras_v1_adam = optimizers.Adam()
keras_v1_adam = optimizer_v1.Adam()
keras_v2_adam = adam.Adam()
keras_v2_adam.set_weights(keras_v1_adam.get_weights())
keras_v1_iteration = keras_v1_adam.iterations

View File

@ -35,7 +35,7 @@ from tensorflow.python.keras import callbacks
from tensorflow.python.keras import combinations
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import losses
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.engine import input_layer
from tensorflow.python.keras.engine import sequential
@ -739,42 +739,42 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase):
rtol=1e-5, atol=1e-5)
def testAdadeltaCompatibility(self):
  """Runs the shared compatibility check on V1 vs. V2 Adadelta (lr 0.01)."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.Adadelta(lr=0.01)
  opt_v2 = adadelta.Adadelta(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testAdagradCompatibility(self):
  """Runs the shared compatibility check on V1 vs. V2 Adagrad (lr 0.01)."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.Adagrad(lr=0.01)
  opt_v2 = adagrad.Adagrad(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testAdamCompatibility(self):
  """Runs the shared compatibility check on default V1 vs. V2 Adam."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.Adam()
  opt_v2 = adam.Adam()
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testAdamaxCompatibility(self):
  """Runs the shared compatibility check on V1 vs. V2 Adamax (lr 0.01)."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.Adamax(lr=0.01)
  opt_v2 = adamax.Adamax(learning_rate=0.01)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testNadamCompatibility(self):
  """Runs the shared compatibility check on V1 vs. V2 Nadam (lr 0.001)."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.Nadam(lr=0.001)
  opt_v2 = nadam.Nadam(learning_rate=0.001)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testMomentumCompatibility(self):
  """Runs the shared compatibility check on V1 vs. V2 SGD with momentum 0.9."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.SGD(lr=0.01, momentum=0.9)
  opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9)
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testRMSpropCompatibility(self):
  """Runs the shared compatibility check on default V1 vs. V2 RMSprop."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.RMSprop()
  opt_v2 = rmsprop.RMSprop()
  self._testOptimizersCompatibility(opt_v1, opt_v2)
def testSGDCompatibility(self):
  """Runs the shared compatibility check on V1 vs. V2 plain SGD (lr 0.01)."""
  # The V1 class is taken from `optimizer_v1`; the stale duplicate assignment
  # through `optimizers` was dropped.
  opt_v1 = optimizer_v1.SGD(lr=0.01)
  opt_v2 = gradient_descent.SGD(learning_rate=0.01)
  # NOTE(review): the meaning of the trailing `False` is defined by
  # `_testOptimizersCompatibility`, which is not shown here.
  self._testOptimizersCompatibility(opt_v1, opt_v2, False)
@ -804,7 +804,7 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase):
num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
model_tf.set_weights(model_k_v2.get_weights())
opt_k_v1 = optimizers.SGD(momentum=0.9, nesterov=True)
opt_k_v1 = optimizer_v1.SGD(momentum=0.9, nesterov=True)
opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
opt_tf = momentum.MomentumOptimizer(
learning_rate=0.01, momentum=0.9, use_nesterov=True)
@ -858,7 +858,7 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase):
num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
model_k_v2.set_weights(model_k_v1.get_weights())
opt_k_v1 = optimizers.Adam(amsgrad=True)
opt_k_v1 = optimizer_v1.Adam(amsgrad=True)
opt_k_v2 = adam.Adam(amsgrad=True)
model_k_v1.compile(

View File

@ -22,12 +22,10 @@ from __future__ import division
from __future__ import print_function
import six
from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.python.distribute import distribution_strategy_context
from tensorflow.python.eager import backprop
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.optimizer_v1 import Optimizer
from tensorflow.python.keras.optimizer_v1 import TFOptimizer
from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
@ -39,819 +37,10 @@ from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer as tf_optimizer_module
from tensorflow.python.training import training_util
from tensorflow.python.training.tracking import base as trackable
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import keras_export
class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def __init__(self, **kwargs):
    """Validates and stores the optional clipping settings.

    Arguments:
      **kwargs: Only `clipnorm` and `clipvalue` are accepted; each must be
        >= 0. Accepted entries become instance attributes.

    Raises:
      TypeError: On any other keyword.
      ValueError: On a negative value.
    """
    for name, value in kwargs.items():
      if name not in ('clipnorm', 'clipvalue'):
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(name))
      # Both clipping thresholds have to be non-negative.
      if value < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(name, value))
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects that will be minimized
        using this optimizer.

    Returns:
      Specific weight values that are used in `get_updates`
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    """Subclass hook; the base implementation raises `NotImplementedError`."""
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    When the instance has a `clipnorm` and/or `clipvalue` attribute, each
    gradient is clipped by norm first and by value second.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
          function not implemented).
    """
    gradients = K.gradients(loss, params)
    for gradient in gradients:
      if gradient is None:
        raise ValueError('An operation has `None` for gradient. '
                         'Please make sure that all of your ops have a '
                         'gradient defined (i.e. are differentiable). '
                         'Common ops without gradient: '
                         'K.argmax, K.round, K.eval.')
    if hasattr(self, 'clipnorm'):
      gradients = [
          clip_ops.clip_by_norm(gradient, self.clipnorm)
          for gradient in gradients
      ]
    if hasattr(self, 'clipvalue'):
      gradients = [
          clip_ops.clip_by_value(gradient, -self.clipvalue, self.clipvalue)
          for gradient in gradients
      ]
    return gradients

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Arguments:
        weights: a list of Numpy arrays. The number of arrays and their
          shapes must match those of the optimizer's weights (i.e. it should
          match the output of `get_weights`).

    Raises:
        ValueError: in case of incompatible weight shapes.
    """
    variables = self.weights
    if len(variables) != len(weights):
      raise ValueError('Length of the specified weight list (' +
                       str(len(weights)) +
                       ') does not match the number of weights '
                       'of the optimizer (' + str(len(variables)) + ')')
    assignments = []
    current_values = K.batch_get_value(variables)
    for current, variable, new_value in zip(current_values, variables, weights):
      if current.shape != new_value.shape:
        raise ValueError('Optimizer weight shape ' + str(current.shape) +
                         ' not compatible with '
                         'provided weight shape ' + str(new_value.shape))
      assignments.append((variable, new_value))
    # All values are written in a single batched call.
    K.batch_set_value(assignments)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
        A list of numpy arrays.
    """
    return K.batch_get_value(self.weights)

  def get_config(self):
    """Returns a dict holding whichever clipping settings are set."""
    config = {}
    for name in ('clipnorm', 'clipvalue'):
      if hasattr(self, name):
        config[name] = getattr(self, name)
    return config

  @classmethod
  def from_config(cls, config):
    """Builds an instance by passing `config` entries as keyword arguments."""
    return cls(**config)
class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Arguments:
      lr: float >= 0. Learning rate.
      momentum: float >= 0. Parameter that accelerates SGD in the relevant
        direction and dampens oscillations.
      decay: float >= 0. Learning rate decay over each update.
      nesterov: boolean. Whether to apply Nesterov momentum.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    # Hyperparameters live in backend variables created under a name scope
    # named after the class.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.momentum = K.variable(momentum, name='momentum')
      self.decay = K.variable(decay, name='decay')
    self.initial_decay = decay
    self.nesterov = nesterov

  def _create_all_weights(self, params):
    """Creates one zero-filled moment per parameter.

    Sets `self.weights` to the iteration counter followed by the moments.

    Returns:
      The list of moment variables.
    """
    moments = [K.zeros(K.int_shape(param)) for param in params]
    self.weights = [self.iterations] + moments
    return moments

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = self.lr * (1. / (1. + self.decay * step))

    moments = self._create_all_weights(params)
    for param, grad, moment in zip(params, grads, moments):
      velocity = self.momentum * moment - lr * grad
      self.updates.append(state_ops.assign(moment, velocity))

      if self.nesterov:
        candidate = param + self.momentum * velocity - lr * grad
      else:
        candidate = param + velocity

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'momentum': float(K.get_value(self.momentum)),
        'decay': float(K.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    merged = super(SGD, self).get_config()
    merged.update(own)
    return merged
class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Arguments:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.rho = K.variable(rho, name='rho')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero accumulator per parameter, matching shape and dtype.

    Sets `self.weights` to the accumulators.

    Returns:
      The list of accumulator variables.
    """
    accumulators = [
        K.zeros(K.int_shape(param), dtype=K.dtype(param)) for param in params
    ]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list.
    """
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = self.lr * (1. / (1. + self.decay * step))

    for param, grad, acc in zip(params, grads, accumulators):
      # Moving average of the squared gradient.
      acc_next = self.rho * acc + (1. - self.rho) * math_ops.square(grad)
      self.updates.append(state_ops.assign(acc, acc_next))
      candidate = param - lr * grad / (K.sqrt(acc_next) + self.epsilon)

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'rho': float(K.get_value(self.rho)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = super(RMSprop, self).get_config()
    merged.update(own)
    return merged
class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate.
      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.

  References:
      - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates one zero accumulator per parameter, matching its shape.

    Sets `self.weights` to the accumulators.

    Returns:
      The list of accumulator variables.
    """
    accumulators = [K.zeros(K.int_shape(param)) for param in params]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list.
    """
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = self.lr * (1. / (1. + self.decay * step))

    for param, grad, acc in zip(params, grads, accumulators):
      # Running sum of squared gradients.
      acc_next = acc + math_ops.square(grad)
      self.updates.append(state_ops.assign(acc, acc_next))
      candidate = param - lr * grad / (K.sqrt(acc_next) + self.epsilon)

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = super(Adagrad, self).get_config()
    merged.update(own)
    return merged
class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate, defaults to 1.
          It is recommended to leave it at the default value.
      rho: float >= 0. Adadelta decay factor, corresponding to fraction of
          gradient to keep at each time step.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Initial learning rate decay.

  References:
      - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    self.rho = rho
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates two zero-filled slots per parameter.

    Sets `self.weights` to all accumulators followed by all delta
    accumulators.

    Returns:
      An `(accumulators, delta_accumulators)` tuple of lists.
    """
    shapes = [K.int_shape(param) for param in params]
    # All accumulators are created before any delta accumulator.
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    return accumulators, delta_accumulators

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    accumulators, delta_accumulators = self._create_all_weights(params)

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = self.lr * (1. / (1. + self.decay * step))

    slots = zip(params, grads, accumulators, delta_accumulators)
    for param, grad, acc, delta_acc in slots:
      # Moving average of the squared gradient.
      acc_next = self.rho * acc + (1. - self.rho) * math_ops.square(grad)
      self.updates.append(state_ops.assign(acc, acc_next))

      # The step uses the new accumulator and the *old* delta accumulator.
      update = (grad * K.sqrt(delta_acc + self.epsilon) /
                K.sqrt(acc_next + self.epsilon))
      candidate = param - lr * update

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))

      # Moving average of the squared step.
      delta_acc_next = (
          self.rho * delta_acc + (1 - self.rho) * math_ops.square(update))
      self.updates.append(state_ops.assign(delta_acc, delta_acc_next))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = super(Adadelta, self).get_config()
    merged.update(own)
    return merged
class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def _create_all_weights(self, params):
    """Creates the `m`, `v` and `vhat` slots for every parameter.

    Without `amsgrad`, each `vhat` is a size-1 placeholder instead of a
    parameter-shaped slot. Sets `self.weights` to the iteration counter
    followed by all `m`, all `v` and all `vhat` slots.

    Returns:
      An `(ms, vs, vhats)` tuple of lists.
    """

    def zeros_like_param(param):
      return K.zeros(K.int_shape(param), dtype=K.dtype(param))

    ms = [zeros_like_param(param) for param in params]
    vs = [zeros_like_param(param) for param in params]
    if self.amsgrad:
      vhats = [zeros_like_param(param) for param in params]
    else:
      vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats
    return ms, vs, vhats

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = self.lr * (1. / (1. + self.decay * step))

    # `t` is read under a control dependency on the counter increment.
    increment = state_ops.assign_add(self.iterations, 1)
    with ops.control_dependencies([increment]):
      t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr * (
        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms, vs, vhats = self._create_all_weights(params)
    for param, grad, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(grad)
      if self.amsgrad:
        # AMSGrad divides by the running maximum of `v_t`.
        vhat_t = math_ops.maximum(vhat, v_t)
        candidate = param - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        candidate = param - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    merged = super(Adam, self).get_config()
    merged.update(own)
    return merged
class Adamax(Optimizer):
  """Adamax optimizer from Adam paper's Section 7.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    """Creates the first-moment and infinity-norm slots per parameter.

    Sets `self.weights` to the iteration counter followed by all `m` slots
    and then all `u` slots.

    Returns:
      An `(ms, us)` tuple of lists.
    """
    shapes = [K.int_shape(param) for param in params]
    # First moments, initialised to zero.
    ms = [K.zeros(shape) for shape in shapes]
    # Exponentially weighted infinity norms, initialised to zero.
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      step = math_ops.cast(self.iterations, K.dtype(self.decay))
      lr = self.lr * (1. / (1. + self.decay * step))

    # `t` is read under a control dependency on the counter increment.
    increment = state_ops.assign_add(self.iterations, 1)
    with ops.control_dependencies([increment]):
      t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    ms, us = self._create_all_weights(params)
    for param, grad, m, u in zip(params, grads, ms, us):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(grad))
      candidate = param - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    merged = super(Adamax, self).get_config()
    merged.update(own)
    return merged
class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam RMSprop with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    schedule_decay: float. Multiplies the step count in the exponent of the
      momentum schedule computed in `get_updates`.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.m_schedule = K.variable(1., name='m_schedule')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    """Creates the `m` and `v` slots for every parameter.

    Sets `self.weights` to the iteration counter and `m_schedule`, followed
    by all `m` slots and then all `v` slots.

    Returns:
      An `(ms, vs)` tuple of lists.
    """
    shapes = [K.int_shape(param) for param in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations, self.m_schedule] + ms + vs
    return ms, vs

  def get_updates(self, loss, params):
    """Builds the update ops for one step and stores them in `self.updates`.

    Returns:
      The `self.updates` list. Its first entry is a
      `(variable, new_value)` tuple for `m_schedule`; the rest are assign
      ops.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    # `t` is read under a control dependency on the counter increment.
    increment = state_ops.assign_add(self.iterations, 1)
    with ops.control_dependencies([increment]):
      t = math_ops.cast(self.iterations, K.floatx())

    # Warming momentum schedule: the momentum for step t and for step t + 1.
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)
    for param, grad, m, v in zip(params, grads, ms, vs):
      grad_prime = grad / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * grad
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(grad)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      m_t_bar = ((1. - momentum_cache_t) * grad_prime +
                 momentum_cache_t_1 * m_t_prime)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      candidate = param - self.lr * m_t_bar / (
          K.sqrt(v_t_prime) + self.epsilon)

      # Apply the parameter's constraint, if it has one.
      constraint = getattr(param, 'constraint', None)
      if constraint is not None:
        candidate = constraint(candidate)

      self.updates.append(state_ops.assign(param, candidate))
    return self.updates

  def get_config(self):
    """Returns the base config extended with this optimizer's settings."""
    own = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    merged = super(Nadam, self).get_config()
    merged.update(own)
    return merged
class TFOptimizer(Optimizer, trackable.Trackable):
  """Wrapper class for native TensorFlow optimizers."""

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    """Wraps `optimizer` and sets up the step counter.

    Arguments:
      optimizer: The optimizer to wrap; it is tracked under the name
        'optimizer'.
      iterations: Optional step counter. When `None`, an int64 backend
        variable starting at 0 is created. Either way the counter is tracked
        under the name 'global_step'.
    """
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is not None:
      self.iterations = iterations
    else:
      with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
    self._track_trackable(self.iterations, name='global_step')

  def _clip_gradients(self, grads):
    """Returns `grads` untouched."""
    # TFOptimizer wrapper has no gradient clipping options.
    unclipped = grads
    return unclipped

  def minimize(self, loss, var_list, grad_loss=None, tape=None):
    """Mimics the `OptimizerV2.minimize` API.

    Arguments:
      loss: Either a loss value or a zero-argument callable returning one.
      var_list: The variables to update (any nested structure), or a
        zero-argument callable returning them. A callable `var_list` is only
        invoked when `loss` is callable as well.
      grad_loss: Forwarded to `tape.gradient` as its third argument.
      tape: The tape used to compute gradients. Mandatory when `loss` is not
        callable; when `loss` is callable and no tape is given, a new
        `backprop.GradientTape` is created.

    Raises:
      ValueError: If `loss` is not callable and no `tape` was supplied.
    """
    loss_is_callable = callable(loss)
    if tape is None:
      if not loss_is_callable:
        raise ValueError('`tape` is required when a `Tensor` loss is passed.')
      tape = backprop.GradientTape()

    if loss_is_callable:
      # Evaluate the loss under the tape so the forward pass gets recorded.
      with tape:
        if callable(var_list):
          loss = loss()
          var_list = var_list()
        else:
          tape.watch(var_list)
          loss = loss()

    variables = nest.flatten(var_list)
    if not variables:
      # Nothing to differentiate against, so no update is applied.
      return
    gradients = tape.gradient(loss, variables, grad_loss)
    self.apply_gradients(list(zip(gradients, variables)))

  def apply_gradients(self, grads_and_vars):
    """Applies gradients through the wrapped optimizer.

    `self.iterations` is passed as the `global_step`.
    """
    wrapped = self.optimizer
    wrapped.apply_gradients(grads_and_vars, global_step=self.iterations)

  def get_grads(self, loss, params):
    """Returns the wrapped optimizer's `compute_gradients(loss, params)`."""
    grads_and_vars = self.optimizer.compute_gradients(loss, params)
    return grads_and_vars

  def get_updates(self, loss, params):
    """Builds the list of update ops and stores it in `self.updates`.

    Outside a distribution strategy, an empty `params` yields a single op
    that increments `self.iterations`; otherwise the wrapped optimizer
    computes and applies gradients with `self.iterations` as the global step.
    Under a distribution strategy the global step comes from
    `training_util.get_global_step()` instead.

    Returns:
      The `self.updates` list.
    """
    if not distribution_strategy_context.has_strategy():
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)
    else:
      self.updates = []
      if params:
        grads = self.optimizer.compute_gradients(loss, params)
      else:
        # After the model vars have been created, the second call to
        # get_updates is called with params as an empty list. This ensures
        # that we call compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)

    self.updates.append(opt_update)
    return self.updates

  @property
  def weights(self):
    """Not available on this wrapper; raises `NotImplementedError`."""
    raise NotImplementedError

  def get_config(self):
    """Not supported by this wrapper; raises `NotImplementedError`."""
    raise NotImplementedError

  # NOTE(review): `Optimizer.from_config` is a classmethod, whereas this
  # override is an instance method.
  def from_config(self, config):
    """Not supported by this wrapper; raises `NotImplementedError`."""
    raise NotImplementedError
# Aliases: lowercase module-level names bound to the optimizer classes above.
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
@keras_export('keras.optimizers.serialize')
def serialize(optimizer):
  """Serializes `optimizer` by delegating to `serialize_keras_object`.

  Arguments:
    optimizer: The optimizer instance to serialize.

  Returns:
    Whatever `serialize_keras_object(optimizer)` returns.
  """
  serialized = serialize_keras_object(optimizer)
  return serialized

View File

@ -27,6 +27,7 @@ from tensorflow.python import keras
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.platform import test
@ -109,63 +110,63 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
def test_sgd(self):
  """Runs `_test_optimizer` on a default V1 `SGD`."""
  # V1 classes come from `optimizer_v1`; the stale `keras.optimizers` call
  # was dropped.
  with self.cached_session():
    self._test_optimizer(optimizer_v1.SGD())
def test_momentum(self):
  """Runs `_test_optimizer` on V1 `SGD` with Nesterov momentum 0.9."""
  # V1 classes come from `optimizer_v1`; the stale `keras.optimizers`
  # argument line, which also broke the call syntax, was dropped.
  with self.cached_session():
    self._test_optimizer(
        optimizer_v1.SGD(lr=0.01, momentum=0.9, nesterov=True))
def test_rmsprop(self):
  """Runs `_test_optimizer` on V1 `RMSprop`, default and with decay."""
  # V1 classes come from `optimizer_v1`; the stale `keras.optimizers` calls
  # were dropped.
  with self.cached_session():
    self._test_optimizer(optimizer_v1.RMSprop())
    self._test_optimizer(optimizer_v1.RMSprop(decay=1e-3))
def test_adagrad(self):
  """Runs `_test_optimizer` on V1 `Adagrad`, default and with decay."""
  # V1 classes come from `optimizer_v1`; the stale `keras.optimizers` calls
  # were dropped.
  with self.cached_session():
    self._test_optimizer(optimizer_v1.Adagrad())
    self._test_optimizer(optimizer_v1.Adagrad(decay=1e-3))
def test_adadelta(self):
  """Runs `_test_optimizer` on V1 `Adadelta`, default and with decay."""
  # V1 classes come from `optimizer_v1`; the stale `keras.optimizers` calls
  # were dropped.
  with self.cached_session():
    self._test_optimizer(optimizer_v1.Adadelta(), target=0.6)
    # Accuracy seems dependent on the initialization. Even adding
    # tf.compat.v1.Print nodes in the graph seemed to affect the
    # initialization seed, and hence the accuracy.
    self._test_optimizer(optimizer_v1.Adadelta(decay=1e-3), target=0.4)
def test_adam(self):
  """Runs `_test_optimizer` on V1 `Adam`: default, with decay, and AMSGrad."""
  # V1 classes come from `optimizer_v1`; the stale `keras.optimizers` calls
  # were dropped.
  with self.cached_session():
    self._test_optimizer(optimizer_v1.Adam())
    # Accuracy seems dependent on the seed initialization.
    # TODO(b/121051441): fix test flakiness.
    self._test_optimizer(optimizer_v1.Adam(decay=1e-3), target=0.73)
    self._test_optimizer(optimizer_v1.Adam(amsgrad=True))
def test_adamax(self):
with self.cached_session():
self._test_optimizer(keras.optimizers.Adamax())
self._test_optimizer(keras.optimizers.Adamax(decay=1e-3))
self._test_optimizer(optimizer_v1.Adamax())
self._test_optimizer(optimizer_v1.Adamax(decay=1e-3))
def test_nadam(self):
with self.cached_session():
self._test_optimizer(keras.optimizers.Nadam())
self._test_optimizer(optimizer_v1.Nadam())
def test_clipnorm(self):
with self.cached_session():
self._test_optimizer(
keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
optimizer_v1.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
def test_clipvalue(self):
with self.cached_session():
self._test_optimizer(
keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
optimizer_v1.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
def test_tf_optimizer(self):
if context.executing_eagerly():
self.skipTest(
'v1 optimizer does not run in eager mode')
optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
model = keras.models.Sequential()
model.add(keras.layers.Dense(
2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
@ -194,7 +195,7 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
'v1 optimizer does not run in eager mode')
graph = ops.Graph()
with graph.as_default():
optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
keras.backend.track_tf_optimizer(optimizer)
optimizer_weak = weakref.ref(optimizer)
graph_weak = weakref.ref(graph)
@ -209,7 +210,7 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
self.skipTest(
'v1 optimizer does not run in eager mode')
with self.cached_session():
optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
model = keras.models.Sequential()
model.add(keras.layers.Dense(
2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
@ -229,9 +230,9 @@ class KerasOptimizersTest(keras_parameterized.TestCase):
def test_negative_clipvalue_or_clipnorm(self):
with self.assertRaises(ValueError):
_ = keras.optimizers.SGD(lr=0.01, clipvalue=-0.5)
_ = optimizer_v1.SGD(lr=0.01, clipvalue=-0.5)
with self.assertRaises(ValueError):
_ = keras.optimizers.Adam(clipnorm=-2.0)
_ = optimizer_v1.Adam(clipnorm=-2.0)
def test_mixed_precision_loss_scale_optimizer(self):
if context.executing_eagerly():

View File

@ -26,7 +26,7 @@ import numpy as np
from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.saving import model_config as model_config_lib
from tensorflow.python.keras.saving import saving_utils
from tensorflow.python.keras.saving.saved_model import json_utils
@ -127,7 +127,7 @@ def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True):
# TODO(b/128683857): Add integration tests between tf.keras and external
# Keras, to avoid breaking TF.js users.
if (include_optimizer and model.optimizer and
not isinstance(model.optimizer, optimizers.TFOptimizer)):
not isinstance(model.optimizer, optimizer_v1.TFOptimizer)):
save_optimizer_weights_to_hdf5_group(f, model.optimizer)
f.flush()

View File

@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.keras import combinations
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.engine import training
@ -341,7 +342,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
name='d1'))
ref_model.add(keras.layers.Dense(num_classes, name='d2'))
ref_model.compile(loss=keras.losses.MSE,
optimizer=keras.optimizers.RMSprop(lr=0.0001),
optimizer=optimizer_v1.RMSprop(lr=0.0001),
metrics=[keras.metrics.categorical_accuracy])
f_ref_model = h5py.File(h5_path, 'w')
@ -354,7 +355,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
name='d1'))
model.add(keras.layers.Dense(num_classes, name='d2'))
model.compile(loss=keras.losses.MSE,
optimizer=keras.optimizers.RMSprop(lr=0.0001),
optimizer=optimizer_v1.RMSprop(lr=0.0001),
metrics=[keras.metrics.categorical_accuracy])
with self.assertRaisesRegex(
ValueError, r'Layer #0 \(named "d1"\), weight '
@ -515,7 +516,7 @@ class TestWholeModelSaving(keras_parameterized.TestCase):
with ops.Graph().as_default(), self.cached_session():
# test with custom optimizer, loss
class CustomOp(keras.optimizers.RMSprop):
class CustomOp(optimizer_v1.RMSprop):
pass
def custom_loss(y_true, y_pred):
@ -692,7 +693,7 @@ class TestWholeModelSaving(keras_parameterized.TestCase):
model = keras.Model(inputs, outputs)
model.compile(
loss=keras.losses.MSE,
optimizer=keras.optimizers.Adam(),
optimizer=optimizer_v1.Adam(),
metrics=[
keras.metrics.categorical_accuracy,
keras.metrics.CategoricalAccuracy()
@ -1028,7 +1029,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase):
model = keras.models.Sequential()
model.add(keras.layers.Dense(2, input_shape=(3,)))
model.add(keras.layers.Dense(3))
model.compile(loss='mse', optimizer=optimizers.Adam(), metrics=['acc'])
model.compile(loss='mse', optimizer=optimizer_v1.Adam(), metrics=['acc'])
if not ops.executing_eagerly_outside_functions():
model._make_train_function()
temp_dir = self.get_temp_dir()

View File

@ -25,7 +25,7 @@ import six
from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.keras.saving import model_config
from tensorflow.python.keras.saving import saving_utils
@ -206,7 +206,7 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature):
has_saved_vars = False
if model.optimizer:
if isinstance(model.optimizer, (optimizers.TFOptimizer,
if isinstance(model.optimizer, (optimizer_v1.TFOptimizer,
optimizer_v2.OptimizerV2)):
_export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args)
has_saved_vars = True

View File

@ -31,6 +31,7 @@ from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_spec
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.engine import training as model_lib
from tensorflow.python.keras.optimizer_v2 import adadelta
from tensorflow.python.keras.optimizer_v2 import rmsprop
@ -458,7 +459,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
x = keras.layers.Dense(2)(inputs)
x = keras.layers.Dense(3)(x)
clone = keras.models.Model(inputs, x)
clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001))
clone.train_on_batch(input_arr, target_arr)
keras_saved_model._assert_same_non_optimizer_objects(
@ -487,7 +488,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
x = keras.layers.Dense(4)(x)
x = keras.layers.Dense(3)(x)
clone = keras.models.Model(inputs, x)
clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001))
clone.train_on_batch(input_arr, target_arr)
def testSaveSequentialModelWithoutInputShapes(self):

View File

@ -24,6 +24,7 @@ import six
from tensorflow.python.eager import def_function
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import losses
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.engine import base_layer_utils
from tensorflow.python.keras.utils import generic_utils
@ -161,7 +162,7 @@ def model_metadata(model, include_optimizer=True, require_config=True):
backend=K.backend(),
model_config=model_config)
if model.optimizer and include_optimizer:
if isinstance(model.optimizer, optimizers.TFOptimizer):
if isinstance(model.optimizer, optimizer_v1.TFOptimizer):
logging.warning(
'TensorFlow optimizers do not '
'make it possible to access '

View File

@ -26,6 +26,7 @@ import numpy as np
from tensorflow.python import keras
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.tests import model_architectures
from tensorflow.python.platform import test
@ -62,7 +63,7 @@ class TestModelArchitectures(keras_parameterized.TestCase):
def get_custom_objects(self):
"""Define custom_objects."""
class CustomOpt(keras.optimizers.SGD):
class CustomOpt(optimizer_v1.SGD):
pass
def custom_loss(y_true, y_pred):

View File

@ -24,6 +24,7 @@ from tensorflow.python import keras
from tensorflow.python.eager import context
from tensorflow.python.framework import config
from tensorflow.python.framework import ops
from tensorflow.python.keras import optimizer_v1
from tensorflow.python.keras.utils import multi_gpu_utils
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.platform import test
@ -191,7 +192,7 @@ class TestMultiGPUModel(test.TestCase):
parallel_model.compile(
loss='categorical_crossentropy',
optimizer=keras.optimizers.RMSprop(lr=0.0001, decay=1e-6),
optimizer=optimizer_v1.RMSprop(lr=0.0001, decay=1e-6),
metrics=['accuracy'],
target_tensors=[targets])
parallel_model.fit(epochs=1, steps_per_epoch=3)