From a6b7e4b94ceb3471a268a57741fe1f32a1472119 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 24 Sep 2020 05:30:54 -0700 Subject: [PATCH] Move V1 optimizer code to a separate file optimizer_v1.py from the generic utils in optimizers.py. PiperOrigin-RevId: 333495430 Change-Id: I6bf730dc507f067f79f51b7a5952b50549c7c5a4 --- tensorflow/python/keras/BUILD | 1 + .../distributed_training_utils_v1.py | 4 +- .../keras/distribute/multi_worker_test.py | 6 +- tensorflow/python/keras/engine/training.py | 3 +- tensorflow/python/keras/engine/training_v1.py | 5 +- .../experimental/keras_test.py | 4 +- tensorflow/python/keras/models.py | 6 +- tensorflow/python/keras/models_test.py | 8 +- tensorflow/python/keras/optimizer_v1.py | 839 ++++++++++++++++++ .../python/keras/optimizer_v2/adam_test.py | 4 +- .../keras/optimizer_v2/optimizer_v2_test.py | 22 +- tensorflow/python/keras/optimizers.py | 815 +---------------- tensorflow/python/keras/optimizers_test.py | 43 +- tensorflow/python/keras/saving/hdf5_format.py | 4 +- .../python/keras/saving/hdf5_format_test.py | 11 +- .../keras/saving/saved_model_experimental.py | 4 +- .../saving/saved_model_experimental_test.py | 5 +- .../python/keras/saving/saving_utils.py | 3 +- .../keras/tests/model_architectures_test.py | 3 +- .../keras/utils/multi_gpu_utils_test.py | 3 +- 20 files changed, 915 insertions(+), 878 deletions(-) create mode 100644 tensorflow/python/keras/optimizer_v1.py diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 53ac4c23d01..a194dca9a69 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -287,6 +287,7 @@ py_library( py_library( name = "optimizers", srcs = [ + "optimizer_v1.py", "optimizers.py", ], srcs_version = "PY2AND3", diff --git a/tensorflow/python/keras/distribute/distributed_training_utils_v1.py b/tensorflow/python/keras/distribute/distributed_training_utils_v1.py index 2ffe321ffba..83426016412 100644 --- a/tensorflow/python/keras/distribute/distributed_training_utils_v1.py +++ b/tensorflow/python/keras/distribute/distributed_training_utils_v1.py @@ -36,7 +36,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.keras import backend as K from tensorflow.python.keras import callbacks from tensorflow.python.keras import metrics as metrics_module -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.optimizer_v2 import optimizer_v2 @@ -779,7 +779,7 @@ def _clone_and_build_model(model, mode, inputs=None, targets=None): cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. - if isinstance(model.optimizer, optimizers.TFOptimizer): + if isinstance(model.optimizer, optimizer_v1.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() diff --git a/tensorflow/python/keras/distribute/multi_worker_test.py b/tensorflow/python/keras/distribute/multi_worker_test.py index 43c3f74fed4..54c72004bb3 100644 --- a/tensorflow/python/keras/distribute/multi_worker_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_test.py @@ -42,7 +42,7 @@ from tensorflow.python.keras import backend from tensorflow.python.keras import callbacks from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import models -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.distribute import multi_worker_testing_utils from tensorflow.python.platform import test from tensorflow.python.util import nest @@ -71,11 +71,11 @@ def _clone_and_build_model(model, strategy): cloned_model = models.clone_model(model) # Compile and build model. - if isinstance(model.optimizer, optimizers.TFOptimizer): + if isinstance(model.optimizer, optimizer_v1.TFOptimizer): optimizer = model.optimizer # TODO(yuefengz): figure out why the optimizer here is still a # TFOptimizer. - while isinstance(optimizer, optimizers.TFOptimizer): + while isinstance(optimizer, optimizer_v1.TFOptimizer): optimizer = optimizer.optimizer optimizer = copy.deepcopy(optimizer) else: diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 37057620278..8e6a31a98b5 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -39,6 +39,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import backend from tensorflow.python.keras import callbacks as callbacks_module +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils from tensorflow.python.keras.engine import base_layer @@ -2463,7 +2464,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): def _validate_compile(self, optimizer, metrics, **kwargs): """Performs validation checks for the default `compile`.""" if any( - isinstance(opt, optimizers.Optimizer) + isinstance(opt, optimizer_v1.Optimizer) for opt in nest.flatten(optimizer)): raise ValueError( '`tf.compat.v1.keras` Optimizer (', optimizer, ') is ' diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index fe85e45372e..7dca6ae3da7 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -40,6 +40,7 @@ from tensorflow.python.framework import type_spec from tensorflow.python.keras import backend as K from tensorflow.python.keras import losses from tensorflow.python.keras import metrics as metrics_module +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils from tensorflow.python.keras.distribute import distributed_training_utils_v1 @@ -322,8 +323,8 @@ class Model(training_lib.Model): self._set_optimizer(optimizer) is_any_keras_optimizer_v1 = any( - (isinstance(opt, optimizers.Optimizer) - and not isinstance(opt, optimizers.TFOptimizer) + (isinstance(opt, optimizer_v1.Optimizer) + and not isinstance(opt, optimizer_v1.TFOptimizer) ) for opt in nest.flatten(self.optimizer)) if is_any_keras_optimizer_v1 and ops.executing_eagerly_outside_functions(): diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py index 572bdbbce8a..8eafe725514 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py @@ -37,7 +37,7 @@ from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import layers from tensorflow.python.keras import models -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils @@ -854,7 +854,7 @@ class KerasModelTest(keras_parameterized.TestCase): else: error_msg = 'optimizer" must be an instance of ' with self.assertRaisesRegex(ValueError, error_msg): - model.compile(optimizers.SGD(1.), 'mse') + model.compile(optimizer_v1.SGD(1.), 'mse') @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_functional_model_loss_dtype(self): diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index 76324621a8b..b3737b5c2c4 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -22,7 +22,7 @@ from __future__ import print_function from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K from tensorflow.python.keras import metrics as metrics_module -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.engine import training @@ -682,8 +682,8 @@ def clone_and_build_model( clone._set_inputs(input_tensors) if compile_clone: - if isinstance(orig_optimizer, optimizers.TFOptimizer): - optimizer = optimizers.TFOptimizer( + if isinstance(orig_optimizer, optimizer_v1.TFOptimizer): + optimizer = optimizer_v1.TFOptimizer( orig_optimizer.optimizer, optimizer_iterations) K.track_tf_optimizer(optimizer) else: diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index 8411ed0d3ea..854a0cabd3e 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -32,6 +32,7 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import metrics from tensorflow.python.keras import models +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import testing_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -420,10 +421,9 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase): """Assert that two models have the same compile parameters.""" self.assertEqual('mse', model.loss) - self.assertTrue( - isinstance(model.optimizer, - (keras.optimizers.RMSprop, - keras.optimizer_v2.rmsprop.RMSprop))) + self.assertIsInstance( + model.optimizer, + (optimizer_v1.RMSprop, keras.optimizer_v2.rmsprop.RMSprop)) def _clone_and_build_test_helper(self, model, model_type): inp = np.random.random((10, 4)) diff --git a/tensorflow/python/keras/optimizer_v1.py b/tensorflow/python/keras/optimizer_v1.py new file mode 100644 index 00000000000..24cb0aaecff --- /dev/null +++ b/tensorflow/python/keras/optimizer_v1.py @@ -0,0 +1,839 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# pylint: disable=invalid-name +# pylint: disable=g-classes-have-attributes +"""Legacy v1 optimizer classes. + +For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from six.moves import zip # pylint: disable=redefined-builtin + +from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.eager import backprop +from tensorflow.python.framework import ops +from tensorflow.python.keras import backend as K +from tensorflow.python.ops import clip_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import training_util +from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.util import nest + + +class Optimizer(object): + """Abstract optimizer base class. + + Note: this is the parent class of all optimizers, not an actual optimizer + that can be used for training models. + + All Keras optimizers support the following keyword arguments: + + clipnorm: float >= 0. Gradients will be clipped + when their L2 norm exceeds this value. + clipvalue: float >= 0. Gradients will be clipped + when their absolute value exceeds this value. + """ + + def __init__(self, **kwargs): + allowed_kwargs = {'clipnorm', 'clipvalue'} + for k in kwargs: + if k not in allowed_kwargs: + raise TypeError('Unexpected keyword argument ' + 'passed to optimizer: ' + str(k)) + # checks that clipnorm >= 0 and clipvalue >= 0 + if kwargs[k] < 0: + raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k])) + self.__dict__.update(kwargs) + self.updates = [] + self.weights = [] + + # Set this to False, indicating `apply_gradients` does not take the + # `experimental_aggregate_gradients` argument. + _HAS_AGGREGATE_GRAD = False + + def _create_all_weights(self, params): + """Creates and sets all optimizer weights. + + Args: + params: list or tuple of `Variable` objects that will be minimized + using this optimizer. + + Returns: + Specific weight values that are used in `get_updates` + """ + raise NotImplementedError + + def get_updates(self, loss, params): + raise NotImplementedError + + def get_gradients(self, loss, params): + """Returns gradients of `loss` with respect to `params`. + + Arguments: + loss: Loss tensor. + params: List of variables. + + Returns: + List of gradient tensors. + + Raises: + ValueError: In case any gradient cannot be computed (e.g. if gradient + function not implemented). + """ + grads = K.gradients(loss, params) + if any(g is None for g in grads): + raise ValueError('An operation has `None` for gradient. ' + 'Please make sure that all of your ops have a ' + 'gradient defined (i.e. are differentiable). ' + 'Common ops without gradient: ' + 'K.argmax, K.round, K.eval.') + if hasattr(self, 'clipnorm'): + grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads] + if hasattr(self, 'clipvalue'): + grads = [ + clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue) + for g in grads + ] + return grads + + def set_weights(self, weights): + """Sets the weights of the optimizer, from Numpy arrays. + + Should only be called after computing the gradients + (otherwise the optimizer has no weights). + + Arguments: + weights: a list of Numpy arrays. The number of arrays and their shape + must match number of the dimensions of the weights of the optimizer + (i.e. it should match the output of `get_weights`). + + Raises: + ValueError: in case of incompatible weight shapes. + """ + params = self.weights + if len(params) != len(weights): + raise ValueError('Length of the specified weight list (' + + str(len(weights)) + + ') does not match the number of weights ' + 'of the optimizer (' + str(len(params)) + ')') + weight_value_tuples = [] + param_values = K.batch_get_value(params) + for pv, p, w in zip(param_values, params, weights): + if pv.shape != w.shape: + raise ValueError('Optimizer weight shape ' + str(pv.shape) + + ' not compatible with ' + 'provided weight shape ' + str(w.shape)) + weight_value_tuples.append((p, w)) + K.batch_set_value(weight_value_tuples) + + def get_weights(self): + """Returns the current value of the weights of the optimizer. + + Returns: + A list of numpy arrays. + """ + return K.batch_get_value(self.weights) + + def get_config(self): + config = {} + if hasattr(self, 'clipnorm'): + config['clipnorm'] = self.clipnorm + if hasattr(self, 'clipvalue'): + config['clipvalue'] = self.clipvalue + return config + + @classmethod + def from_config(cls, config): + return cls(**config) + + +class SGD(Optimizer): + """Stochastic gradient descent optimizer. + + Includes support for momentum, + learning rate decay, and Nesterov momentum. + + Arguments: + lr: float >= 0. Learning rate. + momentum: float >= 0. Parameter that accelerates SGD in the relevant + direction and dampens oscillations. + decay: float >= 0. Learning rate decay over each update. + nesterov: boolean. Whether to apply Nesterov momentum. + """ + + def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs): + super(SGD, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.iterations = K.variable(0, dtype='int64', name='iterations') + self.lr = K.variable(lr, name='lr') + self.momentum = K.variable(momentum, name='momentum') + self.decay = K.variable(decay, name='decay') + self.initial_decay = decay + self.nesterov = nesterov + + def _create_all_weights(self, params): + shapes = [K.int_shape(p) for p in params] + moments = [K.zeros(shape) for shape in shapes] + self.weights = [self.iterations] + moments + return moments + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [state_ops.assign_add(self.iterations, 1)] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( # pylint: disable=g-no-augmented-assignment + 1. / + (1. + + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) + # momentum + moments = self._create_all_weights(params) + for p, g, m in zip(params, grads, moments): + v = self.momentum * m - lr * g # velocity + self.updates.append(state_ops.assign(m, v)) + + if self.nesterov: + new_p = p + self.momentum * v - lr * g + else: + new_p = p + v + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'momentum': float(K.get_value(self.momentum)), + 'decay': float(K.get_value(self.decay)), + 'nesterov': self.nesterov + } + base_config = super(SGD, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class RMSprop(Optimizer): + """RMSProp optimizer. + + It is recommended to leave the parameters of this optimizer + at their default values + (except the learning rate, which can be freely tuned). + + Arguments: + lr: float >= 0. Learning rate. + rho: float >= 0. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + """ + + def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs): + super(RMSprop, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.lr = K.variable(lr, name='lr') + self.rho = K.variable(rho, name='rho') + self.decay = K.variable(decay, name='decay') + self.iterations = K.variable(0, dtype='int64', name='iterations') + if epsilon is None: + epsilon = K.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] + self.weights = accumulators + return accumulators + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + accumulators = self._create_all_weights(params) + self.updates = [state_ops.assign_add(self.iterations, 1)] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( # pylint: disable=g-no-augmented-assignment + 1. / + (1. + + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) + + for p, g, a in zip(params, grads, accumulators): + # update accumulator + new_a = self.rho * a + (1. - self.rho) * math_ops.square(g) + self.updates.append(state_ops.assign(a, new_a)) + new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon) + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'rho': float(K.get_value(self.rho)), + 'decay': float(K.get_value(self.decay)), + 'epsilon': self.epsilon + } + base_config = super(RMSprop, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Adagrad(Optimizer): + """Adagrad optimizer. + + Adagrad is an optimizer with parameter-specific learning rates, + which are adapted relative to how frequently a parameter gets + updated during training. The more updates a parameter receives, + the smaller the updates. + + It is recommended to leave the parameters of this optimizer + at their default values. + + # Arguments + lr: float >= 0. Initial learning rate. + epsilon: float >= 0. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + + # References + - [Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + """ + + def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs): + super(Adagrad, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.lr = K.variable(lr, name='lr') + self.decay = K.variable(decay, name='decay') + self.iterations = K.variable(0, dtype='int64', name='iterations') + if epsilon is None: + epsilon = K.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + shapes = [K.int_shape(p) for p in params] + accumulators = [K.zeros(shape) for shape in shapes] + self.weights = accumulators + return accumulators + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + accumulators = self._create_all_weights(params) + + self.updates = [state_ops.assign_add(self.iterations, 1)] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( # pylint: disable=g-no-augmented-assignment + 1. / + (1. + + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) + + for p, g, a in zip(params, grads, accumulators): + new_a = a + math_ops.square(g) # update accumulator + self.updates.append(state_ops.assign(a, new_a)) + new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon) + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'decay': float(K.get_value(self.decay)), + 'epsilon': self.epsilon + } + base_config = super(Adagrad, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Adadelta(Optimizer): + """Adadelta optimizer. + + Adadelta is a more robust extension of Adagrad + that adapts learning rates based on a moving window of gradient updates, + instead of accumulating all past gradients. This way, Adadelta continues + learning even when many updates have been done. Compared to Adagrad, in the + original version of Adadelta you don't have to set an initial learning + rate. In this version, initial learning rate and decay factor can + be set, as in most other Keras optimizers. + + It is recommended to leave the parameters of this optimizer + at their default values. + + # Arguments + lr: float >= 0. Initial learning rate, defaults to 1. + It is recommended to leave it at the default value. + rho: float >= 0. Adadelta decay factor, corresponding to fraction of + gradient to keep at each time step. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Initial learning rate decay. + + # References + - [Adadelta - an adaptive learning rate + method](http://arxiv.org/abs/1212.5701) + """ + + def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs): + super(Adadelta, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.lr = K.variable(lr, name='lr') + self.decay = K.variable(decay, name='decay') + self.iterations = K.variable(0, dtype='int64', name='iterations') + if epsilon is None: + epsilon = K.epsilon() + self.rho = rho + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + shapes = [K.int_shape(p) for p in params] + accumulators = [K.zeros(shape) for shape in shapes] + delta_accumulators = [K.zeros(shape) for shape in shapes] + self.weights = accumulators + delta_accumulators + return accumulators, delta_accumulators + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [state_ops.assign_add(self.iterations, 1)] + accumulators, delta_accumulators = self._create_all_weights(params) + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( # pylint: disable=g-no-augmented-assignment + 1. / + (1. + + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) + + for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): + # update accumulator + new_a = self.rho * a + (1. - self.rho) * math_ops.square(g) + self.updates.append(state_ops.assign(a, new_a)) + + # use the new accumulator and the *old* delta_accumulator + update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon) + new_p = p - lr * update + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + + # update delta_accumulator + new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update) + self.updates.append(state_ops.assign(d_a, new_d_a)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'rho': self.rho, + 'decay': float(K.get_value(self.decay)), + 'epsilon': self.epsilon + } + base_config = super(Adadelta, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Adam(Optimizer): + """Adam optimizer. + + Default parameters follow those provided in the original paper. + + Arguments: + lr: float >= 0. Learning rate. + beta_1: float, 0 < beta < 1. Generally close to 1. + beta_2: float, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm + from the paper "On the Convergence of Adam and Beyond". + """ + + def __init__(self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=None, + decay=0., + amsgrad=False, + **kwargs): + super(Adam, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.iterations = K.variable(0, dtype='int64', name='iterations') + self.lr = K.variable(lr, name='lr') + self.beta_1 = K.variable(beta_1, name='beta_1') + self.beta_2 = K.variable(beta_2, name='beta_2') + self.decay = K.variable(decay, name='decay') + if epsilon is None: + epsilon = K.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + self.amsgrad = amsgrad + + def _create_all_weights(self, params): + ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] + vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] + if self.amsgrad: + vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] + else: + vhats = [K.zeros(1) for _ in params] + self.weights = [self.iterations] + ms + vs + vhats + return ms, vs, vhats + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( # pylint: disable=g-no-augmented-assignment + 1. / + (1. + + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) + + with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]): + t = math_ops.cast(self.iterations, K.floatx()) + lr_t = lr * ( + K.sqrt(1. - math_ops.pow(self.beta_2, t)) / + (1. - math_ops.pow(self.beta_1, t))) + + ms, vs, vhats = self._create_all_weights(params) + for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): + m_t = (self.beta_1 * m) + (1. - self.beta_1) * g + v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g) + if self.amsgrad: + vhat_t = math_ops.maximum(vhat, v_t) + p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon) + self.updates.append(state_ops.assign(vhat, vhat_t)) + else: + p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) + + self.updates.append(state_ops.assign(m, m_t)) + self.updates.append(state_ops.assign(v, v_t)) + new_p = p_t + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'beta_1': float(K.get_value(self.beta_1)), + 'beta_2': float(K.get_value(self.beta_2)), + 'decay': float(K.get_value(self.decay)), + 'epsilon': self.epsilon, + 'amsgrad': self.amsgrad + } + base_config = super(Adam, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Adamax(Optimizer): + """Adamax optimizer from Adam paper's Section 7. + + It is a variant of Adam based on the infinity norm. + Default parameters follow those provided in the paper. + + Arguments: + lr: float >= 0. Learning rate. + beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + decay: float >= 0. Learning rate decay over each update. + """ + + def __init__(self, + lr=0.002, + beta_1=0.9, + beta_2=0.999, + epsilon=None, + decay=0., + **kwargs): + super(Adamax, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.iterations = K.variable(0, dtype='int64', name='iterations') + self.lr = K.variable(lr, name='lr') + self.beta_1 = K.variable(beta_1, name='beta_1') + self.beta_2 = K.variable(beta_2, name='beta_2') + self.decay = K.variable(decay, name='decay') + if epsilon is None: + epsilon = K.epsilon() + self.epsilon = epsilon + self.initial_decay = decay + + def _create_all_weights(self, params): + + shapes = [K.int_shape(p) for p in params] + # zero init of 1st moment + ms = [K.zeros(shape) for shape in shapes] + # zero init of exponentially weighted infinity norm + us = [K.zeros(shape) for shape in shapes] + self.weights = [self.iterations] + ms + us + return ms, us + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [] + + lr = self.lr + if self.initial_decay > 0: + lr = lr * ( # pylint: disable=g-no-augmented-assignment + 1. / + (1. + + self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) + + with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]): + t = math_ops.cast(self.iterations, K.floatx()) + lr_t = lr / (1. - math_ops.pow(self.beta_1, t)) + + ms, us = self._create_all_weights(params) + + for p, g, m, u in zip(params, grads, ms, us): + + m_t = (self.beta_1 * m) + (1. - self.beta_1) * g + u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g)) + p_t = p - lr_t * m_t / (u_t + self.epsilon) + + self.updates.append(state_ops.assign(m, m_t)) + self.updates.append(state_ops.assign(u, u_t)) + new_p = p_t + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'beta_1': float(K.get_value(self.beta_1)), + 'beta_2': float(K.get_value(self.beta_2)), + 'decay': float(K.get_value(self.decay)), + 'epsilon': self.epsilon + } + base_config = super(Adamax, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Nadam(Optimizer): + """Nesterov Adam optimizer. + + Much like Adam is essentially RMSprop with momentum, + Nadam is Adam RMSprop with Nesterov momentum. + + Default parameters follow those provided in the paper. + It is recommended to leave the parameters of this optimizer + at their default values. + + Arguments: + lr: float >= 0. Learning rate. + beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. + epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. + """ + + def __init__(self, + lr=0.002, + beta_1=0.9, + beta_2=0.999, + epsilon=None, + schedule_decay=0.004, + **kwargs): + super(Nadam, self).__init__(**kwargs) + with K.name_scope(self.__class__.__name__): + self.iterations = K.variable(0, dtype='int64', name='iterations') + self.m_schedule = K.variable(1., name='m_schedule') + self.lr = K.variable(lr, name='lr') + self.beta_1 = K.variable(beta_1, name='beta_1') + self.beta_2 = K.variable(beta_2, name='beta_2') + if epsilon is None: + epsilon = K.epsilon() + self.epsilon = epsilon + self.schedule_decay = schedule_decay + + def _create_all_weights(self, params): + shapes = [K.int_shape(p) for p in params] + ms = [K.zeros(shape) for shape in shapes] + vs = [K.zeros(shape) for shape in shapes] + + self.weights = [self.iterations, self.m_schedule] + ms + vs + return ms, vs + + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [] + + with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]): + t = math_ops.cast(self.iterations, K.floatx()) + + # Due to the recommendations in [2], i.e. warming momentum schedule + momentum_cache_t = self.beta_1 * ( + 1. - 0.5 * + (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay))) + momentum_cache_t_1 = self.beta_1 * ( + 1. - 0.5 * + (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay))) + m_schedule_new = self.m_schedule * momentum_cache_t + m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1 + self.updates.append((self.m_schedule, m_schedule_new)) + + ms, vs = self._create_all_weights(params) + + for p, g, m, v in zip(params, grads, ms, vs): + # the following equations given in [1] + g_prime = g / (1. - m_schedule_new) + m_t = self.beta_1 * m + (1. - self.beta_1) * g + m_t_prime = m_t / (1. - m_schedule_next) + v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g) + v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t)) + m_t_bar = (1. - + momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime + + self.updates.append(state_ops.assign(m, m_t)) + self.updates.append(state_ops.assign(v, v_t)) + + p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon) + new_p = p_t + + # Apply constraints. + if getattr(p, 'constraint', None) is not None: + new_p = p.constraint(new_p) + + self.updates.append(state_ops.assign(p, new_p)) + return self.updates + + def get_config(self): + config = { + 'lr': float(K.get_value(self.lr)), + 'beta_1': float(K.get_value(self.beta_1)), + 'beta_2': float(K.get_value(self.beta_2)), + 'epsilon': self.epsilon, + 'schedule_decay': self.schedule_decay + } + base_config = super(Nadam, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class TFOptimizer(Optimizer, trackable.Trackable): + """Wrapper class for native TensorFlow optimizers.""" + + def __init__(self, optimizer, iterations=None): # pylint: disable=super-init-not-called + self.optimizer = optimizer + self._track_trackable(optimizer, name='optimizer') + if iterations is None: + with K.name_scope(self.__class__.__name__): + self.iterations = K.variable(0, dtype='int64', name='iterations') + else: + self.iterations = iterations + self._track_trackable(self.iterations, name='global_step') + + def _clip_gradients(self, grads): + """Clip gradients according to the clipnorm and clipvalue attributes.""" + # TFOptimizer wrapper has no gradient clipping options. + return grads + + def minimize(self, loss, var_list, grad_loss=None, tape=None): + """Mimics the `OptimizerV2.minimize` API.""" + if not callable(loss) and tape is None: + raise ValueError('`tape` is required when a `Tensor` loss is passed.') + tape = tape if tape is not None else backprop.GradientTape() + + if callable(loss): + with tape: + if not callable(var_list): + tape.watch(var_list) + loss = loss() + if callable(var_list): + var_list = var_list() + + var_list = nest.flatten(var_list) + if var_list: + grads = tape.gradient(loss, var_list, grad_loss) + grads_and_vars = list(zip(grads, var_list)) + self.apply_gradients(grads_and_vars) + + def apply_gradients(self, grads_and_vars): + self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations) + + def get_grads(self, loss, params): + return self.optimizer.compute_gradients(loss, params) + + def get_updates(self, loss, params): + if distribution_strategy_context.has_strategy(): + self.updates = [] + + if not params: + # After the model vars have been created, the second call to get_updates + # is called with params as an empty list. This ensures that we call + # compute_gradients with params=None. + grads = self.optimizer.compute_gradients(loss) + else: + grads = self.optimizer.compute_gradients(loss, params) + global_step = training_util.get_global_step() + opt_update = self.optimizer.apply_gradients(grads, global_step) + else: + if not params: + self.updates = [state_ops.assign_add(self.iterations, 1)] + return self.updates + + # Updates list starts out empty because the iterations variable is + # incremented in optimizer.apply_gradients() + self.updates = [] + grads = self.optimizer.compute_gradients(loss, params) + opt_update = self.optimizer.apply_gradients( + grads, global_step=self.iterations) + + self.updates.append(opt_update) + return self.updates + + @property + def weights(self): + raise NotImplementedError + + def get_config(self): + raise NotImplementedError + + def from_config(self, config): + raise NotImplementedError + + +# Aliases. + +sgd = SGD +rmsprop = RMSprop +adagrad = Adagrad +adadelta = Adadelta +adam = Adam +adamax = Adamax +nadam = Nadam diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py index 61b639456c8..9cf58177446 100644 --- a/tensorflow/python/keras/optimizer_v2/adam_test.py +++ b/tensorflow/python/keras/optimizer_v2/adam_test.py @@ -26,7 +26,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.keras import combinations -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.optimizer_v2 import adam from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule from tensorflow.python.ops import array_ops @@ -537,7 +537,7 @@ class AdamOptimizerTest(test.TestCase, parameterized.TestCase): self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations)) def testSetWeightsFromV1AdamWithoutMinimize(self): - keras_v1_adam = optimizers.Adam() + keras_v1_adam = optimizer_v1.Adam() keras_v2_adam = adam.Adam() keras_v2_adam.set_weights(keras_v1_adam.get_weights()) keras_v1_iteration = keras_v1_adam.iterations diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py index b4061424d1d..9a8946e34cc 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py @@ -35,7 +35,7 @@ from tensorflow.python.keras import callbacks from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import losses -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import input_layer from tensorflow.python.keras.engine import sequential @@ -739,42 +739,42 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): rtol=1e-5, atol=1e-5) def testAdadeltaCompatibility(self): - opt_v1 = optimizers.Adadelta(lr=0.01) + opt_v1 = optimizer_v1.Adadelta(lr=0.01) opt_v2 = adadelta.Adadelta(learning_rate=0.01) self._testOptimizersCompatibility(opt_v1, opt_v2) def testAdagradCompatibility(self): - opt_v1 = optimizers.Adagrad(lr=0.01) + opt_v1 = optimizer_v1.Adagrad(lr=0.01) opt_v2 = adagrad.Adagrad(learning_rate=0.01) self._testOptimizersCompatibility(opt_v1, opt_v2) def testAdamCompatibility(self): - opt_v1 = optimizers.Adam() + opt_v1 = optimizer_v1.Adam() opt_v2 = adam.Adam() self._testOptimizersCompatibility(opt_v1, opt_v2) def testAdamaxCompatibility(self): - opt_v1 = optimizers.Adamax(lr=0.01) + opt_v1 = optimizer_v1.Adamax(lr=0.01) opt_v2 = adamax.Adamax(learning_rate=0.01) self._testOptimizersCompatibility(opt_v1, opt_v2) def testNadamCompatibility(self): - opt_v1 = optimizers.Nadam(lr=0.001) + opt_v1 = optimizer_v1.Nadam(lr=0.001) opt_v2 = nadam.Nadam(learning_rate=0.001) self._testOptimizersCompatibility(opt_v1, opt_v2) def testMomentumCompatibility(self): - opt_v1 = optimizers.SGD(lr=0.01, momentum=0.9) + opt_v1 = optimizer_v1.SGD(lr=0.01, momentum=0.9) opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9) self._testOptimizersCompatibility(opt_v1, opt_v2) def testRMSpropCompatibility(self): - opt_v1 = optimizers.RMSprop() + opt_v1 = optimizer_v1.RMSprop() opt_v2 = rmsprop.RMSprop() self._testOptimizersCompatibility(opt_v1, opt_v2) def testSGDCompatibility(self): - opt_v1 = optimizers.SGD(lr=0.01) + opt_v1 = optimizer_v1.SGD(lr=0.01) opt_v2 = gradient_descent.SGD(learning_rate=0.01) self._testOptimizersCompatibility(opt_v1, opt_v2, False) @@ -804,7 +804,7 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) model_tf.set_weights(model_k_v2.get_weights()) - opt_k_v1 = optimizers.SGD(momentum=0.9, nesterov=True) + opt_k_v1 = optimizer_v1.SGD(momentum=0.9, nesterov=True) opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True) opt_tf = momentum.MomentumOptimizer( learning_rate=0.01, momentum=0.9, use_nesterov=True) @@ -858,7 +858,7 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) model_k_v2.set_weights(model_k_v1.get_weights()) - opt_k_v1 = optimizers.Adam(amsgrad=True) + opt_k_v1 = optimizer_v1.Adam(amsgrad=True) opt_k_v2 = adam.Adam(amsgrad=True) model_k_v1.compile( diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py index 88aede96692..e1f2ca80cf9 100644 --- a/tensorflow/python/keras/optimizers.py +++ b/tensorflow/python/keras/optimizers.py @@ -22,12 +22,10 @@ from __future__ import division from __future__ import print_function import six -from six.moves import zip # pylint: disable=redefined-builtin -from tensorflow.python.distribute import distribution_strategy_context -from tensorflow.python.eager import backprop -from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K +from tensorflow.python.keras.optimizer_v1 import Optimizer +from tensorflow.python.keras.optimizer_v1 import TFOptimizer from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2 from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2 from tensorflow.python.keras.optimizer_v2 import adam as adam_v2 @@ -39,819 +37,10 @@ from tensorflow.python.keras.optimizer_v2 import optimizer_v2 from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object -from tensorflow.python.ops import clip_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import state_ops from tensorflow.python.training import optimizer as tf_optimizer_module -from tensorflow.python.training import training_util -from tensorflow.python.training.tracking import base as trackable -from tensorflow.python.util import nest from tensorflow.python.util.tf_export import keras_export -class Optimizer(object): - """Abstract optimizer base class. - - Note: this is the parent class of all optimizers, not an actual optimizer - that can be used for training models. - - All Keras optimizers support the following keyword arguments: - - clipnorm: float >= 0. Gradients will be clipped - when their L2 norm exceeds this value. - clipvalue: float >= 0. Gradients will be clipped - when their absolute value exceeds this value. - """ - - def __init__(self, **kwargs): - allowed_kwargs = {'clipnorm', 'clipvalue'} - for k in kwargs: - if k not in allowed_kwargs: - raise TypeError('Unexpected keyword argument ' - 'passed to optimizer: ' + str(k)) - # checks that clipnorm >= 0 and clipvalue >= 0 - if kwargs[k] < 0: - raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k])) - self.__dict__.update(kwargs) - self.updates = [] - self.weights = [] - - # Set this to False, indicating `apply_gradients` does not take the - # `experimental_aggregate_gradients` argument. - _HAS_AGGREGATE_GRAD = False - - def _create_all_weights(self, params): - """Creates and sets all optimizer weights. - - Args: - params: list or tuple of `Variable` objects that will be minimized - using this optimizer. - - Returns: - Specific weight values that are used in `get_updates` - """ - raise NotImplementedError - - def get_updates(self, loss, params): - raise NotImplementedError - - def get_gradients(self, loss, params): - """Returns gradients of `loss` with respect to `params`. - - Arguments: - loss: Loss tensor. - params: List of variables. - - Returns: - List of gradient tensors. - - Raises: - ValueError: In case any gradient cannot be computed (e.g. if gradient - function not implemented). - """ - grads = K.gradients(loss, params) - if any(g is None for g in grads): - raise ValueError('An operation has `None` for gradient. ' - 'Please make sure that all of your ops have a ' - 'gradient defined (i.e. are differentiable). ' - 'Common ops without gradient: ' - 'K.argmax, K.round, K.eval.') - if hasattr(self, 'clipnorm'): - grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads] - if hasattr(self, 'clipvalue'): - grads = [ - clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue) - for g in grads - ] - return grads - - def set_weights(self, weights): - """Sets the weights of the optimizer, from Numpy arrays. - - Should only be called after computing the gradients - (otherwise the optimizer has no weights). - - Arguments: - weights: a list of Numpy arrays. The number of arrays and their shape - must match number of the dimensions of the weights of the optimizer - (i.e. it should match the output of `get_weights`). - - Raises: - ValueError: in case of incompatible weight shapes. - """ - params = self.weights - if len(params) != len(weights): - raise ValueError('Length of the specified weight list (' + - str(len(weights)) + - ') does not match the number of weights ' - 'of the optimizer (' + str(len(params)) + ')') - weight_value_tuples = [] - param_values = K.batch_get_value(params) - for pv, p, w in zip(param_values, params, weights): - if pv.shape != w.shape: - raise ValueError('Optimizer weight shape ' + str(pv.shape) + - ' not compatible with ' - 'provided weight shape ' + str(w.shape)) - weight_value_tuples.append((p, w)) - K.batch_set_value(weight_value_tuples) - - def get_weights(self): - """Returns the current value of the weights of the optimizer. - - Returns: - A list of numpy arrays. - """ - return K.batch_get_value(self.weights) - - def get_config(self): - config = {} - if hasattr(self, 'clipnorm'): - config['clipnorm'] = self.clipnorm - if hasattr(self, 'clipvalue'): - config['clipvalue'] = self.clipvalue - return config - - @classmethod - def from_config(cls, config): - return cls(**config) - - -class SGD(Optimizer): - """Stochastic gradient descent optimizer. - - Includes support for momentum, - learning rate decay, and Nesterov momentum. - - Arguments: - lr: float >= 0. Learning rate. - momentum: float >= 0. Parameter that accelerates SGD in the relevant - direction and dampens oscillations. - decay: float >= 0. Learning rate decay over each update. - nesterov: boolean. Whether to apply Nesterov momentum. - """ - - def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs): - super(SGD, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') - self.lr = K.variable(lr, name='lr') - self.momentum = K.variable(momentum, name='momentum') - self.decay = K.variable(decay, name='decay') - self.initial_decay = decay - self.nesterov = nesterov - - def _create_all_weights(self, params): - shapes = [K.int_shape(p) for p in params] - moments = [K.zeros(shape) for shape in shapes] - self.weights = [self.iterations] + moments - return moments - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [state_ops.assign_add(self.iterations, 1)] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( # pylint: disable=g-no-augmented-assignment - 1. / - (1. + - self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) - # momentum - moments = self._create_all_weights(params) - for p, g, m in zip(params, grads, moments): - v = self.momentum * m - lr * g # velocity - self.updates.append(state_ops.assign(m, v)) - - if self.nesterov: - new_p = p + self.momentum * v - lr * g - else: - new_p = p + v - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'momentum': float(K.get_value(self.momentum)), - 'decay': float(K.get_value(self.decay)), - 'nesterov': self.nesterov - } - base_config = super(SGD, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class RMSprop(Optimizer): - """RMSProp optimizer. - - It is recommended to leave the parameters of this optimizer - at their default values - (except the learning rate, which can be freely tuned). - - Arguments: - lr: float >= 0. Learning rate. - rho: float >= 0. - epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. - decay: float >= 0. Learning rate decay over each update. - """ - - def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs): - super(RMSprop, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.lr = K.variable(lr, name='lr') - self.rho = K.variable(rho, name='rho') - self.decay = K.variable(decay, name='decay') - self.iterations = K.variable(0, dtype='int64', name='iterations') - if epsilon is None: - epsilon = K.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] - self.weights = accumulators - return accumulators - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - accumulators = self._create_all_weights(params) - self.updates = [state_ops.assign_add(self.iterations, 1)] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( # pylint: disable=g-no-augmented-assignment - 1. / - (1. + - self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) - - for p, g, a in zip(params, grads, accumulators): - # update accumulator - new_a = self.rho * a + (1. - self.rho) * math_ops.square(g) - self.updates.append(state_ops.assign(a, new_a)) - new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon) - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'rho': float(K.get_value(self.rho)), - 'decay': float(K.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super(RMSprop, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Adagrad(Optimizer): - """Adagrad optimizer. - - Adagrad is an optimizer with parameter-specific learning rates, - which are adapted relative to how frequently a parameter gets - updated during training. The more updates a parameter receives, - the smaller the updates. - - It is recommended to leave the parameters of this optimizer - at their default values. - - # Arguments - lr: float >= 0. Initial learning rate. - epsilon: float >= 0. If `None`, defaults to `K.epsilon()`. - decay: float >= 0. Learning rate decay over each update. - - # References - - [Adaptive Subgradient Methods for Online Learning and Stochastic - Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - """ - - def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs): - super(Adagrad, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.lr = K.variable(lr, name='lr') - self.decay = K.variable(decay, name='decay') - self.iterations = K.variable(0, dtype='int64', name='iterations') - if epsilon is None: - epsilon = K.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - shapes = [K.int_shape(p) for p in params] - accumulators = [K.zeros(shape) for shape in shapes] - self.weights = accumulators - return accumulators - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - accumulators = self._create_all_weights(params) - - self.updates = [state_ops.assign_add(self.iterations, 1)] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( # pylint: disable=g-no-augmented-assignment - 1. / - (1. + - self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) - - for p, g, a in zip(params, grads, accumulators): - new_a = a + math_ops.square(g) # update accumulator - self.updates.append(state_ops.assign(a, new_a)) - new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon) - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'decay': float(K.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super(Adagrad, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Adadelta(Optimizer): - """Adadelta optimizer. - - Adadelta is a more robust extension of Adagrad - that adapts learning rates based on a moving window of gradient updates, - instead of accumulating all past gradients. This way, Adadelta continues - learning even when many updates have been done. Compared to Adagrad, in the - original version of Adadelta you don't have to set an initial learning - rate. In this version, initial learning rate and decay factor can - be set, as in most other Keras optimizers. - - It is recommended to leave the parameters of this optimizer - at their default values. - - # Arguments - lr: float >= 0. Initial learning rate, defaults to 1. - It is recommended to leave it at the default value. - rho: float >= 0. Adadelta decay factor, corresponding to fraction of - gradient to keep at each time step. - epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. - decay: float >= 0. Initial learning rate decay. - - # References - - [Adadelta - an adaptive learning rate - method](http://arxiv.org/abs/1212.5701) - """ - - def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs): - super(Adadelta, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.lr = K.variable(lr, name='lr') - self.decay = K.variable(decay, name='decay') - self.iterations = K.variable(0, dtype='int64', name='iterations') - if epsilon is None: - epsilon = K.epsilon() - self.rho = rho - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - shapes = [K.int_shape(p) for p in params] - accumulators = [K.zeros(shape) for shape in shapes] - delta_accumulators = [K.zeros(shape) for shape in shapes] - self.weights = accumulators + delta_accumulators - return accumulators, delta_accumulators - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [state_ops.assign_add(self.iterations, 1)] - accumulators, delta_accumulators = self._create_all_weights(params) - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( # pylint: disable=g-no-augmented-assignment - 1. / - (1. + - self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) - - for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): - # update accumulator - new_a = self.rho * a + (1. - self.rho) * math_ops.square(g) - self.updates.append(state_ops.assign(a, new_a)) - - # use the new accumulator and the *old* delta_accumulator - update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon) - new_p = p - lr * update - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - - # update delta_accumulator - new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update) - self.updates.append(state_ops.assign(d_a, new_d_a)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'rho': self.rho, - 'decay': float(K.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super(Adadelta, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Adam(Optimizer): - """Adam optimizer. - - Default parameters follow those provided in the original paper. - - Arguments: - lr: float >= 0. Learning rate. - beta_1: float, 0 < beta < 1. Generally close to 1. - beta_2: float, 0 < beta < 1. Generally close to 1. - epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. - decay: float >= 0. Learning rate decay over each update. - amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm - from the paper "On the Convergence of Adam and Beyond". - """ - - def __init__(self, - lr=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=None, - decay=0., - amsgrad=False, - **kwargs): - super(Adam, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') - self.lr = K.variable(lr, name='lr') - self.beta_1 = K.variable(beta_1, name='beta_1') - self.beta_2 = K.variable(beta_2, name='beta_2') - self.decay = K.variable(decay, name='decay') - if epsilon is None: - epsilon = K.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - self.amsgrad = amsgrad - - def _create_all_weights(self, params): - ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] - vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] - if self.amsgrad: - vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] - else: - vhats = [K.zeros(1) for _ in params] - self.weights = [self.iterations] + ms + vs + vhats - return ms, vs, vhats - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( # pylint: disable=g-no-augmented-assignment - 1. / - (1. + - self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) - - with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]): - t = math_ops.cast(self.iterations, K.floatx()) - lr_t = lr * ( - K.sqrt(1. - math_ops.pow(self.beta_2, t)) / - (1. - math_ops.pow(self.beta_1, t))) - - ms, vs, vhats = self._create_all_weights(params) - for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): - m_t = (self.beta_1 * m) + (1. - self.beta_1) * g - v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g) - if self.amsgrad: - vhat_t = math_ops.maximum(vhat, v_t) - p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon) - self.updates.append(state_ops.assign(vhat, vhat_t)) - else: - p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - - self.updates.append(state_ops.assign(m, m_t)) - self.updates.append(state_ops.assign(v, v_t)) - new_p = p_t - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'beta_1': float(K.get_value(self.beta_1)), - 'beta_2': float(K.get_value(self.beta_2)), - 'decay': float(K.get_value(self.decay)), - 'epsilon': self.epsilon, - 'amsgrad': self.amsgrad - } - base_config = super(Adam, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Adamax(Optimizer): - """Adamax optimizer from Adam paper's Section 7. - - It is a variant of Adam based on the infinity norm. - Default parameters follow those provided in the paper. - - Arguments: - lr: float >= 0. Learning rate. - beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. - epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. - decay: float >= 0. Learning rate decay over each update. - """ - - def __init__(self, - lr=0.002, - beta_1=0.9, - beta_2=0.999, - epsilon=None, - decay=0., - **kwargs): - super(Adamax, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') - self.lr = K.variable(lr, name='lr') - self.beta_1 = K.variable(beta_1, name='beta_1') - self.beta_2 = K.variable(beta_2, name='beta_2') - self.decay = K.variable(decay, name='decay') - if epsilon is None: - epsilon = K.epsilon() - self.epsilon = epsilon - self.initial_decay = decay - - def _create_all_weights(self, params): - - shapes = [K.int_shape(p) for p in params] - # zero init of 1st moment - ms = [K.zeros(shape) for shape in shapes] - # zero init of exponentially weighted infinity norm - us = [K.zeros(shape) for shape in shapes] - self.weights = [self.iterations] + ms + us - return ms, us - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [] - - lr = self.lr - if self.initial_decay > 0: - lr = lr * ( # pylint: disable=g-no-augmented-assignment - 1. / - (1. + - self.decay * math_ops.cast(self.iterations, K.dtype(self.decay)))) - - with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]): - t = math_ops.cast(self.iterations, K.floatx()) - lr_t = lr / (1. - math_ops.pow(self.beta_1, t)) - - ms, us = self._create_all_weights(params) - - for p, g, m, u in zip(params, grads, ms, us): - - m_t = (self.beta_1 * m) + (1. - self.beta_1) * g - u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g)) - p_t = p - lr_t * m_t / (u_t + self.epsilon) - - self.updates.append(state_ops.assign(m, m_t)) - self.updates.append(state_ops.assign(u, u_t)) - new_p = p_t - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'beta_1': float(K.get_value(self.beta_1)), - 'beta_2': float(K.get_value(self.beta_2)), - 'decay': float(K.get_value(self.decay)), - 'epsilon': self.epsilon - } - base_config = super(Adamax, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Nadam(Optimizer): - """Nesterov Adam optimizer. - - Much like Adam is essentially RMSprop with momentum, - Nadam is Adam RMSprop with Nesterov momentum. - - Default parameters follow those provided in the paper. - It is recommended to leave the parameters of this optimizer - at their default values. - - Arguments: - lr: float >= 0. Learning rate. - beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. - epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. - """ - - def __init__(self, - lr=0.002, - beta_1=0.9, - beta_2=0.999, - epsilon=None, - schedule_decay=0.004, - **kwargs): - super(Nadam, self).__init__(**kwargs) - with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') - self.m_schedule = K.variable(1., name='m_schedule') - self.lr = K.variable(lr, name='lr') - self.beta_1 = K.variable(beta_1, name='beta_1') - self.beta_2 = K.variable(beta_2, name='beta_2') - if epsilon is None: - epsilon = K.epsilon() - self.epsilon = epsilon - self.schedule_decay = schedule_decay - - def _create_all_weights(self, params): - shapes = [K.int_shape(p) for p in params] - ms = [K.zeros(shape) for shape in shapes] - vs = [K.zeros(shape) for shape in shapes] - - self.weights = [self.iterations, self.m_schedule] + ms + vs - return ms, vs - - def get_updates(self, loss, params): - grads = self.get_gradients(loss, params) - self.updates = [] - - with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]): - t = math_ops.cast(self.iterations, K.floatx()) - - # Due to the recommendations in [2], i.e. warming momentum schedule - momentum_cache_t = self.beta_1 * ( - 1. - 0.5 * - (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay))) - momentum_cache_t_1 = self.beta_1 * ( - 1. - 0.5 * - (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay))) - m_schedule_new = self.m_schedule * momentum_cache_t - m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1 - self.updates.append((self.m_schedule, m_schedule_new)) - - ms, vs = self._create_all_weights(params) - - for p, g, m, v in zip(params, grads, ms, vs): - # the following equations given in [1] - g_prime = g / (1. - m_schedule_new) - m_t = self.beta_1 * m + (1. - self.beta_1) * g - m_t_prime = m_t / (1. - m_schedule_next) - v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g) - v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t)) - m_t_bar = (1. - - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime - - self.updates.append(state_ops.assign(m, m_t)) - self.updates.append(state_ops.assign(v, v_t)) - - p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon) - new_p = p_t - - # Apply constraints. - if getattr(p, 'constraint', None) is not None: - new_p = p.constraint(new_p) - - self.updates.append(state_ops.assign(p, new_p)) - return self.updates - - def get_config(self): - config = { - 'lr': float(K.get_value(self.lr)), - 'beta_1': float(K.get_value(self.beta_1)), - 'beta_2': float(K.get_value(self.beta_2)), - 'epsilon': self.epsilon, - 'schedule_decay': self.schedule_decay - } - base_config = super(Nadam, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class TFOptimizer(Optimizer, trackable.Trackable): - """Wrapper class for native TensorFlow optimizers.""" - - def __init__(self, optimizer, iterations=None): # pylint: disable=super-init-not-called - self.optimizer = optimizer - self._track_trackable(optimizer, name='optimizer') - if iterations is None: - with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') - else: - self.iterations = iterations - self._track_trackable(self.iterations, name='global_step') - - def _clip_gradients(self, grads): - """Clip gradients according to the clipnorm and clipvalue attributes.""" - # TFOptimizer wrapper has no gradient clipping options. - return grads - - def minimize(self, loss, var_list, grad_loss=None, tape=None): - """Mimics the `OptimizerV2.minimize` API.""" - if not callable(loss) and tape is None: - raise ValueError('`tape` is required when a `Tensor` loss is passed.') - tape = tape if tape is not None else backprop.GradientTape() - - if callable(loss): - with tape: - if not callable(var_list): - tape.watch(var_list) - loss = loss() - if callable(var_list): - var_list = var_list() - - var_list = nest.flatten(var_list) - if var_list: - grads = tape.gradient(loss, var_list, grad_loss) - grads_and_vars = list(zip(grads, var_list)) - self.apply_gradients(grads_and_vars) - - def apply_gradients(self, grads_and_vars): - self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations) - - def get_grads(self, loss, params): - return self.optimizer.compute_gradients(loss, params) - - def get_updates(self, loss, params): - if distribution_strategy_context.has_strategy(): - self.updates = [] - - if not params: - # After the model vars have been created, the second call to get_updates - # is called with params as an empty list. This ensures that we call - # compute_gradients with params=None. - grads = self.optimizer.compute_gradients(loss) - else: - grads = self.optimizer.compute_gradients(loss, params) - global_step = training_util.get_global_step() - opt_update = self.optimizer.apply_gradients(grads, global_step) - else: - if not params: - self.updates = [state_ops.assign_add(self.iterations, 1)] - return self.updates - - # Updates list starts out empty because the iterations variable is - # incremented in optimizer.apply_gradients() - self.updates = [] - grads = self.optimizer.compute_gradients(loss, params) - opt_update = self.optimizer.apply_gradients( - grads, global_step=self.iterations) - - self.updates.append(opt_update) - return self.updates - - @property - def weights(self): - raise NotImplementedError - - def get_config(self): - raise NotImplementedError - - def from_config(self, config): - raise NotImplementedError - - -# Aliases. - -sgd = SGD -rmsprop = RMSprop -adagrad = Adagrad -adadelta = Adadelta -adam = Adam -adamax = Adamax -nadam = Nadam - - @keras_export('keras.optimizers.serialize') def serialize(optimizer): return serialize_keras_object(optimizer) diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py index db051eafea0..8c6658a10b9 100644 --- a/tensorflow/python/keras/optimizers_test.py +++ b/tensorflow/python/keras/optimizers_test.py @@ -27,6 +27,7 @@ from tensorflow.python import keras from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import testing_utils from tensorflow.python.keras.utils import np_utils from tensorflow.python.platform import test @@ -109,63 +110,63 @@ class KerasOptimizersTest(keras_parameterized.TestCase): def test_sgd(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.SGD()) + self._test_optimizer(optimizer_v1.SGD()) def test_momentum(self): with self.cached_session(): self._test_optimizer( - keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)) + optimizer_v1.SGD(lr=0.01, momentum=0.9, nesterov=True)) def test_rmsprop(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.RMSprop()) - self._test_optimizer(keras.optimizers.RMSprop(decay=1e-3)) + self._test_optimizer(optimizer_v1.RMSprop()) + self._test_optimizer(optimizer_v1.RMSprop(decay=1e-3)) def test_adagrad(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.Adagrad()) - self._test_optimizer(keras.optimizers.Adagrad(decay=1e-3)) + self._test_optimizer(optimizer_v1.Adagrad()) + self._test_optimizer(optimizer_v1.Adagrad(decay=1e-3)) def test_adadelta(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.Adadelta(), target=0.6) + self._test_optimizer(optimizer_v1.Adadelta(), target=0.6) # Accuracy seems dependent on the initialization. Even adding # tf.compat.v1.Print nodes in the graph seemed to affect the # initialization seed, and hence the accuracy. - self._test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4) + self._test_optimizer(optimizer_v1.Adadelta(decay=1e-3), target=0.4) def test_adam(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.Adam()) + self._test_optimizer(optimizer_v1.Adam()) # Accuracy seems dependent on the seed initialization. # TODO(b/121051441): fix test flakiness. - self._test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73) - self._test_optimizer(keras.optimizers.Adam(amsgrad=True)) + self._test_optimizer(optimizer_v1.Adam(decay=1e-3), target=0.73) + self._test_optimizer(optimizer_v1.Adam(amsgrad=True)) def test_adamax(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.Adamax()) - self._test_optimizer(keras.optimizers.Adamax(decay=1e-3)) + self._test_optimizer(optimizer_v1.Adamax()) + self._test_optimizer(optimizer_v1.Adamax(decay=1e-3)) def test_nadam(self): with self.cached_session(): - self._test_optimizer(keras.optimizers.Nadam()) + self._test_optimizer(optimizer_v1.Nadam()) def test_clipnorm(self): with self.cached_session(): self._test_optimizer( - keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5)) + optimizer_v1.SGD(lr=0.01, momentum=0.9, clipnorm=0.5)) def test_clipvalue(self): with self.cached_session(): self._test_optimizer( - keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5)) + optimizer_v1.SGD(lr=0.01, momentum=0.9, clipvalue=0.5)) def test_tf_optimizer(self): if context.executing_eagerly(): self.skipTest( 'v1 optimizer does not run in eager mode') - optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01)) + optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) model = keras.models.Sequential() model.add(keras.layers.Dense( 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1))) @@ -194,7 +195,7 @@ class KerasOptimizersTest(keras_parameterized.TestCase): 'v1 optimizer does not run in eager mode') graph = ops.Graph() with graph.as_default(): - optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01)) + optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) keras.backend.track_tf_optimizer(optimizer) optimizer_weak = weakref.ref(optimizer) graph_weak = weakref.ref(graph) @@ -209,7 +210,7 @@ class KerasOptimizersTest(keras_parameterized.TestCase): self.skipTest( 'v1 optimizer does not run in eager mode') with self.cached_session(): - optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01)) + optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01)) model = keras.models.Sequential() model.add(keras.layers.Dense( 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1))) @@ -229,9 +230,9 @@ class KerasOptimizersTest(keras_parameterized.TestCase): def test_negative_clipvalue_or_clipnorm(self): with self.assertRaises(ValueError): - _ = keras.optimizers.SGD(lr=0.01, clipvalue=-0.5) + _ = optimizer_v1.SGD(lr=0.01, clipvalue=-0.5) with self.assertRaises(ValueError): - _ = keras.optimizers.Adam(clipnorm=-2.0) + _ = optimizer_v1.Adam(clipnorm=-2.0) def test_mixed_precision_loss_scale_optimizer(self): if context.executing_eagerly(): diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py index d8b888c8996..d3bb10c98dd 100644 --- a/tensorflow/python/keras/saving/hdf5_format.py +++ b/tensorflow/python/keras/saving/hdf5_format.py @@ -26,7 +26,7 @@ import numpy as np from six.moves import zip # pylint: disable=redefined-builtin from tensorflow.python.keras import backend as K -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.saving import model_config as model_config_lib from tensorflow.python.keras.saving import saving_utils from tensorflow.python.keras.saving.saved_model import json_utils @@ -127,7 +127,7 @@ def save_model_to_hdf5(model, filepath, overwrite=True, include_optimizer=True): # TODO(b/128683857): Add integration tests between tf.keras and external # Keras, to avoid breaking TF.js users. if (include_optimizer and model.optimizer and - not isinstance(model.optimizer, optimizers.TFOptimizer)): + not isinstance(model.optimizer, optimizer_v1.TFOptimizer)): save_optimizer_weights_to_hdf5_group(f, model.optimizer) f.flush() diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py index 1817bfc9263..e91b9b323ce 100644 --- a/tensorflow/python/keras/saving/hdf5_format_test.py +++ b/tensorflow/python/keras/saving/hdf5_format_test.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import optimizers from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import training @@ -341,7 +342,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase): name='d1')) ref_model.add(keras.layers.Dense(num_classes, name='d2')) ref_model.compile(loss=keras.losses.MSE, - optimizer=keras.optimizers.RMSprop(lr=0.0001), + optimizer=optimizer_v1.RMSprop(lr=0.0001), metrics=[keras.metrics.categorical_accuracy]) f_ref_model = h5py.File(h5_path, 'w') @@ -354,7 +355,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase): name='d1')) model.add(keras.layers.Dense(num_classes, name='d2')) model.compile(loss=keras.losses.MSE, - optimizer=keras.optimizers.RMSprop(lr=0.0001), + optimizer=optimizer_v1.RMSprop(lr=0.0001), metrics=[keras.metrics.categorical_accuracy]) with self.assertRaisesRegex( ValueError, r'Layer #0 \(named "d1"\), weight ' @@ -515,7 +516,7 @@ class TestWholeModelSaving(keras_parameterized.TestCase): with ops.Graph().as_default(), self.cached_session(): # test with custom optimizer, loss - class CustomOp(keras.optimizers.RMSprop): + class CustomOp(optimizer_v1.RMSprop): pass def custom_loss(y_true, y_pred): @@ -692,7 +693,7 @@ class TestWholeModelSaving(keras_parameterized.TestCase): model = keras.Model(inputs, outputs) model.compile( loss=keras.losses.MSE, - optimizer=keras.optimizers.Adam(), + optimizer=optimizer_v1.Adam(), metrics=[ keras.metrics.categorical_accuracy, keras.metrics.CategoricalAccuracy() @@ -1028,7 +1029,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): model = keras.models.Sequential() model.add(keras.layers.Dense(2, input_shape=(3,))) model.add(keras.layers.Dense(3)) - model.compile(loss='mse', optimizer=optimizers.Adam(), metrics=['acc']) + model.compile(loss='mse', optimizer=optimizer_v1.Adam(), metrics=['acc']) if not ops.executing_eagerly_outside_functions(): model._make_train_function() temp_dir = self.get_temp_dir() diff --git a/tensorflow/python/keras/saving/saved_model_experimental.py b/tensorflow/python/keras/saving/saved_model_experimental.py index dddab2a084e..cbb75d1ebab 100644 --- a/tensorflow/python/keras/saving/saved_model_experimental.py +++ b/tensorflow/python/keras/saving/saved_model_experimental.py @@ -25,7 +25,7 @@ import six from tensorflow.python.client import session from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K -from tensorflow.python.keras import optimizers +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.optimizer_v2 import optimizer_v2 from tensorflow.python.keras.saving import model_config from tensorflow.python.keras.saving import saving_utils @@ -206,7 +206,7 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature): has_saved_vars = False if model.optimizer: - if isinstance(model.optimizer, (optimizers.TFOptimizer, + if isinstance(model.optimizer, (optimizer_v1.TFOptimizer, optimizer_v2.OptimizerV2)): _export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args) has_saved_vars = True diff --git a/tensorflow/python/keras/saving/saved_model_experimental_test.py b/tensorflow/python/keras/saving/saved_model_experimental_test.py index f4b91298d10..45130922250 100644 --- a/tensorflow/python/keras/saving/saved_model_experimental_test.py +++ b/tensorflow/python/keras/saving/saved_model_experimental_test.py @@ -31,6 +31,7 @@ from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.engine import training as model_lib from tensorflow.python.keras.optimizer_v2 import adadelta from tensorflow.python.keras.optimizer_v2 import rmsprop @@ -458,7 +459,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): x = keras.layers.Dense(2)(inputs) x = keras.layers.Dense(3)(x) clone = keras.models.Model(inputs, x) - clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001)) + clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001)) clone.train_on_batch(input_arr, target_arr) keras_saved_model._assert_same_non_optimizer_objects( @@ -487,7 +488,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): x = keras.layers.Dense(4)(x) x = keras.layers.Dense(3)(x) clone = keras.models.Model(inputs, x) - clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001)) + clone.compile(loss='mse', optimizer=optimizer_v1.RMSprop(lr=0.0001)) clone.train_on_batch(input_arr, target_arr) def testSaveSequentialModelWithoutInputShapes(self): diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py index 0c3d044e80d..b240c4262af 100644 --- a/tensorflow/python/keras/saving/saving_utils.py +++ b/tensorflow/python/keras/saving/saving_utils.py @@ -24,6 +24,7 @@ import six from tensorflow.python.eager import def_function from tensorflow.python.keras import backend as K from tensorflow.python.keras import losses +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import optimizers from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.utils import generic_utils @@ -161,7 +162,7 @@ def model_metadata(model, include_optimizer=True, require_config=True): backend=K.backend(), model_config=model_config) if model.optimizer and include_optimizer: - if isinstance(model.optimizer, optimizers.TFOptimizer): + if isinstance(model.optimizer, optimizer_v1.TFOptimizer): logging.warning( 'TensorFlow optimizers do not ' 'make it possible to access ' diff --git a/tensorflow/python/keras/tests/model_architectures_test.py b/tensorflow/python/keras/tests/model_architectures_test.py index fe7b7e476b0..f1f2cc4fe33 100644 --- a/tensorflow/python/keras/tests/model_architectures_test.py +++ b/tensorflow/python/keras/tests/model_architectures_test.py @@ -26,6 +26,7 @@ import numpy as np from tensorflow.python import keras from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras import testing_utils from tensorflow.python.keras.tests import model_architectures from tensorflow.python.platform import test @@ -62,7 +63,7 @@ class TestModelArchitectures(keras_parameterized.TestCase): def get_custom_objects(self): """Define custom_objects.""" - class CustomOpt(keras.optimizers.SGD): + class CustomOpt(optimizer_v1.SGD): pass def custom_loss(y_true, y_pred): diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py index 0765afb4db7..322028bbf7d 100644 --- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py +++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py @@ -24,6 +24,7 @@ from tensorflow.python import keras from tensorflow.python.eager import context from tensorflow.python.framework import config from tensorflow.python.framework import ops +from tensorflow.python.keras import optimizer_v1 from tensorflow.python.keras.utils import multi_gpu_utils from tensorflow.python.keras.utils import np_utils from tensorflow.python.platform import test @@ -191,7 +192,7 @@ class TestMultiGPUModel(test.TestCase): parallel_model.compile( loss='categorical_crossentropy', - optimizer=keras.optimizers.RMSprop(lr=0.0001, decay=1e-6), + optimizer=optimizer_v1.RMSprop(lr=0.0001, decay=1e-6), metrics=['accuracy'], target_tensors=[targets]) parallel_model.fit(epochs=1, steps_per_epoch=3)