# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow interface for third-party optimizers."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import numpy as np
|
|
|
|
from tensorflow.python.framework import ops
|
|
from tensorflow.python.ops import array_ops
|
|
from tensorflow.python.ops import gradients
|
|
from tensorflow.python.ops import variables
|
|
from tensorflow.python.platform import tf_logging as logging
|
|
|
|
__all__ = ['ExternalOptimizerInterface', 'ScipyOptimizerInterface']
|
|
|
|
|
|
class ExternalOptimizerInterface(object):
  """Base class for interfaces with external optimization algorithms.

  Subclass this and implement `_minimize` in order to wrap a new optimization
  algorithm.

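  For example, a bare-bones subclass might wrap unconstrained fixed-step
  gradient descent. This is a minimal sketch for illustration only, not part
  of this module; it ignores the constraint and bound arguments:

  ```python
  class GradientDescentOptimizerInterface(ExternalOptimizerInterface):

    def _minimize(self, initial_val, loss_grad_func, step_callback,
                  optimizer_kwargs, **unused_kwargs):
      # `loss_grad_func` maps a packed NumPy vector to (loss, gradient).
      x = initial_val
      for _ in range(optimizer_kwargs.get('max_steps', 100)):
        _, gradient = loss_grad_func(x)
        x = x - optimizer_kwargs.get('learning_rate', 0.1) * gradient
        step_callback(x)
      return x
  ```
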
  `ExternalOptimizerInterface` should not be instantiated directly; instead,
  use e.g. `ScipyOptimizerInterface`.

  @@__init__

  @@minimize
  """

  def __init__(self,
               loss,
               var_list=None,
               equalities=None,
               inequalities=None,
               var_to_bounds=None,
               **optimizer_kwargs):
    """Initialize a new interface instance.

    Args:
      loss: A scalar `Tensor` to be minimized.
      var_list: Optional `list` of `Variable` objects to update to minimize
        `loss`. Defaults to the list of variables collected in the graph
        under the key `GraphKeys.TRAINABLE_VARIABLES`.
      equalities: Optional `list` of equality constraint scalar `Tensor`s to
        be held equal to zero.
      inequalities: Optional `list` of inequality constraint scalar `Tensor`s
        to be held nonnegative.
      var_to_bounds: Optional `dict` where each key is an optimization
        `Variable` and each corresponding value is a length-2 tuple of
        `(low, high)` bounds. Although enforcing this kind of simple
        constraint could be accomplished with the `inequalities` arg, not all
        optimization algorithms support general inequality constraints, e.g.
        L-BFGS-B. Both `low` and `high` can either be numbers or anything
        convertible to a NumPy array that can be broadcast to the shape of
        `var` (using `np.broadcast_to`). To indicate that there is no bound,
        use `None` (or `+/- np.infty`). For example, if `var` is a 2x3
        matrix, then any of the following corresponding `bounds` could be
        supplied:
        * `(0, np.infty)`: Each element of `var` held positive.
        * `(-np.infty, [1, 2, 3])`: First column less than 1, second column
          less than 2, third column less than 3.
        * `(-np.infty, [[1], [2]])`: First row less than 1, second row less
          than 2.
        * `(-np.infty, [[1, 2, 3], [4, 5, 6]])`: Entry `var[0, 0]` less than
          1, `var[0, 1]` less than 2, etc.
      **optimizer_kwargs: Other subclass-specific keyword arguments.
    """
    self._loss = loss
    self._equalities = equalities or []
    self._inequalities = inequalities or []

    if var_list is None:
      self._vars = variables.trainable_variables()
    else:
      self._vars = list(var_list)

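    # Broadcast each variable's (low, high) bounds to the variable's shape,
    # then flatten them into one (low, high) pair per scalar entry of the
    # packed variable vector, in packing order.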
    packed_bounds = None
    if var_to_bounds is not None:
      left_packed_bounds = []
      right_packed_bounds = []
      for var in self._vars:
        shape = var.get_shape().as_list()
        bounds = (-np.infty, np.infty)
        if var in var_to_bounds:
          bounds = var_to_bounds[var]
        left_packed_bounds.extend(list(np.broadcast_to(bounds[0], shape).flat))
        right_packed_bounds.extend(list(np.broadcast_to(bounds[1], shape).flat))
      packed_bounds = list(zip(left_packed_bounds, right_packed_bounds))
    self._packed_bounds = packed_bounds

    self._update_placeholders = [
        array_ops.placeholder(var.dtype) for var in self._vars
    ]
    self._var_updates = [
        var.assign(array_ops.reshape(placeholder, _get_shape_tuple(var)))
        for var, placeholder in zip(self._vars, self._update_placeholders)
    ]

    loss_grads = _compute_gradients(loss, self._vars)
    equalities_grads = [
        _compute_gradients(equality, self._vars)
        for equality in self._equalities
    ]
    inequalities_grads = [
        _compute_gradients(inequality, self._vars)
        for inequality in self._inequalities
    ]

    self.optimizer_kwargs = optimizer_kwargs

    self._packed_var = self._pack(self._vars)
    self._packed_loss_grad = self._pack(loss_grads)
    self._packed_equality_grads = [
        self._pack(equality_grads) for equality_grads in equalities_grads
    ]
    self._packed_inequality_grads = [
        self._pack(inequality_grads) for inequality_grads in inequalities_grads
    ]

    dims = [_prod(_get_shape_tuple(var)) for var in self._vars]
    accumulated_dims = list(_accumulate(dims))
    self._packing_slices = [
        slice(start, end)
        for start, end in zip(accumulated_dims[:-1], accumulated_dims[1:])
    ]

  def minimize(self,
               session=None,
               feed_dict=None,
               fetches=None,
               step_callback=None,
               loss_callback=None,
               **run_kwargs):
    """Minimize a scalar `Tensor`.

    Variables subject to optimization are updated in-place at the end of
    optimization.

    Note that this method does *not* just return a minimization `Op`, unlike
    `Optimizer.minimize()`; instead it actually performs minimization by
    executing commands to control a `Session`.

    Args:
      session: A `Session` instance.
      feed_dict: A feed dict to be passed to calls to `session.run`.
      fetches: A list of `Tensor`s to fetch and supply to `loss_callback`
        as positional arguments.
      step_callback: A function to be called at each optimization step;
        arguments are the current values of all optimization variables
        flattened into a single vector.
      loss_callback: A function to be called every time the loss and gradients
        are computed, with evaluated fetches supplied as positional arguments.
      **run_kwargs: kwargs to pass to `session.run`.
    """
    session = session or ops.get_default_session()
    feed_dict = feed_dict or {}
    fetches = fetches or []

    loss_callback = loss_callback or (lambda *fetches: None)
    step_callback = step_callback or (lambda xk: None)

    # Construct loss function and associated gradient.
    loss_grad_func = self._make_eval_func([self._loss,
                                           self._packed_loss_grad], session,
                                          feed_dict, fetches, loss_callback)

    # Construct equality constraint functions and associated gradients.
    equality_funcs = self._make_eval_funcs(self._equalities, session,
                                           feed_dict, fetches)
    equality_grad_funcs = self._make_eval_funcs(self._packed_equality_grads,
                                                session, feed_dict, fetches)

    # Construct inequality constraint functions and associated gradients.
    inequality_funcs = self._make_eval_funcs(self._inequalities, session,
                                             feed_dict, fetches)
    inequality_grad_funcs = self._make_eval_funcs(self._packed_inequality_grads,
                                                  session, feed_dict, fetches)

    # Get initial value from TF session.
    initial_packed_var_val = session.run(self._packed_var)

    # Perform minimization.
    packed_var_val = self._minimize(
        initial_val=initial_packed_var_val,
        loss_grad_func=loss_grad_func,
        equality_funcs=equality_funcs,
        equality_grad_funcs=equality_grad_funcs,
        inequality_funcs=inequality_funcs,
        inequality_grad_funcs=inequality_grad_funcs,
        packed_bounds=self._packed_bounds,
        step_callback=step_callback,
        optimizer_kwargs=self.optimizer_kwargs)
    var_vals = [
        packed_var_val[packing_slice] for packing_slice in self._packing_slices
    ]

    # Set optimization variables to their new values.
    session.run(
        self._var_updates,
        feed_dict=dict(zip(self._update_placeholders, var_vals)),
        **run_kwargs)

  def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
                packed_bounds, step_callback, optimizer_kwargs):
    """Wrapper for a particular optimization algorithm implementation.

    It would be appropriate for a subclass implementation of this method to
    raise `NotImplementedError` if unsupported arguments are passed: e.g. if
    an algorithm does not support constraints but `len(equality_funcs) > 0`.

    Args:
      initial_val: A NumPy vector of initial values.
      loss_grad_func: A function accepting a NumPy packed variable vector and
        returning two outputs, a loss value and the gradient of that loss
        with respect to the packed variable vector.
      equality_funcs: A list of functions each of which specifies a scalar
        quantity that an optimizer should hold exactly zero.
      equality_grad_funcs: A list of gradients of equality_funcs.
      inequality_funcs: A list of functions each of which specifies a scalar
        quantity that an optimizer should hold >= 0.
      inequality_grad_funcs: A list of gradients of inequality_funcs.
      packed_bounds: A list of bounds for each index, or `None`.
      step_callback: A callback function to execute at each optimization
        step, supplied with the current value of the packed variable vector.
      optimizer_kwargs: Other key-value arguments available to the optimizer.

    Returns:
      The optimal variable vector as a NumPy vector.
    """
    raise NotImplementedError(
        'To use ExternalOptimizerInterface, subclass from it and implement '
        'the _minimize() method.')

  @classmethod
  def _pack(cls, tensors):
    """Pack a list of `Tensor`s into a single, flattened, rank-1 `Tensor`."""
    if not tensors:
      return None
    elif len(tensors) == 1:
      return array_ops.reshape(tensors[0], [-1])
    else:
      flattened = [array_ops.reshape(tensor, [-1]) for tensor in tensors]
      return array_ops.concat(flattened, 0)

  def _make_eval_func(self, tensors, session, feed_dict, fetches,
                      callback=None):
    """Construct a function that evaluates a `Tensor` or list of `Tensor`s."""
    if not isinstance(tensors, list):
      tensors = [tensors]
    num_tensors = len(tensors)

    def eval_func(x):
      """Function to evaluate a `Tensor`."""
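      # Feed each variable its slice of the packed vector `x`, overriding the
      # variable's stored value for this evaluation only.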
      augmented_feed_dict = {
          var: x[packing_slice].reshape(_get_shape_tuple(var))
          for var, packing_slice in zip(self._vars, self._packing_slices)
      }
      augmented_feed_dict.update(feed_dict)
      augmented_fetches = tensors + fetches

      augmented_fetch_vals = session.run(
          augmented_fetches, feed_dict=augmented_feed_dict)

      if callable(callback):
        callback(*augmented_fetch_vals[num_tensors:])

      return augmented_fetch_vals[:num_tensors]

    return eval_func

  def _make_eval_funcs(self,
                       tensors,
                       session,
                       feed_dict,
                       fetches,
                       callback=None):
    return [
        self._make_eval_func(tensor, session, feed_dict, fetches, callback)
        for tensor in tensors
    ]


class ScipyOptimizerInterface(ExternalOptimizerInterface):
  """Wrapper allowing `scipy.optimize.minimize` to operate a `tf.Session`.

  Example:

  ```python
  vector = tf.Variable([7., 7.], name='vector')

  # Make vector norm as small as possible.
  loss = tf.reduce_sum(tf.square(vector))

  optimizer = ScipyOptimizerInterface(loss, options={'maxiter': 100})

  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    optimizer.minimize(session)

  # The value of vector should now be [0., 0.].
  ```

  Example with simple bound constraints:

  ```python
  vector = tf.Variable([7., 7.], name='vector')

  # Make vector norm as small as possible.
  loss = tf.reduce_sum(tf.square(vector))

  optimizer = ScipyOptimizerInterface(
      loss, var_to_bounds={vector: ([1, 2], np.infty)})

  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    optimizer.minimize(session)

  # The value of vector should now be [1., 2.].
  ```

  Example with more complicated constraints:

  ```python
  vector = tf.Variable([7., 7.], name='vector')

  # Make vector norm as small as possible.
  loss = tf.reduce_sum(tf.square(vector))
  # Ensure the vector's y component equals 1.
  equalities = [vector[1] - 1.]
  # Ensure the vector's x component is >= 1.
  inequalities = [vector[0] - 1.]

  # Our default SciPy optimization algorithm, L-BFGS-B, does not support
  # general constraints. Thus we use SLSQP instead.
  optimizer = ScipyOptimizerInterface(
      loss, equalities=equalities, inequalities=inequalities, method='SLSQP')

  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    optimizer.minimize(session)

  # The value of vector should now be [1., 1.].
  ```
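
  Example with a per-evaluation callback (a usage sketch built from the
  `fetches` and `loss_callback` arguments documented on `minimize`):

  ```python
  vector = tf.Variable([7., 7.], name='vector')
  loss = tf.reduce_sum(tf.square(vector))

  optimizer = ScipyOptimizerInterface(loss)

  def loss_callback(loss_val):
    # Called each time the loss and gradients are computed, with the
    # evaluated fetches as positional arguments.
    print('Loss:', loss_val)

  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    optimizer.minimize(session, fetches=[loss], loss_callback=loss_callback)
  ```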
"""
|
|
|
|
_DEFAULT_METHOD = 'L-BFGS-B'
|
|
|
|
  def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
                packed_bounds, step_callback, optimizer_kwargs):

    def loss_grad_func_wrapper(x):
      # SciPy's L-BFGS-B Fortran implementation requires gradients as doubles.
      loss, gradient = loss_grad_func(x)
      return loss, gradient.astype('float64')

    optimizer_kwargs = dict(optimizer_kwargs.items())
    method = optimizer_kwargs.pop('method', self._DEFAULT_METHOD)

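    # Translate the constraint functions and their gradients into the list of
    # dicts that `scipy.optimize.minimize` expects for its `constraints` arg.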
    constraints = []
    for func, grad_func in zip(equality_funcs, equality_grad_funcs):
      constraints.append({'type': 'eq', 'fun': func, 'jac': grad_func})
    for func, grad_func in zip(inequality_funcs, inequality_grad_funcs):
      constraints.append({'type': 'ineq', 'fun': func, 'jac': grad_func})

    minimize_args = [loss_grad_func_wrapper, initial_val]
    minimize_kwargs = {
        'jac': True,
        'callback': step_callback,
        'method': method,
        'constraints': constraints,
        'bounds': packed_bounds,
    }

    for kwarg in minimize_kwargs:
      if kwarg in optimizer_kwargs:
        if kwarg == 'bounds':
          # Special handling for the 'bounds' kwarg since the ability to
          # specify bounds was added after this module was already publicly
          # released.
          raise ValueError(
              'Bounds must be set using the var_to_bounds argument')
        raise ValueError(
            'Optimizer keyword arg \'{}\' is set '
            'automatically and cannot be injected manually'.format(kwarg))

    minimize_kwargs.update(optimizer_kwargs)

    import scipy.optimize  # pylint: disable=g-import-not-at-top
    result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs)

    message_lines = [
        'Optimization terminated with:',
        '  Message: %s',
        '  Objective function value: %f',
    ]
    message_args = [result.message, result.fun]
    if hasattr(result, 'nit'):
      # Some optimization methods might not provide information such as nit
      # and nfev in the return; log only what is available.
      message_lines.append('  Number of iterations: %d')
      message_args.append(result.nit)
    if hasattr(result, 'nfev'):
      message_lines.append('  Number of function evaluations: %d')
      message_args.append(result.nfev)
    logging.info('\n'.join(message_lines), *message_args)

    return result['x']


def _accumulate(list_):
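  """Yields cumulative sums of `list_`, starting from 0.

  For example, `list(_accumulate([2, 3, 4]))` is `[0, 2, 5, 9]`.
  """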
  total = 0
  yield total
  for x in list_:
    total += x
    yield total


def _get_shape_tuple(tensor):
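  """Returns the static shape of `tensor` as a tuple of ints (or `None`s)."""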
  return tuple(dim.value for dim in tensor.get_shape())


def _prod(array):
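  """Returns the product of the elements of `array` (1 if it is empty)."""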
  prod = 1
  for value in array:
    prod *= value
  return prod


def _compute_gradients(tensor, var_list):
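  """Computes gradients of `tensor` w.r.t. each variable in `var_list`."""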
  grads = gradients.gradients(tensor, var_list)
  # tf.gradients sometimes returns `None` when it should return 0.
  return [
      grad if grad is not None else array_ops.zeros_like(var)
      for var, grad in zip(var_list, grads)
  ]