Reduce Python overheads for lazily constructed Layer properties.

PiperOrigin-RevId: 248435402
Taylor Robie 2019-05-15 17:08:20 -07:00 committed by TensorFlower Gardener
parent 83668b0826
commit 42ac719705
4 changed files with 245 additions and 9 deletions

tensorflow/python/keras/engine/base_layer.py

@@ -2046,12 +2046,12 @@ class Layer(module.Module):
     return True

   @property
+  @tracking.cached_per_instance
   def _call_fn_args(self):
-    if getattr(self, '__call_fn_args', None) is None:
-      self.__call_fn_args = function_utils.fn_args(self.call)
-    return self.__call_fn_args
+    return function_utils.fn_args(self.call)

   @property
+  @tracking.cached_per_instance
   def _should_compute_mask(self):
     return ('mask' in self._call_fn_args or
             getattr(self, 'compute_mask', None) is not None)
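For context on why the memoize-on-self pattern above was dropped: on trackable objects every attribute write goes through a custom `__setattr__`, so even the one-time caching write is expensive, while `tracking.cached_per_instance` (added to tracking.py later in this commit) keeps results in a side table instead. A rough, self-contained sketch of the comparison, where `Tracked` and `expensive` are hypothetical stand-ins and timings will vary by machine:

```
import timeit
import weakref


def cached_per_instance(f):
  # Same scheme as the decorator this commit adds to tracking.py.
  cache = weakref.WeakKeyDictionary()
  def wrapped(item):
    output = cache.get(item)
    if output is None:
      cache[item] = output = f(item)
    return output
  return wrapped


def expensive(obj):
  # Hypothetical stand-in for a costly derived property such as
  # function_utils.fn_args(self.call).
  return sum(range(100))


class Tracked(object):
  """Hypothetical stand-in for an object with a costly __setattr__."""

  def __setattr__(self, key, value):
    # Trackable objects do bookkeeping on every attribute write.
    super(Tracked, self).__setattr__(key, value)

  @property
  def lazy_attr(self):  # The old memoize-on-self pattern.
    if getattr(self, '_lazy', None) is None:
      self._lazy = expensive(self)
    return self._lazy

  @property
  @cached_per_instance
  def cached_attr(self):  # The new side-table pattern.
    return expensive(self)


obj = Tracked()
# On this toy class the two are close; on trackable objects, whose
# getattr/setattr hooks are far heavier, the side-table cache wins.
print(timeit.timeit(lambda: obj.lazy_attr, number=100000))
print(timeit.timeit(lambda: obj.cached_attr, number=100000))
```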

tensorflow/python/keras/engine/network.py

@@ -50,6 +50,7 @@ from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.training.tracking import data_structures
 from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import serialization

@@ -513,6 +514,7 @@ class Network(base_layer.Layer):
     return weights

   @property
+  @tracking.cached_per_instance
   def _should_compute_mask(self):
     return self._is_graph_network and super(Network, self)._should_compute_mask

tensorflow/python/training/tracking/tracking.py

@@ -17,6 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import functools
+import weakref
+
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as defun
 from tensorflow.python.framework import dtypes
@@ -241,6 +244,100 @@ class TrackableAsset(base.Trackable):
     """Fetch the current asset path."""
     return self._path


+def cached_per_instance(f):
+  """Lightweight decorator for caching lazily constructed properties.
+
+  When to use:
+  This decorator provides simple caching with minimal overhead. It is designed
+  for properties which are expensive to compute and static over the life of a
+  class instance, and it provides no mechanism for cache invalidation. Thus it
+  is best suited for lazily exposing derived properties of other static data.
+
+  For classes with custom getattr / setattr behavior (such as trackable
+  objects), storing cache results as object attributes is not performant.
+  Instead, a specialized cache can significantly reduce property lookup
+  overhead, while still allowing the decorated property to be lazily computed.
+
+  Consider the following class:
+
+  ```
+  class MyClass(object):
+    def __setattr__(self, key, value):
+      # Some expensive class specific code
+      # ...
+      # ...
+      super(MyClass, self).__setattr__(key, value)
+
+    @property
+    def thing(self):
+      # `thing` is expensive to compute (and may not even be requested), so we
+      # want to lazily compute it and then cache it.
+      output = getattr(self, '_thing', None)
+      if output is None:
+        self._thing = output = compute_thing(self)
+      return output
+  ```
+
+  It is also worth noting that ANY override of __setattr__, even one as
+  simple as:
+
+  ```
+  def __setattr__(self, key, value):
+    super(MyClass, self).__setattr__(key, value)
+  ```
+
+  slows down attribute assignment by nearly 10x.
+
+  By contrast, replacing the definition of `thing` with the following sidesteps
+  the expensive __setattr__ altogether:
+
+  ```
+  @property
+  @tracking.cached_per_instance
+  def thing(self):
+    # `thing` is expensive to compute (and may not even be requested), so we
+    # want to lazily compute it and then cache it.
+    return compute_thing(self)
+  ```
+
+  Performance:
+  The overhead of this decorator is ~0.4 us / call. A much lower overhead
+  implementation (~0.085 us / call) can be achieved by using a custom dict
+  type:
+
+  ```
+  def dict_based_cache(f):
+    class Cache(dict):
+      __slots__ = ()
+      def __missing__(self, key):
+        self[key] = output = f(key)
+        return output
+    return property(Cache().__getitem__)
+  ```
+
+  However, that implementation holds class instances as keys and therefore
+  blocks garbage collection. (Modifying it to use weakrefs as keys raises the
+  lookup overhead to ~0.4 us.) As a result, the WeakKeyDictionary
+  implementation below turns out to be more prudent.
+
+  Args:
+    f: The function to cache.
+
+  Returns:
+    f decorated with simple caching behavior.
+  """
+  cache = weakref.WeakKeyDictionary()
+
+  @functools.wraps(f)
+  def wrapped(item):
+    output = cache.get(item)
+    if output is None:
+      cache[item] = output = f(item)
+    return output
+
+  return wrapped
+
+
 ops.register_tensor_conversion_function(
     TrackableAsset,
     lambda asset, **kw: ops.internal_convert_to_tensor(asset.asset_path, **kw))
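A quick sketch of the garbage-collection behavior that motivated the WeakKeyDictionary. The `.cache` attribute is exposed here purely so the demo can inspect it; the decorator above does not expose its cache. Assumes CPython, where refcounting reclaims the instance as soon as `del` drops the last reference:

```
import weakref


def cached_per_instance(f):
  cache = weakref.WeakKeyDictionary()
  def wrapped(item):
    output = cache.get(item)
    if output is None:
      cache[item] = output = f(item)
    return output
  wrapped.cache = cache  # Demo-only: expose the cache for inspection.
  return wrapped


class Thing(object):

  @property
  @cached_per_instance
  def value(self):
    return object()  # Stand-in for an expensive computation.


t = Thing()
_ = t.value
print(len(Thing.value.fget.cache))  # 1: entry held while t is alive.
del t
print(len(Thing.value.fget.cache))  # 0: entry vanished with the instance.
```

A plain dict cache would keep `t` alive forever here; the weak keys are what let cached instances die normally.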

tensorflow/python/training/tracking/tracking_test.py

@@ -16,9 +16,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import collections
+import contextlib
+import multiprocessing.dummy
 import os
+import pickle
+import time
+import timeit

-import numpy
+import numpy as np
 import six

 from tensorflow.python.framework import test_util
@@ -32,6 +38,23 @@ from tensorflow.python.training.tracking import util
 from tensorflow.python.util import nest


+_PICKLEABLE_CALL_COUNT = collections.Counter()
+
+
+class MyPickleableObject(tracking.AutoTrackable):
+  """Needed for InterfaceTests.test_property_cache_serialization.
+
+  This class must be at the top level. This is a constraint of pickle,
+  unrelated to `cached_per_instance`.
+  """
+
+  @property
+  @tracking.cached_per_instance
+  def my_id(self):
+    _PICKLEABLE_CALL_COUNT[self] += 1
+    return id(self)
+
+
 class InterfaceTests(test.TestCase):

   def testMultipleAssignment(self):
@@ -199,15 +222,129 @@ class InterfaceTests(test.TestCase):

   @test_util.run_in_graph_and_eager_modes
   def testAssertions(self):
     a = tracking.AutoTrackable()
-    a.l = {"k": [numpy.zeros([2, 2])]}
-    self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
+    a.l = {"k": [np.zeros([2, 2])]}
+    self.assertAllEqual(nest.flatten({"k": [np.zeros([2, 2])]}),
                         nest.flatten(a.l))
-    self.assertAllClose({"k": [numpy.zeros([2, 2])]}, a.l)
-    nest.map_structure(self.assertAllClose, a.l, {"k": [numpy.zeros([2, 2])]})
+    self.assertAllClose({"k": [np.zeros([2, 2])]}, a.l)
+    nest.map_structure(self.assertAllClose, a.l, {"k": [np.zeros([2, 2])]})
     a.tensors = {"k": [array_ops.ones([2, 2]), array_ops.zeros([3, 3])]}
-    self.assertAllClose({"k": [numpy.ones([2, 2]), numpy.zeros([3, 3])]},
+    self.assertAllClose({"k": [np.ones([2, 2]), np.zeros([3, 3])]},
                         self.evaluate(a.tensors))
+
+  def test_property_cache(self):
+    test_counter = collections.Counter()
+
+    class MyObject(tracking.AutoTrackable):
+
+      def __init__(self):
+        super(MyObject, self).__init__()
+        self._frozen = True
+
+      def __setattr__(self, key, value):
+        """Enforce that cache does not set attribute on MyObject."""
+        if getattr(self, "_frozen", False):
+          raise ValueError("Cannot mutate when frozen.")
+        return super(MyObject, self).__setattr__(key, value)
+
+      @property
+      @tracking.cached_per_instance
+      def test_property(self):
+        test_counter[id(self)] += 1
+        return id(self)
+
+    first_object = MyObject()
+    second_object = MyObject()
+
+    # Make sure the objects return the correct values
+    self.assertEqual(first_object.test_property, id(first_object))
+    self.assertEqual(second_object.test_property, id(second_object))
+
+    # Make sure the cache does not share across objects
+    self.assertNotEqual(first_object.test_property,
+                        second_object.test_property)
+
+    # Check again (Now the values should be cached.)
+    self.assertEqual(first_object.test_property, id(first_object))
+    self.assertEqual(second_object.test_property, id(second_object))
+
+    # Count the function calls to make sure the cache is actually being used.
+    self.assertAllEqual(tuple(test_counter.values()), (1, 1))
+
+  def test_property_cache_threaded(self):
+    call_count = collections.Counter()

+    class MyObject(tracking.AutoTrackable):
+
+      @property
+      @tracking.cached_per_instance
+      def test_property(self):
+        # Random sleeps to ensure that the execution thread changes
+        # mid-computation.
+        call_count["test_property"] += 1
+        time.sleep(np.random.random() + 1.)
+
+        # Use a RandomState which is seeded off the instance's id (the mod is
+        # because numpy limits the range of seeds) to ensure that an instance
+        # returns the same value in different threads, but different instances
+        # return different values.
+        return int(np.random.RandomState(id(self) % (2 ** 31)).randint(2 ** 16))
+
+      def get_test_property(self, _):
+        """Function provided to .map for threading test."""
+        return self.test_property
+
+    # Test that multiple threads return the same value. This requires that
+    # the underlying function is repeatable, as cached_property makes no
+    # attempt to prioritize the first call.
+    test_obj = MyObject()
+    with contextlib.closing(multiprocessing.dummy.Pool(32)) as pool:
+      # Intentionally make a large pool (even when there are only a small
+      # number of cpus) to ensure that the runtime switches threads.
+      results = pool.map(test_obj.get_test_property, range(64))
+    self.assertEqual(len(set(results)), 1)
+
+    # Make sure we actually are testing threaded behavior.
+    self.assertGreater(call_count["test_property"], 1)
+
+    # Make sure new threads still cache hit.
+    with contextlib.closing(multiprocessing.dummy.Pool(2)) as pool:
+      start_time = timeit.default_timer()  # Don't time pool instantiation.
+      results = pool.map(test_obj.get_test_property, range(4))
+    total_time = timeit.default_timer() - start_time
+
+    # Note(taylorrobie): The reason that it is safe to time a unit test is
+    #                    that a cache hit will be << 1 second, and a cache
+    #                    miss is guaranteed to be >= 1 second. Empirically
+    #                    confirmed by 100,000 runs with no flakes.
+    self.assertLess(total_time, 0.95)
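As the comments above note, `cached_per_instance` takes no lock: two threads that miss the cache at the same time may both run the decorated function, which is why this test requires the function to be repeatable. Where at-most-once execution mattered, a locked variant along these lines could be used (a hedged sketch, not part of this commit):

```
import threading
import weakref


def locked_cached_per_instance(f):
  cache = weakref.WeakKeyDictionary()
  lock = threading.Lock()
  def wrapped(item):
    output = cache.get(item)
    if output is None:
      with lock:
        output = cache.get(item)  # Re-check: another thread may have filled it.
        if output is None:
          cache[item] = output = f(item)
    return output
  return wrapped
```

The trade-off is the lock acquisition on every cache miss, which the unlocked version avoids in exchange for occasionally recomputing.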
+
+  def test_property_cache_serialization(self):
+    # Reset call count. .keys() must be wrapped in a list, because otherwise
+    # we would mutate the iterator while iterating.
+    for k in list(_PICKLEABLE_CALL_COUNT.keys()):
+      _PICKLEABLE_CALL_COUNT.pop(k)
+
+    first_instance = MyPickleableObject()
+    self.assertEqual(id(first_instance), first_instance.my_id)
+
+    # Test that we can pickle and un-pickle
+    second_instance = pickle.loads(pickle.dumps(first_instance))
+    self.assertEqual(id(second_instance), second_instance.my_id)
+    self.assertNotEqual(first_instance.my_id, second_instance.my_id)
+
+    # Make sure de-serialized object uses the cache.
+    self.assertEqual(_PICKLEABLE_CALL_COUNT[second_instance], 1)
+
+    # Make sure the decorator cache is not being serialized with the object.
+    expected_size = len(pickle.dumps(second_instance))
+    for _ in range(5):
+      # Add some more entries to the cache.
+      _ = MyPickleableObject().my_id
+    self.assertEqual(len(_PICKLEABLE_CALL_COUNT), 7)
+    size_check_instance = MyPickleableObject()
+    _ = size_check_instance.my_id
+    self.assertEqual(expected_size, len(pickle.dumps(size_check_instance)))
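The serialization test passes because the cache lives in the decorator's closure rather than in the instance's `__dict__`, so pickle never sees it. A minimal sketch of that invariant, reusing `MyPickleableObject` from above:

```
obj = MyPickleableObject()
_ = obj.my_id                       # Populates the closure-held cache.
assert "my_id" not in obj.__dict__  # Nothing was written onto the instance.
```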
 class _DummyResource(tracking.TrackableResource):