Handle garbage collection race condition.

An exception is being thrown when objects that use `CapturableResourceDeleter` are garbage collected at the end of a program's life.  This can happen in very normal circumstances, such as when using `saved_model_cli` to inspect a model.

The cause of the exception appears to be a race condition with garbage collection between `CapturableResourceDeleter` and `ScopedTFFunction`. Both define a custom finalizer (`__del__`); `CapturableResourceDeleter`'s finalizer ultimately calls a concrete function, which calls an `_EagerDefinedFunction`, which in turn attempts to load and execute a `ScopedTFFunction`.

When multiple objects in a reference cycle all become unreachable during the same garbage collection cycle, there is no guaranteed order in which they are collected. In the case of the exception, `ScopedTFFunction` is collected first and its underlying function is deleted. Later, `CapturableResourceDeleter`'s finalizer runs and fails, since the function it's trying to call is gone.

PiperOrigin-RevId: 358292164
Change-Id: I9162d5de622f5c1ec9b2954647b9958a7d3d87b6
This commit is contained in:
Daniel Ellis 2021-02-18 16:34:42 -08:00 committed by TensorFlower Gardener
parent 50ea65ffda
commit c29e9f25e7
4 changed files with 57 additions and 2 deletions
tensorflow/python
eager
framework
saved_model
training/tracking

View File

@ -417,6 +417,14 @@ class _EagerDefinedFunctionDeleter(object):
# been unloaded. Will catch other module unloads as well.
class FunctionAlreadyGarbageCollectedError(Exception):
  """Raised when a function is called after it has been garbage collected.

  Args:
    function_name: Name of the function that can no longer be called; it is
      interpolated into the exception message.
  """

  def __init__(self, function_name):
    template = "{} has already been garbage collected and cannot be called."
    super(FunctionAlreadyGarbageCollectedError, self).__init__(
        template.format(function_name))
# TODO(apassos) get rid of this by splitting framework.function._DefinedFunction
# so it doesn't have the definition-generating logic and is just a container for
# an already-defined function.
@ -551,6 +559,8 @@ class _EagerDefinedFunction(object):
Raises:
ValueError: if the number of arguments is incorrect.
FunctionAlreadyGarbageCollectedError: if the function is no longer
available to be called because it has been garbage collected.
"""
if len(args) != len(self.signature.input_arg):
raise ValueError(
@ -558,6 +568,14 @@ class _EagerDefinedFunction(object):
"got: %s, expected: %s " %
(len(args), len(list(self.signature.input_arg))))
# If the `ScopedTFFunction` (accessed via `_c_func`) has already been
# cleaned up as a part of garbage collection, this `_EagerDefinedFunction`
# should also be garbage and is likely being called as part of a `__del__`
# elsewhere. In that case, there's nothing we can do, so we raise an
# exception for the caller to handle.
if self._c_func.has_been_garbage_collected:
raise FunctionAlreadyGarbageCollectedError(self.name)
function_call_options = ctx.function_call_options
if function_call_options.config_proto_serialized is None:
config = function_utils.get_disabled_rewriter_config()

View File

@ -101,8 +101,12 @@ class ScopedTFFunction(object):
# Function at shutdown, which satisfies leak checkers.
self.deleter = c_api.TF_DeleteFunction
@property
def has_been_garbage_collected(self):
  """True once `self.func` has been cleared to None."""
  handle = self.func
  return handle is None
# Finalizer: passes `self.func` to `self.deleter` while the handle is still
# set, then sets `self.func` to None so `has_been_garbage_collected` reports
# True afterwards.
def __del__(self):
# NOTE(review): this line and the next look like the removed and added sides
# of the same diff line (the page shows no +/- markers) - confirm that only
# the `has_been_garbage_collected` check remains in the committed file.
if self.func is not None:
if not self.has_been_garbage_collected:
self.deleter(self.func)
self.func = None

View File

@ -19,7 +19,10 @@ from __future__ import division
from __future__ import print_function
import collections
import contextlib
import functools
import gc
import io
import os
import sys
import tempfile
@ -2195,6 +2198,26 @@ class SingleCycleTests(test.TestCase, parameterized.TestCase):
previous_concrete_function = new_concrete_function
def test_garbage_collection_capturable_resource_doesnt_raise_exception(self):
  """Collecting a loaded table model must not print ignored exceptions."""
  model = module.Module()
  model.mapping = lookup_ops.StaticHashTable(
      lookup_ops.KeyValueTensorInitializer(
          keys=math_ops.range(1, dtype=dtypes.int32),
          values=["foo"]),
      "default_value")
  loaded = cycle(model, 1)
  del model
  del loaded

  # Exceptions raised during garbage collection are simply printed to stderr
  # and ignored, and we have no way to access them. We'll capture stderr
  # during the garbage collection process and inspect it to see if any
  # exceptions were raised.
  stderr = io.StringIO()
  with contextlib.redirect_stderr(stderr):
    gc.collect()

  # Use the unittest assertion so a regression is reported as a test failure
  # (with the captured output in the message) rather than a generic error.
  self.assertNotIn("Exception ignored in", stderr.getvalue())
if __name__ == "__main__":
test.main()

View File

@ -205,7 +205,17 @@ class CapturableResourceDeleter(object):
# Finalizer: when `self._destroy_resource` is truthy, calls it inside
# `self._destruction_context()`.
def __del__(self):
if self._destroy_resource:
with self._destruction_context():
# NOTE(review): the bare call on the next line looks like the removed side
# of the diff hunk, superseded by the try/except that follows - confirm.
self._destroy_resource()
try:
self._destroy_resource()
# There is a race condition between this and `ScopedTFFunction`
# whereby if an entire garbage collection chain containing both
# objects is moved to unreachable during the same garbage collection
# cycle, the __del__ for `ScopedTFFunction` can run before
# this method is called. In that case, we can't do much but
# continue.
except defun.FunctionAlreadyGarbageCollectedError:
pass
class CapturableResource(base.Trackable):