Set default check health initial timeout to infinite

We now exchange device incarnation at the barrier (group resolution), so
failures can no longer occur between the barrier and the incarnation exchange.
That used to be possible and could cause a deadlock if the initial check health
had no timeout.

PiperOrigin-RevId: 330842625
Change-Id: I407ec93ba924b63d41a4ddee7e0050068f822f7e
This commit is contained in:
Ran Chen 2020-09-09 18:34:40 -07:00 committed by TensorFlower Gardener
parent d59bdf5493
commit 8fc390aa96
2 changed files with 4 additions and 7 deletions

View File

@ -189,10 +189,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
_check_health_interval = 30
# Timeout in seconds for the first check health. The first check health needs
# to wait for the cluster, which may take a longer time.
#
# TODO(b/151232436): now the initial barrier may hang in a rare case, so we
# need a finite timeout.
_check_health_initial_timeout = 1200
_check_health_initial_timeout = 0
# Times to retry before considering the peer is down.
_check_health_retry_limit = 3
@ -683,8 +680,8 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
#
# TODO(b/151232436): change to an explicit barrier if we have it.
dummy_value = ops.convert_to_tensor([])
logging.info("Waiting for the cluster, timeout = %d",
self._check_health_initial_timeout)
logging.info("Waiting for the cluster, timeout = %s",
self._check_health_initial_timeout or "inf")
try:
self._host_cross_device_ops.reduce(
reduce_util.ReduceOp.SUM,

View File

@ -36,7 +36,7 @@ from tensorflow.python.eager import test
# Put it in top level so it executes in the child processes as well.
mwms_lib.CollectiveAllReduceExtended._enable_check_health = True
mwms_lib.CollectiveAllReduceExtended._check_health_interval = 3
mwms_lib.CollectiveAllReduceExtended._check_health_initial_timeout = 6
mwms_lib.CollectiveAllReduceExtended._check_health_initial_timeout = 0
def get_attempt(strategy, attempts):