Set default check health initial timeout to infinite
We now exchange device incarnations at the barrier (group resolution), so failures can no longer occur between the barrier and the incarnation exchange. Such failures used to be possible and could cause a deadlock if the initial health check had no timeout. PiperOrigin-RevId: 330842625 Change-Id: I407ec93ba924b63d41a4ddee7e0050068f822f7e
This commit is contained in:
parent
d59bdf5493
commit
8fc390aa96
@ -189,10 +189,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
|
||||
_check_health_interval = 30
|
||||
# Timeout in seconds for the first check health. The first check health needs
|
||||
# to wait for the cluster, which may take a longer time.
|
||||
#
|
||||
# TODO(b/151232436): now the initial barrier may hang in a rare case, so we
|
||||
# need a finite timeout.
|
||||
_check_health_initial_timeout = 1200
|
||||
_check_health_initial_timeout = 0
|
||||
# Times to retry before considering the peer is down.
|
||||
_check_health_retry_limit = 3
|
||||
|
||||
@ -683,8 +680,8 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
|
||||
#
|
||||
# TODO(b/151232436): change to an explicit barrier if we have it.
|
||||
dummy_value = ops.convert_to_tensor([])
|
||||
logging.info("Waiting for the cluster, timeout = %d",
|
||||
self._check_health_initial_timeout)
|
||||
logging.info("Waiting for the cluster, timeout = %s",
|
||||
self._check_health_initial_timeout or "inf")
|
||||
try:
|
||||
self._host_cross_device_ops.reduce(
|
||||
reduce_util.ReduceOp.SUM,
|
||||
|
@ -36,7 +36,7 @@ from tensorflow.python.eager import test
|
||||
# Put it in top level so it executes in the child processes as well.
|
||||
mwms_lib.CollectiveAllReduceExtended._enable_check_health = True
|
||||
mwms_lib.CollectiveAllReduceExtended._check_health_interval = 3
|
||||
mwms_lib.CollectiveAllReduceExtended._check_health_initial_timeout = 6
|
||||
mwms_lib.CollectiveAllReduceExtended._check_health_initial_timeout = 0
|
||||
|
||||
|
||||
def get_attempt(strategy, attempts):
|
||||
|
Loading…
Reference in New Issue
Block a user