From b55673b490ecfbdf94c85b86605fc6efa153abbd Mon Sep 17 00:00:00 2001 From: Russell Power Date: Tue, 19 Mar 2019 09:52:01 -0700 Subject: [PATCH] TPUEstimator: Wait for workers to shut down before continuing execution. PiperOrigin-RevId: 239210831 --- tensorflow/python/tpu/session_support.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/tpu/session_support.py b/tensorflow/python/tpu/session_support.py index 0cca8aeb55b..d74cd49f92c 100644 --- a/tensorflow/python/tpu/session_support.py +++ b/tensorflow/python/tpu/session_support.py @@ -148,10 +148,12 @@ class WorkerHeartbeatManager(object): shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR) self.configure(req) - # Wait for workers to shutdown. This isn't strictly required - # but it avoids triggering multiple checkpoints with the same lame worker. - logging.info('Waiting %dms for worker shutdown.', timeout_ms) - time.sleep(timeout_ms / 1000) + # Wait for workers to shutdown. If we continue immediately, we can create a + # new heartbeat manager before the workers shutdown: this keeps the workers + # alive and can introduce confusing behavior. + sleep_sec = 10.0 + timeout_ms / 1000 + logging.info('Waiting %.2f seconds for worker shutdown.', sleep_sec) + time.sleep(sleep_sec) def all_worker_devices(session):