Use try_run_and_except_connection_error on multi_worker_continuous_run_test to avoid flakiness.

TODO: Configure fail_fast flag so this is solved in all multi_worker_runner cases.
PiperOrigin-RevId: 279217412
Change-Id: I46b3a07538ef87e26cff00c1760aa72c745ec7d2
This commit is contained in:
Rick Chao 2019-11-07 19:03:23 -08:00 committed by TensorFlower Gardener
parent 1861a7b716
commit 4f42698fdc

View File

@@ -26,6 +26,7 @@ import numpy as np
from tensorflow.python.distribute import collective_all_reduce_strategy
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_process_runner_util
from tensorflow.python.distribute import multi_worker_test_base as test_base
from tensorflow.python.distribute import reduce_util
from tensorflow.python.eager import context
@@ -75,9 +76,11 @@ class MultiWorkerContinuousRunTest(test.TestCase, parameterized.TestCase):
for _ in range(100):
worker_step_fn()
multi_process_runner.MultiProcessRunner().run(
worker_fn,
cluster_spec=test_base.create_cluster_spec(num_workers=num_workers))
# TODO(b/141948186): Remove this `with` block once b/141948186 is resolved.
with multi_process_runner_util.try_run_and_except_connection_error(self):
multi_process_runner.MultiProcessRunner().run(
worker_fn,
cluster_spec=test_base.create_cluster_spec(num_workers=num_workers))
if __name__ == '__main__':