From 4f42698fdc71b4578366f403fbc05dd09bd575ee Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Thu, 7 Nov 2019 19:03:23 -0800 Subject: [PATCH] Use try_run_and_except_connection_error on multi_worker_continuous_run_test to avoid flakiness. TODO: Configure fail_fast flag so this is solved in all multi_worker_runner cases. PiperOrigin-RevId: 279217412 Change-Id: I46b3a07538ef87e26cff00c1760aa72c745ec7d2 --- .../distribute/multi_worker_continuous_run_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/distribute/multi_worker_continuous_run_test.py b/tensorflow/python/distribute/multi_worker_continuous_run_test.py index 8785b56d1b9..19790a0d69f 100644 --- a/tensorflow/python/distribute/multi_worker_continuous_run_test.py +++ b/tensorflow/python/distribute/multi_worker_continuous_run_test.py @@ -26,6 +26,7 @@ import numpy as np from tensorflow.python.distribute import collective_all_reduce_strategy from tensorflow.python.distribute import combinations from tensorflow.python.distribute import multi_process_runner +from tensorflow.python.distribute import multi_process_runner_util from tensorflow.python.distribute import multi_worker_test_base as test_base from tensorflow.python.distribute import reduce_util from tensorflow.python.eager import context @@ -75,9 +76,11 @@ class MultiWorkerContinuousRunTest(test.TestCase, parameterized.TestCase): for _ in range(100): worker_step_fn() - multi_process_runner.MultiProcessRunner().run( - worker_fn, - cluster_spec=test_base.create_cluster_spec(num_workers=num_workers)) + # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved. + with multi_process_runner_util.try_run_and_except_connection_error(self): + multi_process_runner.MultiProcessRunner().run( + worker_fn, + cluster_spec=test_base.create_cluster_spec(num_workers=num_workers)) if __name__ == '__main__':