Use try_run_and_except_connection_error in multi_worker_continuous_run_test to avoid flakiness.

TODO: Configure the fail_fast flag so that this is solved in all multi_worker_runner cases.

PiperOrigin-RevId: 279217412
Change-Id: I46b3a07538ef87e26cff00c1760aa72c745ec7d2
This commit is contained in:
parent
1861a7b716
commit
4f42698fdc
@@ -26,6 +26,7 @@ import numpy as np
|
||||
from tensorflow.python.distribute import collective_all_reduce_strategy
|
||||
from tensorflow.python.distribute import combinations
|
||||
from tensorflow.python.distribute import multi_process_runner
|
||||
from tensorflow.python.distribute import multi_process_runner_util
|
||||
from tensorflow.python.distribute import multi_worker_test_base as test_base
|
||||
from tensorflow.python.distribute import reduce_util
|
||||
from tensorflow.python.eager import context
|
||||
@@ -75,9 +76,11 @@ class MultiWorkerContinuousRunTest(test.TestCase, parameterized.TestCase):
|
||||
for _ in range(100):
|
||||
worker_step_fn()
|
||||
|
||||
multi_process_runner.MultiProcessRunner().run(
|
||||
worker_fn,
|
||||
cluster_spec=test_base.create_cluster_spec(num_workers=num_workers))
|
||||
# TODO(b/141948186): Remove this `with` block once b/141948186 is resolved.
|
||||
with multi_process_runner_util.try_run_and_except_connection_error(self):
|
||||
multi_process_runner.MultiProcessRunner().run(
|
||||
worker_fn,
|
||||
cluster_spec=test_base.create_cluster_spec(num_workers=num_workers))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
Loading…
x
Reference in New Issue
Block a user