From 80999cc5086c635ad8ca564c648eaed45c99ecbc Mon Sep 17 00:00:00 2001 From: Haoyu Zhang <haoyuzhang@google.com> Date: Mon, 19 Oct 2020 18:35:15 -0700 Subject: [PATCH] Remove catching InternalError for failed tensor copies as worker failures. PiperOrigin-RevId: 337976482 Change-Id: Ic385376d39cd6928b39dea985a300a0be14bcc74 --- tensorflow/python/distribute/client/client.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tensorflow/python/distribute/client/client.py b/tensorflow/python/distribute/client/client.py index be7157c1fea..a5eb9ff5e21 100644 --- a/tensorflow/python/distribute/client/client.py +++ b/tensorflow/python/distribute/client/client.py @@ -1180,14 +1180,8 @@ def _is_worker_failure(error): # remote_handle" part. return True - # TODO(b/162541228): The following 3 types of errors are very rare and only + # TODO(b/162541228): The following 2 types of errors are very rare and only # observed in large-scale testing. The types of errors should be reduced. - # This error could show up when copying function inputs from remote tasks. - if isinstance(error, errors.InternalError): - if ("Failed copying input tensor" in str(error) or - "Unable to find a context_id" in str(error)): - return True - # This could happen when the function registration fails. In the observed # cases this only happens to the dataset related functions. if isinstance(error, errors.NotFoundError):