Update error message upon a preemption error to highlight a potential gRPC failure and suggest increasing the number of parameter servers.

PiperOrigin-RevId: 214077622
This commit is contained in:
A. Unique TensorFlower 2018-09-21 19:08:11 -07:00 committed by TensorFlower Gardener
parent 812d5505f5
commit 174e782ded

View File

@@ -1114,7 +1114,11 @@ class _RecoverableSession(_WrappedSession):
logging.info('An error was raised while a session was being created. '
'This may be due to a preemption of a connected worker '
'or parameter server. A new session will be created. '
'Error: %s', e)
'This error may also occur due to a gRPC failure caused '
'by high memory or network bandwidth usage in the '
'parameter servers. If this error occurs repeatedly, try '
'increasing the number of parameter servers assigned to '
'the job. Error: %s', e)
def _check_stop(self):
try:
@@ -1127,7 +1131,11 @@ class _RecoverableSession(_WrappedSession):
'session is complete. This may be due to a preemption in '
'a connected worker or parameter server. The current '
'session will be closed and a new session will be '
'created. Error: %s', e)
'created. This error may also occur due to a gRPC failure '
'caused by high memory or network bandwidth usage in the '
'parameter servers. If this error occurs repeatedly, try '
'increasing the number of parameter servers assigned to '
'the job. Error: %s', e)
self.close()
self._sess = self._create_session()
# Since we have just recreated the session, the overall computation should
@@ -1150,7 +1158,11 @@ class _RecoverableSession(_WrappedSession):
logging.info('An error was raised. This may be due to a preemption in '
'a connected worker or parameter server. The current '
'session will be closed and a new session will be '
'created. Error: %s', e)
'created. This error may also occur due to a gRPC failure '
'caused by high memory or network bandwidth usage in the '
'parameter servers. If this error occurs repeatedly, try '
'increasing the number of parameter servers assigned to '
'the job. Error: %s', e)
self.close()
self._sess = None
@@ -1166,7 +1178,11 @@ class _RecoverableSession(_WrappedSession):
logging.info('An error was raised. This may be due to a preemption in '
'a connected worker or parameter server. The current '
'session will be closed and a new session will be '
'created. Error: %s', e)
'created. This error may also occur due to a gRPC failure '
'caused by high memory or network bandwidth usage in the '
'parameter servers. If this error occurs repeatedly, try '
'increasing the number of parameter servers assigned to '
'the job. Error: %s', e)
self.close()
self._sess = None