Fix a hang in the TensorBoard callback by passing step=0 to summary_ops_v2.graph().

The hang happens because the TensorBoard callback uses summary_ops_v2 to write the model graph, and summary_ops_v2.graph() does not actually need a global step but creates a global-step variable anyway (a consequence of how event log files are structured). In multi-worker training the chief worker then has to broadcast that variable's initial value to the other workers through collective_ops, but the other workers never issue the matching receive call because they lack the TensorBoard callback (which, like ModelCheckpoint, is marked _chief_worker_only=True). Passing step=0 explicitly avoids creating the variable, so no broadcast is needed.

PiperOrigin-RevId: 242224121
Rick Chao 2019-04-05 17:31:57 -07:00 committed by TensorFlower Gardener
parent 009fde6645
commit 2147192998
2 changed files with 2 additions and 2 deletions
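
For context, a minimal sketch (not part of this commit) of the fixed graph-writing pattern, assuming an eager context and the internal module paths used by the Keras callbacks; the log directory is a placeholder:

    from tensorflow.python.keras import backend as K
    from tensorflow.python.ops import summary_ops_v2

    # Placeholder log directory for illustration.
    writer = summary_ops_v2.create_file_writer('/tmp/tb_logs')
    with writer.as_default(), summary_ops_v2.always_record_summaries():
      # An explicit step means graph() no longer creates an implicit
      # global-step variable, which is what triggered the unmatched
      # collective broadcast on non-chief workers.
      summary_ops_v2.graph(K.get_graph(), step=0)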


@@ -1304,7 +1304,7 @@ class TensorBoard(Callback):
         with self._get_writer(self._train_run_name).as_default():
           with summary_ops_v2.always_record_summaries():
             if not model.run_eagerly:
-              summary_ops_v2.graph(K.get_graph())
+              summary_ops_v2.graph(K.get_graph(), step=0)
 
             summary_writable = (
                 self.model._is_graph_network or  # pylint: disable=protected-access


@@ -168,7 +168,7 @@ class TensorBoard(callbacks.Callback):
       self.writer = summary_ops_v2.create_file_writer(self.log_dir)
       if not model.run_eagerly and self.write_graph:
         with self.writer.as_default():
-          summary_ops_v2.graph(K.get_graph())
+          summary_ops_v2.graph(K.get_graph(), step=0)
     elif self.write_graph:
       self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
     else:
else: else: