Fix the hang in the TensorBoard callback by passing step=0 to summary_ops_v2.graph(). The hang is caused by the TensorBoard callback using summary_ops_v2 to write a graph. summary_ops_v2.graph() doesn't actually need a global step variable, but it creates one anyway (due to the structure of event log files). This causes a problem because the chief worker needs to send the variable's initial value through collective_ops to the other workers, but the other workers have no corresponding receive call, because they lack the TensorBoard callback with ModelCheckpoint._chief_worker_only=True.
PiperOrigin-RevId: 242224121
This commit is contained in:
parent
009fde6645
commit
2147192998
@ -1304,7 +1304,7 @@ class TensorBoard(Callback):
|
|||||||
with self._get_writer(self._train_run_name).as_default():
|
with self._get_writer(self._train_run_name).as_default():
|
||||||
with summary_ops_v2.always_record_summaries():
|
with summary_ops_v2.always_record_summaries():
|
||||||
if not model.run_eagerly:
|
if not model.run_eagerly:
|
||||||
summary_ops_v2.graph(K.get_graph())
|
summary_ops_v2.graph(K.get_graph(), step=0)
|
||||||
|
|
||||||
summary_writable = (
|
summary_writable = (
|
||||||
self.model._is_graph_network or # pylint: disable=protected-access
|
self.model._is_graph_network or # pylint: disable=protected-access
|
||||||
|
@ -168,7 +168,7 @@ class TensorBoard(callbacks.Callback):
|
|||||||
self.writer = summary_ops_v2.create_file_writer(self.log_dir)
|
self.writer = summary_ops_v2.create_file_writer(self.log_dir)
|
||||||
if not model.run_eagerly and self.write_graph:
|
if not model.run_eagerly and self.write_graph:
|
||||||
with self.writer.as_default():
|
with self.writer.as_default():
|
||||||
summary_ops_v2.graph(K.get_graph())
|
summary_ops_v2.graph(K.get_graph(), step=0)
|
||||||
elif self.write_graph:
|
elif self.write_graph:
|
||||||
self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
|
self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user