Add profile logging to TPUEstimator.
PiperOrigin-RevId: 240469608
parent 3d2488f052
commit a26413ef0a
@@ -125,6 +125,7 @@ py_library(
         "__init__.py",
         "bfloat16.py",
         "device_assignment.py",
+        "profile_logger.py",
         "session_support.py",
         "tensor_tracer.py",
         "topology.py",
tensorflow/python/tpu/profile_logger.py (new file, 69 lines)
@@ -0,0 +1,69 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========================================================================
+"""A logger for profiling events."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path
+
+from tensorflow.core.framework.summary_pb2 import Summary
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary.writer import writer
+
+
+class ProfileLogger(object):
+  """For logging profiling events."""
+
+  def _set_summary_dir(self, model_dir):
+    """Sets the summary directory to be model_dir."""
+    if model_dir is None:
+      self._summary_dir = None
+      self._summary_writer = None
+      logging.warning('profile_logger: model_dir is None. '
+                      'So nowhere to write summaries')
+      return
+    self._summary_dir = os.path.join(model_dir, 'profile')
+    try:
+      self._summary_writer = writer.FileWriter(
+          logdir=self._summary_dir, filename_suffix='.profile_logger')
+      logging.info('profile_logger(): set the summary directory to %s',
+                   self._summary_dir)
+    except Exception:  # pylint: disable=broad-except
+      logging.warning('profile_logger(): failed to create %s',
+                      self._summary_dir)
+      self._summary_dir = None
+      self._summary_writer = None
+
+  def __init__(self, model_dir):
+    self._set_summary_dir(model_dir)
+
+  def log_event(self, event, phase):
+    """Logs the given event to the summary directory."""
+
+    event_name = 'profile/' + event + '_' + phase
+    if self._summary_writer is None:
+      logging.warning('profile_logger: cannot log event "%s" '
+                      'because of no summary directory', event_name)
+      return
+
+    # For now, we only need the event timestamp. No need to pass any value.
+    s = Summary(value=[Summary.Value(tag=event_name,
+                                     simple_value=0.0)])
+    self._summary_writer.add_summary(s)
+    self._summary_writer.flush()
+    logging.info('profile_logger: log event "%s"', event_name)
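For orientation, here is a minimal usage sketch (not part of this commit) that drives the new ProfileLogger directly. The temporary model_dir is illustrative, and the module is a TensorFlow-internal path rather than a public API.

    import tempfile

    from tensorflow.python.tpu import profile_logger

    # A scratch directory standing in for an estimator's model_dir.
    model_dir = tempfile.mkdtemp()
    logger = profile_logger.ProfileLogger(model_dir)

    # Each call writes a Summary proto tagged 'profile/<event>_<phase>' into
    # <model_dir>/profile; the event's wall time serves as the timestamp.
    logger.log_event('init_system', 'begin')
    logger.log_event('init_system', 'end')

    # With model_dir=None the logger only warns and log_event becomes a no-op.
    noop_logger = profile_logger.ProfileLogger(None)
    noop_logger.log_event('train', 'begin')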
@@ -62,6 +62,7 @@ from tensorflow.python.summary import summary
 from tensorflow.python.tpu import _tpu_estimator_embedding
 from tensorflow.python.tpu import error_handling
 from tensorflow.python.tpu import functional as tpu_functional
+from tensorflow.python.tpu import profile_logger
 from tensorflow.python.tpu import session_support
 from tensorflow.python.tpu import tensor_tracer
 from tensorflow.python.tpu import tpu
@@ -451,6 +452,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
                enqueue_ops,
                dequeue_ops,
                tpu_compile_op,
+               prof_logger,
                run_infeed_loop_on_coordinator=True,
                rendezvous=None,
                master=None,
@@ -478,6 +480,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     # initialization.
     self._should_initialize_tpu = not ctx.model_parallelism_enabled
     self._tpu_compile_op = tpu_compile_op
+    self._profile_logger = prof_logger

   def begin(self):
     logging.info('TPU job name %s', self._master_job)
@@ -540,6 +543,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     if self._should_initialize_tpu:
       logging.info('Init TPU system')
       start = time.time()
+      self._profile_logger.log_event('init_system', 'begin')
       with ops.Graph().as_default():
         with tf_session.Session(
             self._master, config=self._session_config) as sess:
@@ -547,6 +551,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
               tpu.initialize_system(
                   job=self._master_job,
                   embedding_config=self._embedding_layer_config))
+      self._profile_logger.log_event('init_system', 'end')
       logging.info('Initialized TPU in %d seconds', time.time() - start)

     session.run(self._init_ops,
@@ -593,13 +598,14 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):

 class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):

-  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op, prof_logger,
                rendezvous=None, master=None, session_config=None):
     super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
         ctx,
         enqueue_ops,
         dequeue_ops,
         tpu_compile_op=tpu_compile_op,
+        prof_logger=prof_logger,
         run_infeed_loop_on_coordinator=False,
         rendezvous=rendezvous,
         master=master,
@@ -2382,6 +2388,7 @@ class TPUEstimator(estimator_lib.Estimator):

     self._is_input_fn_invoked = None
     self._rendezvous = {}
+    self._profile_logger = profile_logger.ProfileLogger(self.model_dir)

   def _add_meta_graph_for_mode(self,
                                builder,
@@ -2711,6 +2718,7 @@ class TPUEstimator(estimator_lib.Estimator):
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
     try:
+      self._profile_logger.log_event('train', 'begin')
       return super(TPUEstimator, self).train(
           input_fn=input_fn,
           hooks=hooks,
@@ -2720,6 +2728,7 @@ class TPUEstimator(estimator_lib.Estimator):
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('training_loop', sys.exc_info())
     finally:
+      self._profile_logger.log_event('train', 'end')
       rendezvous.record_done('training_loop')
       rendezvous.raise_errors()

@@ -2732,6 +2741,7 @@ class TPUEstimator(estimator_lib.Estimator):
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
     try:
+      self._profile_logger.log_event('eval', 'begin')
       return super(TPUEstimator, self).evaluate(
           input_fn,
           steps=steps,
@@ -2741,6 +2751,7 @@ class TPUEstimator(estimator_lib.Estimator):
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('evaluation_loop', sys.exc_info())
     finally:
+      self._profile_logger.log_event('eval', 'end')
       rendezvous.record_done('evaluation_loop')
       rendezvous.raise_errors()

@@ -2753,6 +2764,7 @@ class TPUEstimator(estimator_lib.Estimator):
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
     try:
+      self._profile_logger.log_event('predict', 'begin')
       for result in super(TPUEstimator, self).predict(
           input_fn=input_fn,
           predict_keys=predict_keys,
@@ -2763,6 +2775,7 @@ class TPUEstimator(estimator_lib.Estimator):
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('prediction_loop', sys.exc_info())
     finally:
+      self._profile_logger.log_event('predict', 'end')
       rendezvous.record_done('prediction_loop')
       rendezvous.raise_errors()

@@ -2775,6 +2788,7 @@ class TPUEstimator(estimator_lib.Estimator):
     def _model_fn(features, labels, mode, config, params):
       """A Estimator `model_fn` for TPUEstimator."""

+      self._profile_logger.log_event('model_fn', 'begin')
       # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
       # but not in `export_savedmodel()`.
       if self._is_input_fn_invoked:
@@ -2814,6 +2828,7 @@ class TPUEstimator(estimator_lib.Estimator):
         if self._log_every_n_steps is not None:
           estimator_spec = estimator_spec._replace(
               training_hooks=estimator_spec.training_hooks + (examples_hook,))
+        self._profile_logger.log_event('model_fn', 'end')
         return estimator_spec

       assert labels is None, '`labels` passed to `model_fn` must be `None`.'
@@ -2830,10 +2845,12 @@ class TPUEstimator(estimator_lib.Estimator):
         tpu_init_ops.append(dummy_table_variables_init)

       input_holders = _InputPipeline(input_fn, batch_axis, ctx)
+      self._profile_logger.log_event('setup_infeed', 'begin')
       enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
           input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())

       graph = ops.get_default_graph()
+      self._profile_logger.log_event('setup_infeed', 'end')
       for enqueue_op in enqueue_ops:
         if isinstance(enqueue_op, list):
           graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
@@ -2897,6 +2914,7 @@ class TPUEstimator(estimator_lib.Estimator):
                 enqueue_ops,
                 host_ops,
                 tpu_compile_op=compile_op,
+                prof_logger=self._profile_logger,
                 run_infeed_loop_on_coordinator=(
                     run_infeed_loop_on_coordinator),
                 rendezvous=self._rendezvous[mode],
@@ -2947,6 +2965,7 @@ class TPUEstimator(estimator_lib.Estimator):
       train_op = control_flow_ops.group(*update_ops)
       graph.add_to_collection(_TPU_TRAIN_OP, train_op)

+      self._profile_logger.log_event('model_fn', 'end')
       return model_fn_lib.EstimatorSpec(
           mode,
           loss=loss,
@@ -3022,6 +3041,7 @@ class TPUEstimator(estimator_lib.Estimator):
                 enqueue_ops,
                 eval_update_ops + host_ops,
                 tpu_compile_op=compile_op,
+                prof_logger=self._profile_logger,
                 run_infeed_loop_on_coordinator=(
                     run_infeed_loop_on_coordinator),
                 rendezvous=self._rendezvous[mode],
@@ -3033,6 +3053,7 @@ class TPUEstimator(estimator_lib.Estimator):
       if eval_hooks:
         hooks.extend(eval_hooks)

+      self._profile_logger.log_event('model_fn', 'end')
       return model_fn_lib.EstimatorSpec(
           mode,
           loss=mean_loss,
@@ -3102,6 +3123,7 @@ class TPUEstimator(estimator_lib.Estimator):
           TPUInfeedOutfeedSessionHookForPrediction(
               ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
               tpu_compile_op=compile_op,
+              prof_logger=self._profile_logger,
               master=self._config.master,
               session_config=self._session_config),
       ] + input_hooks
@@ -3109,6 +3131,7 @@ class TPUEstimator(estimator_lib.Estimator):
       if prediction_hooks:
         hooks.extend(prediction_hooks)

+      self._profile_logger.log_event('model_fn', 'end')
       return model_fn_lib.EstimatorSpec(
           mode,
           prediction_hooks=hooks,
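With this wiring in place, TPUEstimator emits begin/end events for train, eval, predict, model_fn, init_system, and setup_infeed under <model_dir>/profile. As a hedged sketch (assuming a TF 1.x runtime contemporary with this change and a placeholder model_dir), the resulting event files can be read back with the standard summary_iterator to recover the timeline:

    import glob
    import os

    import tensorflow as tf  # assumes TF 1.x, matching this change

    model_dir = '/tmp/tpu_model'  # placeholder; use the estimator's model_dir
    profile_dir = os.path.join(model_dir, 'profile')

    # FileWriter was created with filename_suffix='.profile_logger', so the
    # event files under the profile directory end with that suffix.
    for events_file in glob.glob(os.path.join(profile_dir, '*.profile_logger')):
      for event in tf.train.summary_iterator(events_file):
        for value in event.summary.value:
          if value.tag.startswith('profile/'):
            # e.g. profile/train_begin 1553840000.12
            print(value.tag, event.wall_time)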