From b70247fbdd1fc220b3713bef9cb8bcd212f4424c Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Fri, 5 Apr 2019 16:06:35 -0700 Subject: [PATCH] [XLA:Python] Add tracemes around runtime actions. PiperOrigin-RevId: 242211983 --- tensorflow/compiler/xla/python/BUILD | 1 + tensorflow/compiler/xla/python/local_client.cc | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index e2dd12a5eb0..0aacf914903 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -137,6 +137,7 @@ tf_pybind_extension( "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:traceme", # Do NOT remove this dependency. The XLA Python extension must not # depend on any part of TensorFlow at runtime, **including** # libtensorflow_framework.so. The XLA module is deployed self-contained diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index 09899b030a4..3c747edc6b2 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" namespace xla { namespace xla_python { @@ -70,12 +71,14 @@ StatusOr GetLocalClient(const std::string& platform_name) { /* static */ StatusOr LocalShapedBuffer::FromPython( const py::object& argument, LocalClient* client, int device_ordinal) { - VLOG(1) << "Creating shaped buffer from literal on device ordinal: " - << device_ordinal; + tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::FromPython"); TF_ASSIGN_OR_RETURN(PythonBufferTree tree, GetPythonBufferTree(argument)); // We are done manipulating Python objects; release the GIL. py::gil_scoped_release gil_release; + VLOG(1) << "LocalShapedBuffer::FromPython: shape: " << tree.shape.ToString() << " device ordinal: " << device_ordinal; + DeviceMemoryAllocator* allocator = client->backend().memory_allocator(); TransferManager* transfer_manager = client->backend().transfer_manager(); TF_ASSIGN_OR_RETURN(ScopedShapedBuffer buffer, @@ -122,6 +125,7 @@ const Shape& LocalShapedBuffer::shape() const { } StatusOr LocalShapedBuffer::ToPython() const { + tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::ToPython"); auto literal = absl::make_unique(); { py::gil_scoped_release gil_release; @@ -132,6 +136,7 @@ StatusOr LocalShapedBuffer::ToPython() const { } StatusOr> LocalShapedBuffer::DestructureTuple() { + tensorflow::profiler::TraceMe traceme("LocalShapedBuffer::DestructureTuple"); const Shape tuple_shape = shape(); if (!tuple_shape.IsTuple()) { @@ -192,6 +197,7 @@ std::vector LocalExecutableWrapper::DeviceOrdinals() const { StatusOr LocalExecutableWrapper::Execute( absl::Span argument_handles) { + tensorflow::profiler::TraceMe traceme("LocalExecutable::Execute"); if (num_replicas() != 1) { return InvalidArgument( "Attempted to execute computation with %d replicas using Execute()", @@ -230,6 +236,7 @@ StatusOr LocalExecutableWrapper::Execute( StatusOr>
LocalExecutableWrapper::ExecutePerReplica( absl::Span> argument_handles) { + tensorflow::profiler::TraceMe traceme("LocalExecutable::ExecutePerReplica"); const int num_devices = client_->device_count(); if (argument_handles.size() != num_replicas()) { @@ -269,6 +276,8 @@ LocalExecutableWrapper::ExecutePerReplica( results[replica] = std::move(result_buffer_status); }; + VLOG(1) << "Executing replicated computation; num_replicas=" << num_replicas(); if (num_replicas() == 1) { // Fast-path if there is only one replica — run the computation on the // current thread. execute(0); } @@ -283,6 +292,7 @@ LocalExecutableWrapper::ExecutePerReplica( } execute(num_replicas() - 1); } + VLOG(1) << "Replicated execution complete."; std::vector wrapped_results(num_replicas()); for (int replica = 0; replica < num_replicas(); ++replica) { @@ -339,6 +349,7 @@ LocalExecutableWrapper::Compile(const XlaComputation& computation, const std::vector& argument_shapes, const ExecutableBuildOptions* build_options, LocalClient* client) { + tensorflow::profiler::TraceMe traceme("LocalExecutable::Compile"); std::vector argument_shape_pointers; argument_shape_pointers.reserve(argument_shapes.size()); for (auto& argument_shape : argument_shapes) {