diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index 138ca7e3d59..a7734f0363f 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -245,6 +245,7 @@ cc_library( ":kernel_and_device", ":tensor_handle", "@com_google_absl//absl/strings", + "//tensorflow/core/profiler/lib:traceme", ] + select({ "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib_lite", diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 4068aced323..0240fe31277 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -23,6 +23,7 @@ limitations under the License. // clang-format on #include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -36,6 +37,7 @@ limitations under the License. #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/lib/traceme.h" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/distributed_runtime/eager/eager_client.h" #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h" @@ -293,6 +295,8 @@ Status AddInputDevicesToCacheKey(const EagerContext* ctx, const EagerOperation* op, std::vector* input_dev_ptrs, Fprint128* cache_key) { + profiler::TraceMe activity("AddInputDevicesToCacheKey", + profiler::TraceMeLevel::kVerbose); input_dev_ptrs->reserve(op->Inputs().size()); Device* cpu_device = ctx->HostCPU(); for (TensorHandle* tensor_handle : op->Inputs()) { @@ -347,6 +351,8 @@ Status AddInputTensorShapesToCacheKey( const EagerContext* ctx, const EagerOperation* op, std::unordered_map* input_tensor_shapes, Fprint128* cache_key) { + profiler::TraceMe activity("AddInputTensorShapesToCacheKey", + profiler::TraceMeLevel::kVerbose); for (int i = 0; i < op->Inputs().size(); i++) { TensorHandle* tensor_handle = op->Inputs()[i]; @@ -382,6 +388,8 @@ Status AddInputResourceDtypesAndShapesToCacheKey( std::unordered_map>* input_resource_dtypes_shapes, Fprint128* cache_key) { + profiler::TraceMe activity("AddInputResourceDtypesAndShapesToCacheKey", + profiler::TraceMeLevel::kVerbose); for (int i = 0; i < op->Inputs().size(); i++) { TensorHandle* tensor_handle = op->Inputs()[i]; @@ -475,6 +483,9 @@ Status ShouldCompileWithXLA(const EagerOperation* op, const Device* device, Status EagerLocalExecute(EagerOperation* op, gtl::InlinedVector* retvals, int* num_retvals) { + profiler::TraceMe activity( + [&] { return absl::StrCat("EagerLocalExecute: ", op->Name()); }, + profiler::TraceMeLevel::kInfo); const string unspecified_device_name(""); EagerContext* ctx = op->EagerContext(); auto status = ctx->GetStatus(); @@ -505,16 +516,20 @@ Status EagerLocalExecute(EagerOperation* op, // Once that is the case, we will be able to write a thin wrapper layer over // the EagerService that behaves similar to the current // ClusterFunctionLibraryRuntime/DistributedFunctionLibraryRuntime. - for (int i = 0; i < op->Inputs().size(); i++) { - TensorHandle* input = op->Inputs()[i]; - if (input->IsRemote()) { - TensorHandle* handle = nullptr; - TF_RETURN_IF_ERROR(EagerCopyToDevice( - input, ctx, device == nullptr ? "" : device->name().c_str(), - ctx->MirrorTensors(), &handle)); - op->UpdateInput(i, handle); - // Unref handle since it has a ref as an input now - handle->Unref(); + { + profiler::TraceMe activity("EagerCopyToDevice", + profiler::TraceMeLevel::kInfo); + for (int i = 0; i < op->Inputs().size(); i++) { + TensorHandle* input = op->Inputs()[i]; + if (input->IsRemote()) { + TensorHandle* handle = nullptr; + TF_RETURN_IF_ERROR(EagerCopyToDevice( + input, ctx, device == nullptr ? "" : device->name().c_str(), + ctx->MirrorTensors(), &handle)); + op->UpdateInput(i, handle); + // Unref handle since it has a ref as an input now + handle->Unref(); + } } } TF_RETURN_IF_ERROR( @@ -1038,6 +1053,9 @@ Status MaybeUpdateOpDevice(EagerOperation* op) { Status EagerExecute(EagerOperation* op, gtl::InlinedVector* retvals, int* num_retvals) { + profiler::TraceMe activity( + [&] { return absl::StrCat("EagerExecute: ", op->Name()); }, + profiler::TraceMeLevel::kInfo); TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op)); bool op_is_local = op->EagerContext()->IsLocal(op->Device()); @@ -1063,6 +1081,8 @@ Status EagerKernelExecute(EagerContext* ctx, StepStats* maybe_step_stats, GraphCollector* graph_collector, TensorHandle** retvals, int num_retvals) { + profiler::TraceMe activity("EagerKernelExecute", + profiler::TraceMeLevel::kInfo); std::vector outputs(1); // If there are multiple references to a TensorHandle in 'op_inputs' we must diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index 6ec269f96a7..8d943a64023 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -180,12 +180,16 @@ Status EagerServiceImpl::ExecuteOp(const Operation& operation, TF_RETURN_IF_ERROR(op->SetDevice(operation.device().c_str())); - for (const auto& remote_handle : operation.inputs()) { - tensorflow::TensorHandle* handle; - TF_RETURN_IF_ERROR(server_context->GetTensorHandle( - RemoteTensorHandleInternal(remote_handle), &handle)); + { + profiler::TraceMe activity("EagerService:RemoteTensorHandleInternal", + profiler::TraceMeLevel::kVerbose); + for (const auto& remote_handle : operation.inputs()) { + tensorflow::TensorHandle* handle; + TF_RETURN_IF_ERROR(server_context->GetTensorHandle( + RemoteTensorHandleInternal(remote_handle), &handle)); - op->AddInput(handle); + op->AddInput(handle); + } } for (const auto& attr : operation.attrs()) {