From 20bacc0d3546896db2d65257495c9e1a752fd9b9 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Mon, 6 Apr 2020 11:21:48 -0700 Subject: [PATCH] Enable tracing for nested thunks. - moves the annotation cache to the thunk itself - iterates through the whole executable, not just top-level thunks, to construct annotations - constructs a ScopedAnnotation anywhere an atomic thunk might be executed (currently in GpuExecutable and SequentialThunk) PiperOrigin-RevId: 305075290 Change-Id: Ice38b1a5abd88899bbf7ca1f5e3d94f09a5d1573 --- .../xla/service/gpu/conditional_thunk.cc | 7 +++++++ .../xla/service/gpu/conditional_thunk.h | 1 + .../compiler/xla/service/gpu/for_thunk.cc | 5 +++++ tensorflow/compiler/xla/service/gpu/for_thunk.h | 1 + .../compiler/xla/service/gpu/gpu_executable.cc | 17 ++++------------- .../compiler/xla/service/gpu/gpu_executable.h | 4 ---- .../xla/service/gpu/sequential_thunk.cc | 10 ++++++++++ .../compiler/xla/service/gpu/sequential_thunk.h | 1 + tensorflow/compiler/xla/service/gpu/thunk.h | 13 +++++++++++++ .../compiler/xla/service/gpu/while_thunk.cc | 6 ++++++ .../compiler/xla/service/gpu/while_thunk.h | 1 + 11 files changed, 49 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 43f9f176c35..e31f45942b1 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -43,6 +43,13 @@ ConditionalThunk::ConditionalThunk( } } +void ConditionalThunk::ComputeAnnotations() { + Thunk::ComputeAnnotations(); + for (auto& branch_thunk : branch_thunks_) { + branch_thunk->ComputeAnnotations(); + } +} + Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { if (branch_index_is_bool_) { diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index 453e2e4efa5..404e2131eff 
100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -51,6 +51,7 @@ class ConditionalThunk : public Thunk { ConditionalThunk(const ConditionalThunk&) = delete; ConditionalThunk& operator=(const ConditionalThunk&) = delete; + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index bed5f271f20..0a97f668b38 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -34,6 +34,11 @@ ForThunk::ForThunk(const int64 loop_limit, // this ForThunk, and shouldn't be profiled separately from it. std::move(*body_thunk_sequence), nullptr)) {} +void ForThunk::ComputeAnnotations() { + Thunk::ComputeAnnotations(); + body_thunk_sequence_->ComputeAnnotations(); +} + Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 8ac283848e8..57402f70627 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -37,6 +37,7 @@ class ForThunk : public Thunk { ForThunk(const ForThunk&) = delete; ForThunk& operator=(const ForThunk&) = delete; + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 0041415c920..2df6b50d361 100644 --- 
a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -45,7 +45,7 @@ namespace xla { namespace gpu { namespace { -using tensorflow::profiler::ScopedAnnotation; +using ::tensorflow::profiler::ScopedAnnotation; } // namespace @@ -91,13 +91,8 @@ GpuExecutable::~GpuExecutable() { } void GpuExecutable::ComputeThunkAnnotations() { - CanonicalNameMap canonical_name_map; for (Thunk* thunk : thunk_schedule_->TotalOrder()) { - const HloInstruction* hlo = thunk->hlo_instruction(); - CHECK(hlo); - thunk_annotations_[thunk] = - absl::StrFormat("Thunk#hlo_op=%s,hlo_module=%s#", hlo->name(), - hlo->GetModule()->name()); + thunk->ComputeAnnotations(); } } @@ -173,17 +168,13 @@ Status GpuExecutable::ExecuteThunks( tensorflow::profiler::TraceMeLevel::kInfo); std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event; - bool scoped_annotation_enabled = ScopedAnnotation::IsEnabled(); std::vector<std::function<void()>> deferred_host_callbacks; for (Thunk* thunk : thunk_schedule_->TotalOrder()) { + CHECK(thunk->hlo_instruction()); // Annotate execution of this op if tracing was enabled when we started // running this module. If tracing is enabled *while* we're running the // module, we won't get any data, but that's probably an OK trade-off. - absl::optional<ScopedAnnotation> op_annotation; - CHECK(thunk->hlo_instruction()); - if (scoped_annotation_enabled) { - op_annotation.emplace(FindOrDie(thunk_annotations_, thunk)); - } + ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 33642a7dc3d..045a36c099b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -151,10 +151,6 @@ class GpuExecutable : public Executable { // memory for every output/temp buffers.
const std::shared_ptr<const BufferAssignment> assignment_; - // Maps a thunk to a string describing the thunk. This is useful when - // constructing ScopeAnnotation objects. - absl::flat_hash_map<const Thunk*, string> thunk_annotations_; - // Cache of module handles and constant buffer allocation maps used by // `ResolveConstantGlobals`. tensorflow::mutex module_handle_mutex_; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index db49ee36064..025ca60ef0c 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -17,14 +17,23 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/lib/scoped_annotation.h" namespace xla { namespace gpu { +using ::tensorflow::profiler::ScopedAnnotation; + SequentialThunk::SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks, const HloInstruction* hlo) : Thunk(Kind::kSequential, hlo), thunks_(std::move(thunks)) {} +void SequentialThunk::ComputeAnnotations() { + for (const auto& thunk : thunks_) { + thunk->ComputeAnnotations(); + } +} + Status SequentialThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { @@ -37,6 +46,7 @@ Status SequentialThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(hlo_instruction()); for (const auto& thunk : thunks_) { + ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(params)); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 0874203424f..3abb82c0b66 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -39,6 +39,7 @@
class SequentialThunk : public Thunk { const std::vector<std::unique_ptr<Thunk>>& thunks() const { return thunks_; } + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 326c5a20716..e9be41b74de 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -79,6 +79,18 @@ class Thunk { Kind kind() const { return kind_; } const HloInstruction* hlo_instruction() const { return hlo_instruction_; } + string profile_annotation() const { return profile_annotation_; } + + // Constructs and caches the profile annotation string for this thunk and + // any child thunks. + virtual void ComputeAnnotations() { + const HloInstruction* hlo = hlo_instruction(); + if (hlo) { + profile_annotation_ = + absl::StrFormat("Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(), + hlo->GetModule()->name()); + } + } // Prepares the thunk for execution on the given StreamExecutor. // @@ -130,6 +142,7 @@ class Thunk { private: Kind kind_; const HloInstruction* hlo_instruction_; + string profile_annotation_; }; // A sequence of thunks.
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 0dd873ba64e..4134cd39832 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -39,6 +39,12 @@ WhileThunk::WhileThunk( body_thunk_sequence_(absl::make_unique<SequentialThunk>( std::move(*body_thunk_sequence), nullptr)) {} +void WhileThunk::ComputeAnnotations() { + Thunk::ComputeAnnotations(); + condition_thunk_sequence_->ComputeAnnotations(); + body_thunk_sequence_->ComputeAnnotations(); +} + Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 091bec7490e..31db01b72ba 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -46,6 +46,7 @@ class WhileThunk : public Thunk { WhileThunk(const WhileThunk&) = delete; WhileThunk& operator=(const WhileThunk&) = delete; + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override;