From 20bacc0d3546896db2d65257495c9e1a752fd9b9 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Mon, 6 Apr 2020 11:21:48 -0700 Subject: [PATCH] Enable tracing for nested thunks. - moves the annotation cache to the thunk itself - iterates through the whole executable, not just top-level thunks, to construct annotations - constructs a ScopedAnnotation anywhere an atomic thunk might be executed (currently in GpuExecutable and SequentialThunk) PiperOrigin-RevId: 305075290 Change-Id: Ice38b1a5abd88899bbf7ca1f5e3d94f09a5d1573 --- .../xla/service/gpu/conditional_thunk.cc | 7 +++++++ .../xla/service/gpu/conditional_thunk.h | 1 + .../compiler/xla/service/gpu/for_thunk.cc | 5 +++++ tensorflow/compiler/xla/service/gpu/for_thunk.h | 1 + .../compiler/xla/service/gpu/gpu_executable.cc | 17 ++++------------- .../compiler/xla/service/gpu/gpu_executable.h | 4 ---- .../xla/service/gpu/sequential_thunk.cc | 10 ++++++++++ .../compiler/xla/service/gpu/sequential_thunk.h | 1 + tensorflow/compiler/xla/service/gpu/thunk.h | 13 +++++++++++++ .../compiler/xla/service/gpu/while_thunk.cc | 6 ++++++ .../compiler/xla/service/gpu/while_thunk.h | 1 + 11 files changed, 49 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 43f9f176c35..e31f45942b1 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -43,6 +43,13 @@ ConditionalThunk::ConditionalThunk( } } +void ConditionalThunk::ComputeAnnotations() { + Thunk::ComputeAnnotations(); + for (auto& branch_thunk : branch_thunks_) { + branch_thunk->ComputeAnnotations(); + } +} + Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { if (branch_index_is_bool_) { diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index 453e2e4efa5..404e2131eff 
100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -51,6 +51,7 @@ class ConditionalThunk : public Thunk { ConditionalThunk(const ConditionalThunk&) = delete; ConditionalThunk& operator=(const ConditionalThunk&) = delete; + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index bed5f271f20..0a97f668b38 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -34,6 +34,11 @@ ForThunk::ForThunk(const int64 loop_limit, // this ForThunk, and shouldn't be profiled separately from it. std::move(*body_thunk_sequence), nullptr)) {} +void ForThunk::ComputeAnnotations() { + Thunk::ComputeAnnotations(); + body_thunk_sequence_->ComputeAnnotations(); +} + Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 8ac283848e8..57402f70627 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -37,6 +37,7 @@ class ForThunk : public Thunk { ForThunk(const ForThunk&) = delete; ForThunk& operator=(const ForThunk&) = delete; + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 0041415c920..2df6b50d361 100644 --- 
a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -45,7 +45,7 @@ namespace xla { namespace gpu { namespace { -using tensorflow::profiler::ScopedAnnotation; +using ::tensorflow::profiler::ScopedAnnotation; } // namespace @@ -91,13 +91,8 @@ GpuExecutable::~GpuExecutable() { } void GpuExecutable::ComputeThunkAnnotations() { - CanonicalNameMap canonical_name_map; for (Thunk* thunk : thunk_schedule_->TotalOrder()) { - const HloInstruction* hlo = thunk->hlo_instruction(); - CHECK(hlo); - thunk_annotations_[thunk] = - absl::StrFormat("Thunk#hlo_op=%s,hlo_module=%s#", hlo->name(), - hlo->GetModule()->name()); + thunk->ComputeAnnotations(); } } @@ -173,17 +168,13 @@ Status GpuExecutable::ExecuteThunks( tensorflow::profiler::TraceMeLevel::kInfo); std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event; - bool scoped_annotation_enabled = ScopedAnnotation::IsEnabled(); std::vector<std::function<void()>> deferred_host_callbacks; for (Thunk* thunk : thunk_schedule_->TotalOrder()) { + CHECK(thunk->hlo_instruction()); // Annotate execution of this op if tracing was enabled when we started // running this module. If tracing is enabled *while* we're running the // module, we won't get any data, but that's probably an OK trade-off. - absl::optional<ScopedAnnotation> op_annotation; - CHECK(thunk->hlo_instruction()); - if (scoped_annotation_enabled) { - op_annotation.emplace(FindOrDie(thunk_annotations_, thunk)); - } + ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 33642a7dc3d..045a36c099b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -151,10 +151,6 @@ class GpuExecutable : public Executable { // memory for every output/temp buffers.
const std::shared_ptr<const BufferAssignment> assignment_; - // Maps a thunk to a string describing the thunk. This is useful when - // constructing ScopeAnnotation objects. - absl::flat_hash_map<const Thunk*, string> thunk_annotations_; - // Cache of module handles and constant buffer allocation maps used by // `ResolveConstantGlobals`. tensorflow::mutex module_handle_mutex_; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index db49ee36064..025ca60ef0c 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -17,14 +17,23 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/profiler/lib/scoped_annotation.h" namespace xla { namespace gpu { +using ::tensorflow::profiler::ScopedAnnotation; + SequentialThunk::SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks, const HloInstruction* hlo) : Thunk(Kind::kSequential, hlo), thunks_(std::move(thunks)) {} +void SequentialThunk::ComputeAnnotations() { + for (const auto& thunk : thunks_) { + thunk->ComputeAnnotations(); + } +} + Status SequentialThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { @@ -37,6 +46,7 @@ Status SequentialThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(hlo_instruction()); for (const auto& thunk : thunks_) { + ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(params)); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 0874203424f..3abb82c0b66 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -39,6 +39,7 @@
class SequentialThunk : public Thunk { const std::vector<std::unique_ptr<Thunk>>& thunks() const { return thunks_; } + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 326c5a20716..e9be41b74de 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -79,6 +79,18 @@ class Thunk { Kind kind() const { return kind_; } const HloInstruction* hlo_instruction() const { return hlo_instruction_; } + string profile_annotation() const { return profile_annotation_; } + + // Constructs and caches the profile annotation string for this thunk and + // any child thunks. + virtual void ComputeAnnotations() { + const HloInstruction* hlo = hlo_instruction(); + if (hlo) { + profile_annotation_ = + absl::StrFormat("Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(), + hlo->GetModule()->name()); + } + } // Prepares the thunk for execution on the given StreamExecutor. // @@ -130,6 +142,7 @@ class Thunk { private: Kind kind_; const HloInstruction* hlo_instruction_; + string profile_annotation_; }; // A sequence of thunks.
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 0dd873ba64e..4134cd39832 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -39,6 +39,12 @@ WhileThunk::WhileThunk( body_thunk_sequence_(absl::make_unique<SequentialThunk>( std::move(*body_thunk_sequence), nullptr)) {} +void WhileThunk::ComputeAnnotations() { + Thunk::ComputeAnnotations(); + condition_thunk_sequence_->ComputeAnnotations(); + body_thunk_sequence_->ComputeAnnotations(); +} + Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 091bec7490e..31db01b72ba 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -46,6 +46,7 @@ class WhileThunk : public Thunk { WhileThunk(const WhileThunk&) = delete; WhileThunk& operator=(const WhileThunk&) = delete; + void ComputeAnnotations() override; Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override;