Enable tracing for nested thunks.
- Moves the annotation cache to the thunk itself.
- Iterates through the whole executable, not just top-level thunks, to construct annotations.
- Constructs a ScopedAnnotation anywhere an atomic thunk might be executed (currently in GpuExecutable and SequentialThunk).

PiperOrigin-RevId: 305075290
Change-Id: Ice38b1a5abd88899bbf7ca1f5e3d94f09a5d1573
This commit is contained in:
parent
da2d11092e
commit
20bacc0d35
@ -43,6 +43,13 @@ ConditionalThunk::ConditionalThunk(
|
||||
}
|
||||
}
|
||||
|
||||
void ConditionalThunk::ComputeAnnotations() {
|
||||
Thunk::ComputeAnnotations();
|
||||
for (auto& branch_thunk : branch_thunks_) {
|
||||
branch_thunk->ComputeAnnotations();
|
||||
}
|
||||
}
|
||||
|
||||
Status ConditionalThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
if (branch_index_is_bool_) {
|
||||
|
@ -51,6 +51,7 @@ class ConditionalThunk : public Thunk {
|
||||
ConditionalThunk(const ConditionalThunk&) = delete;
|
||||
ConditionalThunk& operator=(const ConditionalThunk&) = delete;
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
@ -34,6 +34,11 @@ ForThunk::ForThunk(const int64 loop_limit,
|
||||
// this ForThunk, and shouldn't be profiled separately from it.
|
||||
std::move(*body_thunk_sequence), nullptr)) {}
|
||||
|
||||
// Computes and caches the profile annotation for this ForThunk and,
// recursively, for the loop-body thunk sequence it owns.
void ForThunk::ComputeAnnotations() {
  Thunk::ComputeAnnotations();                 // the loop op itself
  body_thunk_sequence_->ComputeAnnotations();  // nested body thunks
}
|
||||
|
||||
Status ForThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor));
|
||||
|
@ -37,6 +37,7 @@ class ForThunk : public Thunk {
|
||||
ForThunk(const ForThunk&) = delete;
|
||||
ForThunk& operator=(const ForThunk&) = delete;
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
@ -45,7 +45,7 @@ namespace xla {
|
||||
namespace gpu {
|
||||
namespace {
|
||||
|
||||
using tensorflow::profiler::ScopedAnnotation;
|
||||
using ::tensorflow::profiler::ScopedAnnotation;
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -91,13 +91,8 @@ GpuExecutable::~GpuExecutable() {
|
||||
}
|
||||
|
||||
// Precomputes a human-readable annotation string for every thunk in the
// schedule; these strings label ScopedAnnotation ranges during tracing.
// NOTE(review): this span looks like a diff mixture — it both fills the
// executable-level thunk_annotations_ map (old scheme) and calls
// thunk->ComputeAnnotations() (new per-thunk cache). Confirm which lines
// survive in the final revision.
void GpuExecutable::ComputeThunkAnnotations() {
  // NOTE(review): canonical_name_map is never referenced in this body —
  // possibly leftover from an earlier annotation scheme.
  CanonicalNameMap canonical_name_map;
  // Walk the full execution order so every scheduled thunk is covered.
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    const HloInstruction* hlo = thunk->hlo_instruction();
    CHECK(hlo);
    // NOTE(review): format here is "Thunk#..." while Thunk::ComputeAnnotations
    // emits "Thunk:#..." (with a colon) — confirm which form downstream trace
    // consumers expect; the two should agree.
    thunk_annotations_[thunk] =
        absl::StrFormat("Thunk#hlo_op=%s,hlo_module=%s#", hlo->name(),
                        hlo->GetModule()->name());
    // Recursively compute the cached annotation for the thunk and any children.
    thunk->ComputeAnnotations();
  }
}
|
||||
|
||||
@ -173,17 +168,13 @@ Status GpuExecutable::ExecuteThunks(
|
||||
tensorflow::profiler::TraceMeLevel::kInfo);
|
||||
|
||||
std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
|
||||
bool scoped_annotation_enabled = ScopedAnnotation::IsEnabled();
|
||||
std::vector<std::function<void()>> deferred_host_callbacks;
|
||||
for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
|
||||
CHECK(thunk->hlo_instruction());
|
||||
// Annotate execution of this op if tracing was enabled when we started
|
||||
// running this module. If tracing is enabled *while* we're running the
|
||||
// module, we won't get any data, but that's probably an OK trade-off.
|
||||
absl::optional<ScopedAnnotation> op_annotation;
|
||||
CHECK(thunk->hlo_instruction());
|
||||
if (scoped_annotation_enabled) {
|
||||
op_annotation.emplace(FindOrDie(thunk_annotations_, thunk));
|
||||
}
|
||||
ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });
|
||||
|
||||
TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
|
||||
int32 stream_no =
|
||||
|
@ -151,10 +151,6 @@ class GpuExecutable : public Executable {
|
||||
// memory for every output/temp buffers.
|
||||
const std::shared_ptr<const BufferAssignment> assignment_;
|
||||
|
||||
// Maps a thunk to a string describing the thunk. This is useful when
|
||||
// constructing ScopeAnnotation objects.
|
||||
absl::flat_hash_map<Thunk*, string> thunk_annotations_;
|
||||
|
||||
// Cache of module handles and constant buffer allocation maps used by
|
||||
// `ResolveConstantGlobals`.
|
||||
tensorflow::mutex module_handle_mutex_;
|
||||
|
@ -17,14 +17,23 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
#include "tensorflow/core/profiler/lib/scoped_annotation.h"
|
||||
|
||||
namespace xla {
|
||||
namespace gpu {
|
||||
|
||||
using ::tensorflow::profiler::ScopedAnnotation;
|
||||
|
||||
// Builds a sequence that takes ownership of `thunks` and runs them in order.
// `hlo` may be nullptr for synthetic sequences (ForThunk/WhileThunk bodies
// pass nullptr), in which case no profile annotation is derived for the
// sequence itself (see Thunk::ComputeAnnotations).
SequentialThunk::SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks,
                                 const HloInstruction* hlo)
    : Thunk(Kind::kSequential, hlo), thunks_(std::move(thunks)) {}
|
||||
|
||||
void SequentialThunk::ComputeAnnotations() {
|
||||
for (const auto& thunk : thunks_) {
|
||||
thunk->ComputeAnnotations();
|
||||
}
|
||||
}
|
||||
|
||||
Status SequentialThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
for (auto& thunk : thunks_) {
|
||||
@ -37,6 +46,7 @@ Status SequentialThunk::ExecuteOnStream(const ExecuteParams& params) {
|
||||
auto op_profiler =
|
||||
params.profiler->MakeScopedInstructionProfiler(hlo_instruction());
|
||||
for (const auto& thunk : thunks_) {
|
||||
ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });
|
||||
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(params));
|
||||
}
|
||||
return Status::OK();
|
||||
|
@ -39,6 +39,7 @@ class SequentialThunk : public Thunk {
|
||||
|
||||
const std::vector<std::unique_ptr<Thunk>>& thunks() const { return thunks_; }
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
@ -79,6 +79,18 @@ class Thunk {
|
||||
|
||||
Kind kind() const { return kind_; }
|
||||
const HloInstruction* hlo_instruction() const { return hlo_instruction_; }
|
||||
string profile_annotation() const { return profile_annotation_; }
|
||||
|
||||
// Constructs and caches the profile annotation string for this thunk and
|
||||
// any child thunks.
|
||||
virtual void ComputeAnnotations() {
|
||||
const HloInstruction* hlo = hlo_instruction();
|
||||
if (hlo) {
|
||||
profile_annotation_ =
|
||||
absl::StrFormat("Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(),
|
||||
hlo->GetModule()->name());
|
||||
}
|
||||
}
|
||||
|
||||
// Prepares the thunk for execution on the given StreamExecutor.
|
||||
//
|
||||
@ -130,6 +142,7 @@ class Thunk {
|
||||
private:
|
||||
Kind kind_;
|
||||
const HloInstruction* hlo_instruction_;
|
||||
string profile_annotation_;
|
||||
};
|
||||
|
||||
// A sequence of thunks.
|
||||
|
@ -39,6 +39,12 @@ WhileThunk::WhileThunk(
|
||||
body_thunk_sequence_(absl::make_unique<SequentialThunk>(
|
||||
std::move(*body_thunk_sequence), nullptr)) {}
|
||||
|
||||
// Caches the profile annotation for the while op itself, then recurses into
// the condition and body thunk sequences so their nested thunks are covered.
void WhileThunk::ComputeAnnotations() {
  Thunk::ComputeAnnotations();
  // The condition and body are independent SequentialThunks owned by this
  // thunk; each computes annotations for its own children.
  condition_thunk_sequence_->ComputeAnnotations();
  body_thunk_sequence_->ComputeAnnotations();
}
|
||||
|
||||
Status WhileThunk::Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) {
|
||||
TF_RETURN_IF_ERROR(
|
||||
|
@ -46,6 +46,7 @@ class WhileThunk : public Thunk {
|
||||
WhileThunk(const WhileThunk&) = delete;
|
||||
WhileThunk& operator=(const WhileThunk&) = delete;
|
||||
|
||||
void ComputeAnnotations() override;
|
||||
Status Initialize(const GpuExecutable& executable,
|
||||
se::StreamExecutor* executor) override;
|
||||
Status ExecuteOnStream(const ExecuteParams& params) override;
|
||||
|
Loading…
x
Reference in New Issue
Block a user